Total coverage: 103212 (6%)of 1918407
5 11 11 3 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 /* * net/tipc/monitor.c * * Copyright (c) 2016, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <net/genetlink.h> #include "core.h" #include "addr.h" #include "monitor.h" #include "bearer.h" #define MAX_MON_DOMAIN 64 #define MON_TIMEOUT 120000 #define MAX_PEER_DOWN_EVENTS 4 /* struct tipc_mon_domain: domain record to be transferred between peers * @len: actual size of domain record * @gen: current generation of sender's domain * @ack_gen: most recent generation of self's domain acked by peer * @member_cnt: number of domain member nodes described in this record * @up_map: bit map indicating which of the members the sender considers up * @members: identity of the domain members */ struct tipc_mon_domain { u16 len; u16 gen; u16 ack_gen; u16 member_cnt; u64 up_map; u32 members[MAX_MON_DOMAIN]; }; /* struct tipc_peer: state of a peer node and its domain * @addr: tipc node identity of peer * @head_map: shows which other nodes currently consider peer 'up' * @domain: most recent domain record from peer * @hash: position in hashed lookup list * @list: position in linked list, in circular ascending order by 'addr' * @applied: number of reported domain members applied on this monitor list * @is_up: peer is up as seen from this node * @is_head: peer is assigned domain head as seen from this node * @is_local: peer is in local domain and should be continuously monitored * @down_cnt: - numbers of other peers which have reported this on lost */ struct tipc_peer { u32 addr; struct tipc_mon_domain *domain; struct hlist_node hash; struct list_head list; u8 applied; u8 down_cnt; bool is_up; bool is_head; bool is_local; }; struct tipc_monitor { struct hlist_head peers[NODE_HTABLE_SIZE]; int peer_cnt; struct tipc_peer *self; rwlock_t lock; struct tipc_mon_domain cache; u16 list_gen; u16 dom_gen; struct net *net; struct timer_list timer; unsigned long timer_intv; }; static struct tipc_monitor *tipc_monitor(struct net *net, int bearer_id) { return tipc_net(net)->monitors[bearer_id]; } const int tipc_max_domain_size = sizeof(struct tipc_mon_domain); static inline u16 mon_cpu_to_le16(u16 val) { return (__force __u16)htons(val); } static inline u32 mon_cpu_to_le32(u32 val) { return (__force __u32)htonl(val); } static inline u64 mon_cpu_to_le64(u64 val) { return (__force __u64)cpu_to_be64(val); } static inline u16 mon_le16_to_cpu(u16 val) { return ntohs((__force __be16)val); } static inline u32 mon_le32_to_cpu(u32 val) { return ntohl((__force __be32)val); } static inline u64 mon_le64_to_cpu(u64 val) { return be64_to_cpu((__force __be64)val); } /* dom_rec_len(): actual length of domain record for transport */ static int dom_rec_len(struct tipc_mon_domain *dom, u16 mcnt) { return (offsetof(struct tipc_mon_domain, members)) + (mcnt * sizeof(u32)); } /* dom_size() : calculate size of own domain based on number of peers */ static int dom_size(int peers) { int i = 0; while ((i * i) < peers) i++; return min(i, MAX_MON_DOMAIN); } static void map_set(u64 *up_map, int i, unsigned int v) { *up_map &= ~(1ULL << i); *up_map |= ((u64)v << i); } static int map_get(u64 up_map, int i) { return (up_map & (1ULL << i)) >> i; } static struct tipc_peer *peer_prev(struct tipc_peer *peer) { return list_last_entry(&peer->list, struct tipc_peer, list); } static struct tipc_peer *peer_nxt(struct tipc_peer *peer) { return list_first_entry(&peer->list, struct tipc_peer, list); } static struct tipc_peer *peer_head(struct tipc_peer *peer) { while (!peer->is_head) peer = peer_prev(peer); return peer; } static struct tipc_peer *get_peer(struct tipc_monitor *mon, u32 addr) { struct tipc_peer *peer; unsigned int thash = tipc_hashfn(addr); hlist_for_each_entry(peer, &mon->peers[thash], hash) { if (peer->addr == addr) return peer; } return NULL; } static struct tipc_peer *get_self(struct net *net, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); return mon->self; } static inline bool tipc_mon_is_active(struct net *net, struct tipc_monitor *mon) { struct tipc_net *tn = tipc_net(net); return mon->peer_cnt > tn->mon_threshold; } /* mon_identify_lost_members() : - identify amd mark potentially lost members */ static void mon_identify_lost_members(struct tipc_peer *peer, struct tipc_mon_domain *dom_bef, int applied_bef) { struct tipc_peer *member = peer; struct tipc_mon_domain *dom_aft = peer->domain; int applied_aft = peer->applied; int i; for (i = 0; i < applied_bef; i++) { member = peer_nxt(member); /* Do nothing if self or peer already see member as down */ if (!member->is_up || !map_get(dom_bef->up_map, i)) continue; /* Loss of local node must be detected by active probing */ if (member->is_local) continue; /* Start probing if member was removed from applied domain */ if (!applied_aft || (applied_aft < i)) { member->down_cnt = 1; continue; } /* Member loss is confirmed if it is still in applied domain */ if (!map_get(dom_aft->up_map, i)) member->down_cnt++; } } /* mon_apply_domain() : match a peer's domain record against monitor list */ static void mon_apply_domain(struct tipc_monitor *mon, struct tipc_peer *peer) { struct tipc_mon_domain *dom = peer->domain; struct tipc_peer *member; u32 addr; int i; if (!dom || !peer->is_up) return; /* Scan across domain members and match against monitor list */ peer->applied = 0; member = peer_nxt(peer); for (i = 0; i < dom->member_cnt; i++) { addr = dom->members[i]; if (addr != member->addr) return; peer->applied++; member = peer_nxt(member); } } /* mon_update_local_domain() : update after peer addition/removal/up/down */ static void mon_update_local_domain(struct tipc_monitor *mon) { struct tipc_peer *self = mon->self; struct tipc_mon_domain *cache = &mon->cache; struct tipc_mon_domain *dom = self->domain; struct tipc_peer *peer = self; u64 prev_up_map = dom->up_map; u16 member_cnt, i; bool diff; /* Update local domain size based on current size of cluster */ member_cnt = dom_size(mon->peer_cnt) - 1; self->applied = member_cnt; /* Update native and cached outgoing local domain records */ dom->len = dom_rec_len(dom, member_cnt); diff = dom->member_cnt != member_cnt; dom->member_cnt = member_cnt; for (i = 0; i < member_cnt; i++) { peer = peer_nxt(peer); diff |= dom->members[i] != peer->addr; dom->members[i] = peer->addr; map_set(&dom->up_map, i, peer->is_up); cache->members[i] = mon_cpu_to_le32(peer->addr); } diff |= dom->up_map != prev_up_map; if (!diff) return; dom->gen = ++mon->dom_gen; cache->len = mon_cpu_to_le16(dom->len); cache->gen = mon_cpu_to_le16(dom->gen); cache->member_cnt = mon_cpu_to_le16(member_cnt); cache->up_map = mon_cpu_to_le64(dom->up_map); mon_apply_domain(mon, self); } /* mon_update_neighbors() : update preceding neighbors of added/removed peer */ static void mon_update_neighbors(struct tipc_monitor *mon, struct tipc_peer *peer) { int dz, i; dz = dom_size(mon->peer_cnt); for (i = 0; i < dz; i++) { mon_apply_domain(mon, peer); peer = peer_prev(peer); } } /* mon_assign_roles() : reassign peer roles after a network change * The monitor list is consistent at this stage; i.e., each peer is monitoring * a set of domain members as matched between domain record and the monitor list */ static void mon_assign_roles(struct tipc_monitor *mon, struct tipc_peer *head) { struct tipc_peer *peer = peer_nxt(head); struct tipc_peer *self = mon->self; int i = 0; for (; peer != self; peer = peer_nxt(peer)) { peer->is_local = false; /* Update domain member */ if (i++ < head->applied) { peer->is_head = false; if (head == self) peer->is_local = true; continue; } /* Assign next domain head */ if (!peer->is_up) continue; if (peer->is_head) break; head = peer; head->is_head = true; i = 0; } mon->list_gen++; } void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self; struct tipc_peer *peer, *prev, *head; if (!mon) return; self = get_self(net, bearer_id); write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer) goto exit; prev = peer_prev(peer); list_del(&peer->list); hlist_del(&peer->hash); kfree(peer->domain); kfree(peer); mon->peer_cnt--; head = peer_head(prev); if (head == self) mon_update_local_domain(mon); mon_update_neighbors(mon, prev); /* Revert to full-mesh monitoring if we reach threshold */ if (!tipc_mon_is_active(net, mon)) { list_for_each_entry(peer, &self->list, list) { kfree(peer->domain); peer->domain = NULL; peer->applied = 0; } } mon_assign_roles(mon, head); exit: write_unlock_bh(&mon->lock); } static bool tipc_mon_add_peer(struct tipc_monitor *mon, u32 addr, struct tipc_peer **peer) { struct tipc_peer *self = mon->self; struct tipc_peer *cur, *prev, *p; p = kzalloc_obj(*p, GFP_ATOMIC); *peer = p; if (!p) return false; p->addr = addr; /* Add new peer to lookup list */ INIT_LIST_HEAD(&p->list); hlist_add_head(&p->hash, &mon->peers[tipc_hashfn(addr)]); /* Sort new peer into iterator list, in ascending circular order */ prev = self; list_for_each_entry(cur, &self->list, list) { if ((addr > prev->addr) && (addr < cur->addr)) break; if (((addr < cur->addr) || (addr > prev->addr)) && (prev->addr > cur->addr)) break; prev = cur; } list_add_tail(&p->list, &cur->list); mon->peer_cnt++; mon_update_neighbors(mon, p); return true; } void tipc_mon_peer_up(struct net *net, u32 addr, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self = get_self(net, bearer_id); struct tipc_peer *peer, *head; write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer && !tipc_mon_add_peer(mon, addr, &peer)) goto exit; peer->is_up = true; head = peer_head(peer); if (head == self) mon_update_local_domain(mon); mon_assign_roles(mon, head); exit: write_unlock_bh(&mon->lock); } void tipc_mon_peer_down(struct net *net, u32 addr, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self; struct tipc_peer *peer, *head; struct tipc_mon_domain *dom; int applied; if (!mon) return; self = get_self(net, bearer_id); write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer) { pr_warn("Mon: unknown link %x/%u DOWN\n", addr, bearer_id); goto exit; } applied = peer->applied; peer->applied = 0; dom = peer->domain; peer->domain = NULL; if (peer->is_head) mon_identify_lost_members(peer, dom, applied); kfree(dom); peer->is_up = false; peer->is_head = false; peer->is_local = false; peer->down_cnt = 0; head = peer_head(peer); if (head == self) mon_update_local_domain(mon); mon_assign_roles(mon, head); exit: write_unlock_bh(&mon->lock); } /* tipc_mon_rcv - process monitor domain event message */ void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr, struct tipc_mon_state *state, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_mon_domain *arrv_dom = data; struct tipc_mon_domain dom_bef; struct tipc_mon_domain *dom; struct tipc_peer *peer; u16 new_member_cnt = mon_le16_to_cpu(arrv_dom->member_cnt); int new_dlen = dom_rec_len(arrv_dom, new_member_cnt); u16 new_gen = mon_le16_to_cpu(arrv_dom->gen); u16 acked_gen = mon_le16_to_cpu(arrv_dom->ack_gen); u16 arrv_dlen = mon_le16_to_cpu(arrv_dom->len); bool probing = state->probing; int i, applied_bef; state->probing = false; /* Sanity check received domain record */ if (new_member_cnt > MAX_MON_DOMAIN) return; if (dlen < dom_rec_len(arrv_dom, 0)) return; if (dlen != dom_rec_len(arrv_dom, new_member_cnt)) return; if (dlen < new_dlen || arrv_dlen != new_dlen) return; /* Synch generation numbers with peer if link just came up */ if (!state->synched) { state->peer_gen = new_gen - 1; state->acked_gen = acked_gen; state->synched = true; } if (more(acked_gen, state->acked_gen)) state->acked_gen = acked_gen; /* Drop duplicate unless we are waiting for a probe response */ if (!more(new_gen, state->peer_gen) && !probing) return; write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer || !peer->is_up) goto exit; /* Peer is confirmed, stop any ongoing probing */ peer->down_cnt = 0; /* Task is done for duplicate record */ if (!more(new_gen, state->peer_gen)) goto exit; state->peer_gen = new_gen; /* Cache current domain record for later use */ dom_bef.member_cnt = 0; dom = peer->domain; if (dom) memcpy(&dom_bef, dom, dom->len); /* Transform and store received domain record */ if (!dom || (dom->len < new_dlen)) { kfree(dom); dom = kmalloc(new_dlen, GFP_ATOMIC); peer->domain = dom; if (!dom) goto exit; } dom->len = new_dlen; dom->gen = new_gen; dom->member_cnt = new_member_cnt; dom->up_map = mon_le64_to_cpu(arrv_dom->up_map); for (i = 0; i < new_member_cnt; i++) dom->members[i] = mon_le32_to_cpu(arrv_dom->members[i]); /* Update peers affected by this domain record */ applied_bef = peer->applied; mon_apply_domain(mon, peer); mon_identify_lost_members(peer, &dom_bef, applied_bef); mon_assign_roles(mon, peer_head(peer)); exit: write_unlock_bh(&mon->lock); } void tipc_mon_prep(struct net *net, void *data, int *dlen, struct tipc_mon_state *state, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_mon_domain *dom = data; u16 gen = mon->dom_gen; u16 len; /* Send invalid record if not active */ if (!tipc_mon_is_active(net, mon)) { dom->len = 0; return; } /* Send only a dummy record with ack if peer has acked our last sent */ if (likely(state->acked_gen == gen)) { len = dom_rec_len(dom, 0); *dlen = len; dom->len = mon_cpu_to_le16(len); dom->gen = mon_cpu_to_le16(gen); dom->ack_gen = mon_cpu_to_le16(state->peer_gen); dom->member_cnt = 0; return; } /* Send the full record */ read_lock_bh(&mon->lock); len = mon_le16_to_cpu(mon->cache.len); *dlen = len; memcpy(data, &mon->cache, len); read_unlock_bh(&mon->lock); dom->ack_gen = mon_cpu_to_le16(state->peer_gen); } void tipc_mon_get_state(struct net *net, u32 addr, struct tipc_mon_state *state, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *peer; if (!tipc_mon_is_active(net, mon)) { state->probing = false; state->monitoring = true; return; } /* Used cached state if table has not changed */ if (!state->probing && (state->list_gen == mon->list_gen) && (state->acked_gen == mon->dom_gen)) return; read_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (peer) { state->probing = state->acked_gen != mon->dom_gen; state->probing |= peer->down_cnt; state->reset |= peer->down_cnt >= MAX_PEER_DOWN_EVENTS; state->monitoring = peer->is_local; state->monitoring |= peer->is_head; state->list_gen = mon->list_gen; } read_unlock_bh(&mon->lock); } static void mon_timeout(struct timer_list *t) { struct tipc_monitor *mon = timer_container_of(mon, t, timer); struct tipc_peer *self; int best_member_cnt = dom_size(mon->peer_cnt) - 1; write_lock_bh(&mon->lock); self = mon->self; if (self && (best_member_cnt != self->applied)) { mon_update_local_domain(mon); mon_assign_roles(mon, self); } write_unlock_bh(&mon->lock); mod_timer(&mon->timer, jiffies + mon->timer_intv); } int tipc_mon_create(struct net *net, int bearer_id) { struct tipc_net *tn = tipc_net(net); struct tipc_monitor *mon; struct tipc_peer *self; struct tipc_mon_domain *dom; if (tn->monitors[bearer_id]) return 0; mon = kzalloc_obj(*mon, GFP_ATOMIC); self = kzalloc_obj(*self, GFP_ATOMIC); dom = kzalloc_obj(*dom, GFP_ATOMIC); if (!mon || !self || !dom) { kfree(mon); kfree(self); kfree(dom); return -ENOMEM; } tn->monitors[bearer_id] = mon; rwlock_init(&mon->lock); mon->net = net; mon->peer_cnt = 1; mon->self = self; self->domain = dom; self->addr = tipc_own_addr(net); self->is_up = true; self->is_head = true; INIT_LIST_HEAD(&self->list); timer_setup(&mon->timer, mon_timeout, 0); mon->timer_intv = msecs_to_jiffies(MON_TIMEOUT + (tn->random & 0xffff)); mod_timer(&mon->timer, jiffies + mon->timer_intv); return 0; } void tipc_mon_delete(struct net *net, int bearer_id) { struct tipc_net *tn = tipc_net(net); struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self; struct tipc_peer *peer, *tmp; if (!mon) return; self = get_self(net, bearer_id); write_lock_bh(&mon->lock); tn->monitors[bearer_id] = NULL; list_for_each_entry_safe(peer, tmp, &self->list, list) { list_del(&peer->list); hlist_del(&peer->hash); kfree(peer->domain); kfree(peer); } mon->self = NULL; write_unlock_bh(&mon->lock); timer_shutdown_sync(&mon->timer); kfree(self->domain); kfree(self); kfree(mon); } void tipc_mon_reinit_self(struct net *net) { struct tipc_monitor *mon; int bearer_id; for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { mon = tipc_monitor(net, bearer_id); if (!mon) continue; write_lock_bh(&mon->lock); if (mon->self) mon->self->addr = tipc_own_addr(net); write_unlock_bh(&mon->lock); } } int tipc_nl_monitor_set_threshold(struct net *net, u32 cluster_size) { struct tipc_net *tn = tipc_net(net); if (cluster_size > TIPC_CLUSTER_SIZE) return -EINVAL; tn->mon_threshold = cluster_size; return 0; } int tipc_nl_monitor_get_threshold(struct net *net) { struct tipc_net *tn = tipc_net(net); return tn->mon_threshold; } static int __tipc_nl_add_monitor_peer(struct tipc_peer *peer, struct tipc_nl_msg *msg) { struct tipc_mon_domain *dom = peer->domain; struct nlattr *attrs; void *hdr; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_MON_PEER_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON_PEER); if (!attrs) goto msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_ADDR, peer->addr)) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_APPLIED, peer->applied)) goto attr_msg_full; if (peer->is_up) if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_UP)) goto attr_msg_full; if (peer->is_local) if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_LOCAL)) goto attr_msg_full; if (peer->is_head) if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_HEAD)) goto attr_msg_full; if (dom) { if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_DOMGEN, dom->gen)) goto attr_msg_full; if (nla_put_u64_64bit(msg->skb, TIPC_NLA_MON_PEER_UPMAP, dom->up_map, TIPC_NLA_MON_PEER_PAD)) goto attr_msg_full; if (nla_put(msg->skb, TIPC_NLA_MON_PEER_MEMBERS, dom->member_cnt * sizeof(u32), &dom->members)) goto attr_msg_full; } nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_add_monitor_peer(struct net *net, struct tipc_nl_msg *msg, u32 bearer_id, u32 *prev_node) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *peer; if (!mon) return -EINVAL; read_lock_bh(&mon->lock); peer = mon->self; do { if (*prev_node) { if (peer->addr == *prev_node) *prev_node = 0; else continue; } if (__tipc_nl_add_monitor_peer(peer, msg)) { *prev_node = peer->addr; read_unlock_bh(&mon->lock); return -EMSGSIZE; } } while ((peer = peer_nxt(peer)) != mon->self); read_unlock_bh(&mon->lock); return 0; } int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg, u32 bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); char bearer_name[TIPC_MAX_BEARER_NAME]; struct nlattr *attrs; void *hdr; int ret; ret = tipc_bearer_get_name(net, bearer_name, bearer_id); if (ret || !mon) return 0; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_MON_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON); if (!attrs) goto msg_full; read_lock_bh(&mon->lock); if (nla_put_u32(msg->skb, TIPC_NLA_MON_REF, bearer_id)) goto attr_msg_full; if (tipc_mon_is_active(net, mon)) if (nla_put_flag(msg->skb, TIPC_NLA_MON_ACTIVE)) goto attr_msg_full; if (nla_put_string(msg->skb, TIPC_NLA_MON_BEARER_NAME, bearer_name)) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEERCNT, mon->peer_cnt)) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_MON_LISTGEN, mon->list_gen)) goto attr_msg_full; read_unlock_bh(&mon->lock); nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: read_unlock_bh(&mon->lock); nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; }
2713 2712 947 911 12 75 2016 492 493 49 2459 32 285 2459 40 1547 40 1514 12 6 6 6 6 6 71 762 760 71 12 12 160 296 295 20 1153 672 1050 14 14 38 38 38 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FS_NOTIFY_H #define _LINUX_FS_NOTIFY_H /* * include/linux/fsnotify.h - generic hooks for filesystem notification, to * reduce in-source duplication from both dnotify and inotify. * * We don't compile any of this away in some complicated menagerie of ifdefs. * Instead, we rely on the code inside to optimize away as needed. * * (C) Copyright 2005 Robert Love */ #include <linux/fsnotify_backend.h> #include <linux/audit.h> #include <linux/slab.h> #include <linux/bug.h> /* Are there any inode/mount/sb objects watched with priority prio or above? */ static inline bool fsnotify_sb_has_priority_watchers(struct super_block *sb, int prio) { struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); /* Were any marks ever added to any object on this sb? */ if (!sbinfo) return false; return atomic_long_read(&sbinfo->watched_objects[prio]); } /* Are there any inode/mount/sb objects that are being watched at all? */ static inline bool fsnotify_sb_has_watchers(struct super_block *sb) { return fsnotify_sb_has_priority_watchers(sb, 0); } /* * Notify this @dir inode about a change in a child directory entry. * The directory entry may have turned positive or negative or its inode may * have changed (i.e. renamed over). * * Unlike fsnotify_parent(), the event will be reported regardless of the * FS_EVENT_ON_CHILD mask on the parent inode and will not be reported if only * the child is interested and not the parent. */ static inline int fsnotify_name(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie) { if (!fsnotify_sb_has_watchers(dir->i_sb)) return 0; return fsnotify(mask, data, data_type, dir, name, NULL, cookie); } static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry, __u32 mask) { fsnotify_name(mask, dentry, FSNOTIFY_EVENT_DENTRY, dir, &dentry->d_name, 0); } static inline void fsnotify_inode(struct inode *inode, __u32 mask) { if (!fsnotify_sb_has_watchers(inode->i_sb)) return; if (S_ISDIR(inode->i_mode)) mask |= FS_ISDIR; fsnotify(mask, inode, FSNOTIFY_EVENT_INODE, NULL, NULL, inode, 0); } /* Notify this dentry's parent about a child's events. */ static inline int fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { struct inode *inode = d_inode(dentry); if (!fsnotify_sb_has_watchers(inode->i_sb)) return 0; if (S_ISDIR(inode->i_mode)) { mask |= FS_ISDIR; /* sb/mount marks are not interested in name of directory */ if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) goto notify_child; } /* disconnected dentry cannot notify parent */ if (IS_ROOT(dentry)) goto notify_child; return __fsnotify_parent(dentry, mask, data, data_type); notify_child: return fsnotify(mask, data, data_type, NULL, NULL, inode, 0); } /* * Simple wrappers to consolidate calls to fsnotify_parent() when an event * is on a file/dentry. */ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask) { fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY); } static inline int fsnotify_path(const struct path *path, __u32 mask) { return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } static inline int fsnotify_file(struct file *file, __u32 mask) { /* * FMODE_NONOTIFY are fds generated by fanotify itself which should not * generate new events. We also don't want to generate events for * FMODE_PATH fds (involves open & close events) as they are just * handle creation / destruction events and not "real" file events. */ if (FMODE_FSNOTIFY_NONE(file->f_mode)) return 0; return fsnotify_path(&file->f_path, mask); } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS int fsnotify_open_perm_and_set_mode(struct file *file); /* * fsnotify_file_area_perm - permission hook before access to file range */ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, const loff_t *ppos, size_t count) { /* * filesystem may be modified in the context of permission events * (e.g. by HSM filling a file on access), so sb freeze protection * must not be held. */ lockdep_assert_once(file_write_not_started(file)); if (!(perm_mask & (MAY_READ | MAY_WRITE | MAY_ACCESS))) return 0; /* * read()/write() and other types of access generate pre-content events. */ if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { int ret = fsnotify_pre_content(&file->f_path, ppos, count); if (ret) return ret; } if (!(perm_mask & MAY_READ) || likely(!FMODE_FSNOTIFY_ACCESS_PERM(file->f_mode))) return 0; /* * read() also generates the legacy FS_ACCESS_PERM event, so content * scanners can inspect the content filled by pre-content event. */ return fsnotify_path(&file->f_path, FS_ACCESS_PERM); } /* * fsnotify_mmap_perm - permission hook before mmap of file range */ static inline int fsnotify_mmap_perm(struct file *file, int prot, const loff_t off, size_t len) { /* * mmap() generates only pre-content events. */ if (!file || likely(!FMODE_FSNOTIFY_HSM(file->f_mode))) return 0; return fsnotify_pre_content(&file->f_path, &off, len); } /* * fsnotify_truncate_perm - permission hook before file truncate */ static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) { struct inode *inode = d_inode(path->dentry); if (!(inode->i_sb->s_iflags & SB_I_ALLOW_HSM) || !fsnotify_sb_has_priority_watchers(inode->i_sb, FSNOTIFY_PRIO_PRE_CONTENT)) return 0; return fsnotify_pre_content(path, &length, 0); } /* * fsnotify_file_perm - permission hook before file access (unknown range) */ static inline int fsnotify_file_perm(struct file *file, int perm_mask) { return fsnotify_file_area_perm(file, perm_mask, NULL, 0); } #else static inline int fsnotify_open_perm_and_set_mode(struct file *file) { return 0; } static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, const loff_t *ppos, size_t count) { return 0; } static inline int fsnotify_mmap_perm(struct file *file, int prot, const loff_t off, size_t len) { return 0; } static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) { return 0; } static inline int fsnotify_file_perm(struct file *file, int perm_mask) { return 0; } #endif /* * fsnotify_link_count - inode's link count changed */ static inline void fsnotify_link_count(struct inode *inode) { fsnotify_inode(inode, FS_ATTRIB); } /* * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir */ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, const struct qstr *old_name, int isdir, struct inode *target, struct dentry *moved) { struct inode *source = moved->d_inode; u32 fs_cookie = fsnotify_get_cookie(); __u32 old_dir_mask = FS_MOVED_FROM; __u32 new_dir_mask = FS_MOVED_TO; __u32 rename_mask = FS_RENAME; const struct qstr *new_name = &moved->d_name; if (isdir) { old_dir_mask |= FS_ISDIR; new_dir_mask |= FS_ISDIR; rename_mask |= FS_ISDIR; } /* Event with information about both old and new parent+name */ fsnotify_name(rename_mask, moved, FSNOTIFY_EVENT_DENTRY, old_dir, old_name, 0); fsnotify_name(old_dir_mask, source, FSNOTIFY_EVENT_INODE, old_dir, old_name, fs_cookie); fsnotify_name(new_dir_mask, source, FSNOTIFY_EVENT_INODE, new_dir, new_name, fs_cookie); if (target) fsnotify_link_count(target); fsnotify_inode(source, FS_MOVE_SELF); audit_inode_child(new_dir, moved, AUDIT_TYPE_CHILD_CREATE); } /* * fsnotify_inode_delete - and inode is being evicted from cache, clean up is needed */ static inline void fsnotify_inode_delete(struct inode *inode) { __fsnotify_inode_delete(inode); } /* * fsnotify_vfsmount_delete - a vfsmount is being destroyed, clean up is needed */ static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt) { __fsnotify_vfsmount_delete(mnt); } static inline void fsnotify_mntns_delete(struct mnt_namespace *mntns) { __fsnotify_mntns_delete(mntns); } /* * fsnotify_inoderemove - an inode is going away */ static inline void fsnotify_inoderemove(struct inode *inode) { fsnotify_inode(inode, FS_DELETE_SELF); __fsnotify_inode_delete(inode); } /* * fsnotify_create - 'name' was linked in * * Caller must make sure that dentry->d_name is stable. * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate * ->d_inode later */ static inline void fsnotify_create(struct inode *dir, struct dentry *dentry) { audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE); fsnotify_dirent(dir, dentry, FS_CREATE); } /* * fsnotify_link - new hardlink in 'inode' directory * * Caller must make sure that new_dentry->d_name is stable. * Note: We have to pass also the linked inode ptr as some filesystems leave * new_dentry->d_inode NULL and instantiate inode pointer later */ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct dentry *new_dentry) { fsnotify_link_count(inode); audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE); fsnotify_name(FS_CREATE, inode, FSNOTIFY_EVENT_INODE, dir, &new_dentry->d_name, 0); } /* * fsnotify_delete - @dentry was unlinked and unhashed * * Caller must make sure that dentry->d_name is stable. * * Note: unlike fsnotify_unlink(), we have to pass also the unlinked inode * as this may be called after d_delete() and old_dentry may be negative. */ static inline void fsnotify_delete(struct inode *dir, struct inode *inode, struct dentry *dentry) { __u32 mask = FS_DELETE; if (S_ISDIR(inode->i_mode)) mask |= FS_ISDIR; fsnotify_name(mask, inode, FSNOTIFY_EVENT_INODE, dir, &dentry->d_name, 0); } /** * d_delete_notify - delete a dentry and call fsnotify_delete() * @dentry: The dentry to delete * * This helper is used to guaranty that the unlinked inode cannot be found * by lookup of this name after fsnotify_delete() event has been delivered. */ static inline void d_delete_notify(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); ihold(inode); d_delete(dentry); fsnotify_delete(dir, inode, dentry); iput(inode); } /* * fsnotify_unlink - 'name' was unlinked * * Caller must make sure that dentry->d_name is stable. */ static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry) { if (WARN_ON_ONCE(d_is_negative(dentry))) return; fsnotify_delete(dir, d_inode(dentry), dentry); } /* * fsnotify_mkdir - directory 'name' was created * * Caller must make sure that dentry->d_name is stable. * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate * ->d_inode later */ static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry) { audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE); fsnotify_dirent(dir, dentry, FS_CREATE | FS_ISDIR); } /* * fsnotify_rmdir - directory 'name' was removed * * Caller must make sure that dentry->d_name is stable. */ static inline void fsnotify_rmdir(struct inode *dir, struct dentry *dentry) { if (WARN_ON_ONCE(d_is_negative(dentry))) return; fsnotify_delete(dir, d_inode(dentry), dentry); } /* * fsnotify_access - file was read */ static inline void fsnotify_access(struct file *file) { fsnotify_file(file, FS_ACCESS); } /* * fsnotify_modify - file was modified */ static inline void fsnotify_modify(struct file *file) { fsnotify_file(file, FS_MODIFY); } /* * fsnotify_open - file was opened */ static inline void fsnotify_open(struct file *file) { __u32 mask = FS_OPEN; if (file->f_flags & __FMODE_EXEC) mask |= FS_OPEN_EXEC; fsnotify_file(file, mask); } /* * fsnotify_close - file was closed */ static inline void fsnotify_close(struct file *file) { __u32 mask = (file->f_mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE; fsnotify_file(file, mask); } /* * fsnotify_xattr - extended attributes were changed */ static inline void fsnotify_xattr(struct dentry *dentry) { fsnotify_dentry(dentry, FS_ATTRIB); } /* * fsnotify_change - notify_change event. file was modified and/or metadata * was changed. */ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) { __u32 mask = 0; if (ia_valid & ATTR_UID) mask |= FS_ATTRIB; if (ia_valid & ATTR_GID) mask |= FS_ATTRIB; if (ia_valid & ATTR_SIZE) mask |= FS_MODIFY; /* both times implies a utime(s) call */ if ((ia_valid & (ATTR_ATIME | ATTR_MTIME)) == (ATTR_ATIME | ATTR_MTIME)) mask |= FS_ATTRIB; else if (ia_valid & ATTR_ATIME) mask |= FS_ACCESS; else if (ia_valid & ATTR_MTIME) mask |= FS_MODIFY; if (ia_valid & ATTR_MODE) mask |= FS_ATTRIB; if (mask) fsnotify_dentry(dentry, mask); } static inline void fsnotify_mnt_attach(struct mnt_namespace *ns, struct vfsmount *mnt) { fsnotify_mnt(FS_MNT_ATTACH, ns, mnt); } static inline void fsnotify_mnt_detach(struct mnt_namespace *ns, struct vfsmount *mnt) { fsnotify_mnt(FS_MNT_DETACH, ns, mnt); } static inline void fsnotify_mnt_move(struct mnt_namespace *ns, struct vfsmount *mnt) { fsnotify_mnt(FS_MNT_MOVE, ns, mnt); } #endif /* _LINUX_FS_NOTIFY_H */
17 534 22 12 7 92 6 27 27 27 1 1 68 22 12 12 12 12 5 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PAGEMAP_H #define _LINUX_PAGEMAP_H /* * Copyright 1995 Linus Torvalds */ #include <linux/mm.h> #include <linux/fs.h> #include <linux/list.h> #include <linux/highmem.h> #include <linux/compiler.h> #include <linux/uaccess.h> #include <linux/gfp.h> #include <linux/bitops.h> #include <linux/hardirq.h> /* for in_interrupt() */ #include <linux/hugetlb_inline.h> struct folio_batch; unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); static inline void invalidate_remote_inode(struct inode *inode) { if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) invalidate_mapping_pages(inode->i_mapping, 0, -1); } int invalidate_inode_pages2(struct address_space *mapping); int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end); int kiocb_invalidate_pages(struct kiocb *iocb, size_t count); void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count); int filemap_invalidate_pages(struct address_space *mapping, loff_t pos, loff_t end, bool nowait); int write_inode_now(struct inode *, int sync); int filemap_fdatawrite(struct address_space *); int filemap_flush(struct address_space *); int filemap_flush_nr(struct address_space *mapping, long *nr_to_write); int filemap_fdatawait_keep_errors(struct address_space *mapping); int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); int filemap_fdatawait_range_keep_errors(struct address_space *mapping, loff_t start_byte, loff_t end_byte); int filemap_invalidate_inode(struct inode *inode, bool flush, loff_t start, loff_t end); static inline int filemap_fdatawait(struct address_space *mapping) { return filemap_fdatawait_range(mapping, 0, LLONG_MAX); } bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end); int filemap_check_errors(struct address_space *mapping); void __filemap_set_wb_err(struct address_space *mapping, int err); int kiocb_write_and_wait(struct kiocb *iocb, size_t count); static inline int filemap_write_and_wait(struct address_space *mapping) { return filemap_write_and_wait_range(mapping, 0, LLONG_MAX); } /** * filemap_set_wb_err - set a writeback error on an address_space * @mapping: mapping in which to set writeback error * @err: error to be set in mapping * * When writeback fails in some way, we must record that error so that * userspace can be informed when fsync and the like are called. We endeavor * to report errors on any file that was open at the time of the error. Some * internal callers also need to know when writeback errors have occurred. * * When a writeback error occurs, most filesystems will want to call * filemap_set_wb_err to record the error in the mapping so that it will be * automatically reported whenever fsync is called on the file. */ static inline void filemap_set_wb_err(struct address_space *mapping, int err) { /* Fastpath for common case of no error */ if (unlikely(err)) __filemap_set_wb_err(mapping, err); } /** * filemap_check_wb_err - has an error occurred since the mark was sampled? * @mapping: mapping to check for writeback errors * @since: previously-sampled errseq_t * * Grab the errseq_t value from the mapping, and see if it has changed "since" * the given value was sampled. * * If it has then report the latest error set, otherwise return 0. */ static inline int filemap_check_wb_err(struct address_space *mapping, errseq_t since) { return errseq_check(&mapping->wb_err, since); } /** * filemap_sample_wb_err - sample the current errseq_t to test for later errors * @mapping: mapping to be sampled * * Writeback errors are always reported relative to a particular sample point * in the past. This function provides those sample points. */ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) { return errseq_sample(&mapping->wb_err); } /** * file_sample_sb_err - sample the current errseq_t to test for later errors * @file: file pointer to be sampled * * Grab the most current superblock-level errseq_t value for the given * struct file. */ static inline errseq_t file_sample_sb_err(struct file *file) { return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); } /* * Flush file data before changing attributes. Caller must hold any locks * required to prevent further writes to this file until we're done setting * flags. */ static inline int inode_drain_writes(struct inode *inode) { inode_dio_wait(inode); return filemap_write_and_wait(inode->i_mapping); } static inline bool mapping_empty(const struct address_space *mapping) { return xa_empty(&mapping->i_pages); } /* * mapping_shrinkable - test if page cache state allows inode reclaim * @mapping: the page cache mapping * * This checks the mapping's cache state for the pupose of inode * reclaim and LRU management. * * The caller is expected to hold the i_lock, but is not required to * hold the i_pages lock, which usually protects cache state. That's * because the i_lock and the list_lru lock that protect the inode and * its LRU state don't nest inside the irq-safe i_pages lock. * * Cache deletions are performed under the i_lock, which ensures that * when an inode goes empty, it will reliably get queued on the LRU. * * Cache additions do not acquire the i_lock and may race with this * check, in which case we'll report the inode as shrinkable when it * has cache pages. This is okay: the shrinker also checks the * refcount and the referenced bit, which will be elevated or set in * the process of adding new cache pages to an inode. */ static inline bool mapping_shrinkable(const struct address_space *mapping) { void *head; /* * On highmem systems, there could be lowmem pressure from the * inodes before there is highmem pressure from the page * cache. Make inodes shrinkable regardless of cache state. */ if (IS_ENABLED(CONFIG_HIGHMEM)) return true; /* Cache completely empty? Shrink away. */ head = rcu_access_pointer(mapping->i_pages.xa_head); if (!head) return true; /* * The xarray stores single offset-0 entries directly in the * head pointer, which allows non-resident page cache entries * to escape the shadow shrinker's list of xarray nodes. The * inode shrinker needs to pick them up under memory pressure. */ if (!xa_is_node(head) && xa_is_value(head)) return true; return false; } /* * Bits in mapping->flags. */ enum mapping_flags { AS_EIO = 0, /* IO error on async write */ AS_ENOSPC = 1, /* ENOSPC on async write */ AS_MM_ALL_LOCKS = 2, /* under mm_take_all_locks() */ AS_UNEVICTABLE = 3, /* e.g., ramdisk, SHM_LOCK */ AS_EXITING = 4, /* final truncate in progress */ /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, AS_RELEASE_ALWAYS = 6, /* Call ->release_folio(), even if no private data */ AS_STABLE_WRITES = 7, /* must wait for writeback before modifying folio contents */ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9, AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't account usage to user cgroups */ AS_NO_DATA_INTEGRITY = 11, /* no data integrity guarantees */ /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, AS_FOLIO_ORDER_MIN = 16, AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS, }; #define AS_FOLIO_ORDER_BITS_MASK ((1u << AS_FOLIO_ORDER_BITS) - 1) #define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MIN) #define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MAX) #define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK) /** * mapping_set_error - record a writeback error in the address_space * @mapping: the mapping in which an error should be set * @error: the error to set in the mapping * * When writeback fails in some way, we must record that error so that * userspace can be informed when fsync and the like are called. We endeavor * to report errors on any file that was open at the time of the error. Some * internal callers also need to know when writeback errors have occurred. * * When a writeback error occurs, most filesystems will want to call * mapping_set_error to record the error in the mapping so that it can be * reported when the application calls fsync(2). */ static inline void mapping_set_error(struct address_space *mapping, int error) { if (likely(!error)) return; /* Record in wb_err for checkers using errseq_t based tracking */ __filemap_set_wb_err(mapping, error); /* Record it in superblock */ if (mapping->host) errseq_set(&mapping->host->i_sb->s_wb_err, error); /* Record it in flags for now, for legacy callers */ if (error == -ENOSPC) set_bit(AS_ENOSPC, &mapping->flags); else set_bit(AS_EIO, &mapping->flags); } static inline void mapping_set_unevictable(struct address_space *mapping) { set_bit(AS_UNEVICTABLE, &mapping->flags); } static inline void mapping_clear_unevictable(struct address_space *mapping) { clear_bit(AS_UNEVICTABLE, &mapping->flags); } static inline bool mapping_unevictable(const struct address_space *mapping) { return mapping && test_bit(AS_UNEVICTABLE, &mapping->flags); } static inline void mapping_set_exiting(struct address_space *mapping) { set_bit(AS_EXITING, &mapping->flags); } static inline int mapping_exiting(const struct address_space *mapping) { return test_bit(AS_EXITING, &mapping->flags); } static inline void mapping_set_no_writeback_tags(struct address_space *mapping) { set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } static inline int mapping_use_writeback_tags(const struct address_space *mapping) { return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } static inline bool mapping_release_always(const struct address_space *mapping) { return test_bit(AS_RELEASE_ALWAYS, &mapping->flags); } static inline void mapping_set_release_always(struct address_space *mapping) { set_bit(AS_RELEASE_ALWAYS, &mapping->flags); } static inline void mapping_clear_release_always(struct address_space *mapping) { clear_bit(AS_RELEASE_ALWAYS, &mapping->flags); } static inline bool mapping_stable_writes(const struct address_space *mapping) { return test_bit(AS_STABLE_WRITES, &mapping->flags); } static inline void mapping_set_stable_writes(struct address_space *mapping) { set_bit(AS_STABLE_WRITES, &mapping->flags); } static inline void mapping_clear_stable_writes(struct address_space *mapping) { clear_bit(AS_STABLE_WRITES, &mapping->flags); } static inline void mapping_set_inaccessible(struct address_space *mapping) { /* * It's expected inaccessible mappings are also unevictable. Compaction * migrate scanner (isolate_migratepages_block()) relies on this to * reduce page locking. */ set_bit(AS_UNEVICTABLE, &mapping->flags); set_bit(AS_INACCESSIBLE, &mapping->flags); } static inline bool mapping_inaccessible(const struct address_space *mapping) { return test_bit(AS_INACCESSIBLE, &mapping->flags); } static inline void mapping_set_writeback_may_deadlock_on_reclaim(struct address_space *mapping) { set_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct address_space *mapping) { return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } static inline void mapping_set_no_data_integrity(struct address_space *mapping) { set_bit(AS_NO_DATA_INTEGRITY, &mapping->flags); } static inline bool mapping_no_data_integrity(const struct address_space *mapping) { return test_bit(AS_NO_DATA_INTEGRITY, &mapping->flags); } static inline gfp_t mapping_gfp_mask(const struct address_space *mapping) { return mapping->gfp_mask; } /* Restricts the given gfp_mask to what the mapping allows. */ static inline gfp_t mapping_gfp_constraint(const struct address_space *mapping, gfp_t gfp_mask) { return mapping_gfp_mask(mapping) & gfp_mask; } /* * This is non-atomic. Only to be used before the mapping is activated. * Probably needs a barrier... */ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) { m->gfp_mask = mask; } /* * There are some parts of the kernel which assume that PMD entries * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, * limit the maximum allocation order to PMD size. I'm not aware of any * assumptions about maximum order if THP are disabled, but 8 seems like * a good order (that's 1MB if you're using 4kB pages) */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define PREFERRED_MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER #else #define PREFERRED_MAX_PAGECACHE_ORDER 8 #endif /* * xas_split_alloc() does not support arbitrary orders. This implies no * 512MB THP on ARM64 with 64KB base page size. */ #define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1) #define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER) /* * mapping_max_folio_size_supported() - Check the max folio size supported * * The filesystem should call this function at mount time if there is a * requirement on the folio mapping size in the page cache. */ static inline size_t mapping_max_folio_size_supported(void) { if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return 1U << (PAGE_SHIFT + MAX_PAGECACHE_ORDER); return PAGE_SIZE; } /* * mapping_set_folio_order_range() - Set the orders supported by a file. * @mapping: The address space of the file. * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive). * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive). * * The filesystem should call this function in its inode constructor to * indicate which base size (min) and maximum size (max) of folio the VFS * can use to cache the contents of the file. This should only be used * if the filesystem needs special handling of folio sizes (ie there is * something the core cannot know). * Do not tune it based on, eg, i_size. * * Context: This should not be called while the inode is active as it * is non-atomic. */ static inline void mapping_set_folio_order_range(struct address_space *mapping, unsigned int min, unsigned int max) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; if (min > MAX_PAGECACHE_ORDER) min = MAX_PAGECACHE_ORDER; if (max > MAX_PAGECACHE_ORDER) max = MAX_PAGECACHE_ORDER; if (max < min) max = min; mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) | (min << AS_FOLIO_ORDER_MIN) | (max << AS_FOLIO_ORDER_MAX); } static inline void mapping_set_folio_min_order(struct address_space *mapping, unsigned int min) { mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER); } /** * mapping_set_large_folios() - Indicate the file supports large folios. * @mapping: The address space of the file. * * The filesystem should call this function in its inode constructor to * indicate that the VFS can use large folios to cache the contents of * the file. * * Context: This should not be called while the inode is active as it * is non-atomic. */ static inline void mapping_set_large_folios(struct address_space *mapping) { mapping_set_folio_order_range(mapping, 0, MAX_PAGECACHE_ORDER); } static inline unsigned int mapping_max_folio_order(const struct address_space *mapping) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return 0; return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX; } static inline unsigned int mapping_min_folio_order(const struct address_space *mapping) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return 0; return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN; } static inline unsigned long mapping_min_folio_nrpages(const struct address_space *mapping) { return 1UL << mapping_min_folio_order(mapping); } static inline unsigned long mapping_min_folio_nrbytes(const struct address_space *mapping) { return mapping_min_folio_nrpages(mapping) << PAGE_SHIFT; } /** * mapping_align_index() - Align index for this mapping. * @mapping: The address_space. * @index: The page index. * * The index of a folio must be naturally aligned. If you are adding a * new folio to the page cache and need to know what index to give it, * call this function. */ static inline pgoff_t mapping_align_index(const struct address_space *mapping, pgoff_t index) { return round_down(index, mapping_min_folio_nrpages(mapping)); } /* * Large folio support currently depends on THP. These dependencies are * being worked on but are not yet fixed. */ static inline bool mapping_large_folio_support(const struct address_space *mapping) { /* AS_FOLIO_ORDER is only reasonable for pagecache folios */ VM_WARN_ONCE((unsigned long)mapping & FOLIO_MAPPING_ANON, "Anonymous mapping always supports large folio"); return mapping_max_folio_order(mapping) > 0; } /* Return the maximum folio size for this pagecache mapping, in bytes. */ static inline size_t mapping_max_folio_size(const struct address_space *mapping) { return PAGE_SIZE << mapping_max_folio_order(mapping); } static inline int filemap_nr_thps(const struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS return atomic_read(&mapping->nr_thps); #else return 0; #endif } static inline void filemap_nr_thps_inc(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS if (!mapping_large_folio_support(mapping)) atomic_inc(&mapping->nr_thps); #else WARN_ON_ONCE(mapping_large_folio_support(mapping) == 0); #endif } static inline void filemap_nr_thps_dec(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS if (!mapping_large_folio_support(mapping)) atomic_dec(&mapping->nr_thps); #else WARN_ON_ONCE(mapping_large_folio_support(mapping) == 0); #endif } struct address_space *folio_mapping(const struct folio *folio); /** * folio_flush_mapping - Find the file mapping this folio belongs to. * @folio: The folio. * * For folios which are in the page cache, return the mapping that this * page belongs to. Anonymous folios return NULL, even if they're in * the swap cache. Other kinds of folio also return NULL. * * This is ONLY used by architecture cache flushing code. If you aren't * writing cache flushing code, you want either folio_mapping() or * folio_file_mapping(). */ static inline struct address_space *folio_flush_mapping(struct folio *folio) { if (unlikely(folio_test_swapcache(folio))) return NULL; return folio_mapping(folio); } /** * folio_inode - Get the host inode for this folio. * @folio: The folio. * * For folios which are in the page cache, return the inode that this folio * belongs to. * * Do not call this for folios which aren't in the page cache. */ static inline struct inode *folio_inode(struct folio *folio) { return folio->mapping->host; } /** * folio_attach_private - Attach private data to a folio. * @folio: Folio to attach data to. * @data: Data to attach to folio. * * Attaching private data to a folio increments the page's reference count. * The data must be detached before the folio will be freed. */ static inline void folio_attach_private(struct folio *folio, void *data) { folio_get(folio); folio->private = data; folio_set_private(folio); } /** * folio_change_private - Change private data on a folio. * @folio: Folio to change the data on. * @data: Data to set on the folio. * * Change the private data attached to a folio and return the old * data. The page must previously have had data attached and the data * must be detached before the folio will be freed. * * Return: Data that was previously attached to the folio. */ static inline void *folio_change_private(struct folio *folio, void *data) { void *old = folio_get_private(folio); folio->private = data; return old; } /** * folio_detach_private - Detach private data from a folio. * @folio: Folio to detach data from. * * Removes the data that was previously attached to the folio and decrements * the refcount on the page. * * Return: Data that was attached to the folio. */ static inline void *folio_detach_private(struct folio *folio) { void *data = folio_get_private(folio); if (!folio_test_private(folio)) return NULL; folio_clear_private(folio); folio->private = NULL; folio_put(folio); return data; } static inline void attach_page_private(struct page *page, void *data) { folio_attach_private(page_folio(page), data); } static inline void *detach_page_private(struct page *page) { return folio_detach_private(page_folio(page)); } #ifdef CONFIG_NUMA struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, struct mempolicy *policy); #else static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, struct mempolicy *policy) { return folio_alloc_noprof(gfp, order); } #endif #define filemap_alloc_folio(...) \ alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__)) static inline struct page *__page_cache_alloc(gfp_t gfp) { return &filemap_alloc_folio(gfp, 0, NULL)->page; } static inline gfp_t readahead_gfp_mask(struct address_space *x) { return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN; } typedef int filler_t(struct file *, struct folio *); pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan); pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan); /** * typedef fgf_t - Flags for getting folios from the page cache. * * Most users of the page cache will not need to use these flags; * there are convenience functions such as filemap_get_folio() and * filemap_lock_folio(). For users which need more control over exactly * what is done with the folios, these flags to __filemap_get_folio() * are available. * * * %FGP_ACCESSED - The folio will be marked accessed. * * %FGP_LOCK - The folio is returned locked. * * %FGP_CREAT - If no folio is present then a new folio is allocated, * added to the page cache and the VM's LRU list. The folio is * returned locked. * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the * folio is already in cache. If the folio was allocated, unlock it * before returning so the caller can do the same dance. * * %FGP_WRITE - The folio will be written to by the caller. * * %FGP_NOFS - __GFP_FS will get cleared in gfp. * * %FGP_NOWAIT - Don't block on the folio lock. * * %FGP_STABLE - Wait for the folio to be stable (finished writeback) * * %FGP_DONTCACHE - Uncached buffered IO * * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin() * implementation. */ typedef unsigned int __bitwise fgf_t; #define FGP_ACCESSED ((__force fgf_t)0x00000001) #define FGP_LOCK ((__force fgf_t)0x00000002) #define FGP_CREAT ((__force fgf_t)0x00000004) #define FGP_WRITE ((__force fgf_t)0x00000008) #define FGP_NOFS ((__force fgf_t)0x00000010) #define FGP_NOWAIT ((__force fgf_t)0x00000020) #define FGP_FOR_MMAP ((__force fgf_t)0x00000040) #define FGP_STABLE ((__force fgf_t)0x00000080) #define FGP_DONTCACHE ((__force fgf_t)0x00000100) #define FGF_GET_ORDER(fgf) (((__force unsigned)fgf) >> 26) /* top 6 bits */ #define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE) static inline unsigned int filemap_get_order(size_t size) { unsigned int shift = ilog2(size); if (shift <= PAGE_SHIFT) return 0; return shift - PAGE_SHIFT; } /** * fgf_set_order - Encode a length in the fgf_t flags. * @size: The suggested size of the folio to create. * * The caller of __filemap_get_folio() can use this to suggest a preferred * size for the folio that is created. If there is already a folio at * the index, it will be returned, no matter what its size. If a folio * is freshly created, it may be of a different size than requested * due to alignment constraints, memory pressure, or the presence of * other folios at nearby indices. */ static inline fgf_t fgf_set_order(size_t size) { unsigned int order = filemap_get_order(size); if (!order) return 0; return (__force fgf_t)(order << 26); } void *filemap_get_entry(struct address_space *mapping, pgoff_t index); struct folio *__filemap_get_folio_mpol(struct address_space *mapping, pgoff_t index, fgf_t fgf_flags, gfp_t gfp, struct mempolicy *policy); struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp); static inline struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, fgf_t fgf_flags, gfp_t gfp) { return __filemap_get_folio_mpol(mapping, index, fgf_flags, gfp, NULL); } /** * write_begin_get_folio - Get folio for write_begin with flags. * @iocb: The kiocb passed from write_begin (may be NULL). * @mapping: The address space to search. * @index: The page cache index. * @len: Length of data being written. * * This is a helper for filesystem write_begin() implementations. * It wraps __filemap_get_folio(), setting appropriate flags in * the write begin context. * * Return: A folio or an ERR_PTR. */ static inline struct folio *write_begin_get_folio(const struct kiocb *iocb, struct address_space *mapping, pgoff_t index, size_t len) { fgf_t fgp_flags = FGP_WRITEBEGIN; fgp_flags |= fgf_set_order(len); if (iocb && iocb->ki_flags & IOCB_DONTCACHE) fgp_flags |= FGP_DONTCACHE; return __filemap_get_folio(mapping, index, fgp_flags, mapping_gfp_mask(mapping)); } /** * filemap_get_folio - Find and get a folio. * @mapping: The address_space to search. * @index: The page index. * * Looks up the page cache entry at @mapping & @index. If a folio is * present, it is returned with an increased refcount. * * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for * this index. Will not return a shadow, swap or DAX entry. */ static inline struct folio *filemap_get_folio(struct address_space *mapping, pgoff_t index) { return __filemap_get_folio(mapping, index, 0, 0); } /** * filemap_lock_folio - Find and lock a folio. * @mapping: The address_space to search. * @index: The page index. * * Looks up the page cache entry at @mapping & @index. If a folio is * present, it is returned locked with an increased refcount. * * Context: May sleep. * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for * this index. Will not return a shadow, swap or DAX entry. */ static inline struct folio *filemap_lock_folio(struct address_space *mapping, pgoff_t index) { return __filemap_get_folio(mapping, index, FGP_LOCK, 0); } /** * filemap_grab_folio - grab a folio from the page cache * @mapping: The address space to search * @index: The page index * * Looks up the page cache entry at @mapping & @index. If no folio is found, * a new folio is created. The folio is locked, marked as accessed, and * returned. * * Return: A found or created folio. ERR_PTR(-ENOMEM) if no folio is found * and failed to create a folio. */ static inline struct folio *filemap_grab_folio(struct address_space *mapping, pgoff_t index) { return __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping_gfp_mask(mapping)); } /** * find_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index * * Looks up the page cache slot at @mapping & @offset. If there is a * page cache page, it is returned with an increased refcount. * * Otherwise, %NULL is returned. */ static inline struct page *find_get_page(struct address_space *mapping, pgoff_t offset) { return pagecache_get_page(mapping, offset, 0, 0); } static inline struct page *find_get_page_flags(struct address_space *mapping, pgoff_t offset, fgf_t fgp_flags) { return pagecache_get_page(mapping, offset, fgp_flags, 0); } /** * find_lock_page - locate, pin and lock a pagecache page * @mapping: the address_space to search * @index: the page index * * Looks up the page cache entry at @mapping & @index. If there is a * page cache page, it is returned locked and with an increased * refcount. * * Context: May sleep. * Return: A struct page or %NULL if there is no page in the cache for this * index. */ static inline struct page *find_lock_page(struct address_space *mapping, pgoff_t index) { return pagecache_get_page(mapping, index, FGP_LOCK, 0); } /** * find_or_create_page - locate or add a pagecache page * @mapping: the page's address_space * @index: the page's index into the mapping * @gfp_mask: page allocation mode * * Looks up the page cache slot at @mapping & @offset. If there is a * page cache page, it is returned locked and with an increased * refcount. * * If the page is not present, a new page is allocated using @gfp_mask * and added to the page cache and the VM's LRU list. The page is * returned locked and with an increased refcount. * * On memory exhaustion, %NULL is returned. * * find_or_create_page() may sleep, even if @gfp_flags specifies an * atomic allocation! */ static inline struct page *find_or_create_page(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask) { return pagecache_get_page(mapping, index, FGP_LOCK|FGP_ACCESSED|FGP_CREAT, gfp_mask); } /** * grab_cache_page_nowait - returns locked page at given index in given cache * @mapping: target address_space * @index: the page index * * Returns locked page at given index in given cache, creating it if * needed, but do not wait if the page is locked or to reclaim memory. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should * be safe to call while holding the lock for another page. * * Clear __GFP_FS when allocating the page to avoid recursion into the fs * and deadlock against the caller's locked page. */ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) { return pagecache_get_page(mapping, index, FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, mapping_gfp_mask(mapping)); } /** * folio_next_index - Get the index of the next folio. * @folio: The current folio. * * Return: The index of the folio which follows this folio in the file. */ static inline pgoff_t folio_next_index(const struct folio *folio) { return folio->index + folio_nr_pages(folio); } /** * folio_next_pos - Get the file position of the next folio. * @folio: The current folio. * * Return: The position of the folio which follows this folio in the file. */ static inline loff_t folio_next_pos(const struct folio *folio) { return (loff_t)folio_next_index(folio) << PAGE_SHIFT; } /** * folio_file_page - The page for a particular index. * @folio: The folio which contains this index. * @index: The index we want to look up. * * Sometimes after looking up a folio in the page cache, we need to * obtain the specific page for an index (eg a page fault). * * Return: The page containing the file data for this index. */ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) { return folio_page(folio, index & (folio_nr_pages(folio) - 1)); } /** * folio_contains - Does this folio contain this index? * @folio: The folio. * @index: The page index within the file. * * Context: The caller should have the folio locked and ensure * e.g., shmem did not move this folio to the swap cache. * Return: true or false. */ static inline bool folio_contains(const struct folio *folio, pgoff_t index) { VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); return index - folio->index < folio_nr_pages(folio); } unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch); unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); struct folio *read_cache_folio(struct address_space *, pgoff_t index, filler_t *filler, struct file *file); struct folio *mapping_read_folio_gfp(struct address_space *, pgoff_t index, gfp_t flags); struct page *read_cache_page(struct address_space *, pgoff_t index, filler_t *filler, struct file *file); extern struct page * read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); static inline struct page *read_mapping_page(struct address_space *mapping, pgoff_t index, struct file *file) { return read_cache_page(mapping, index, NULL, file); } static inline struct folio *read_mapping_folio(struct address_space *mapping, pgoff_t index, struct file *file) { return read_cache_folio(mapping, index, NULL, file); } /** * page_pgoff - Calculate the logical page offset of this page. * @folio: The folio containing this page. * @page: The page which we need the offset of. * * For file pages, this is the offset from the beginning of the file * in units of PAGE_SIZE. For anonymous pages, this is the offset from * the beginning of the anon_vma in units of PAGE_SIZE. This will * return nonsense for KSM pages. * * Context: Caller must have a reference on the folio or otherwise * prevent it from being split or freed. * * Return: The offset in units of PAGE_SIZE. */ static inline pgoff_t page_pgoff(const struct folio *folio, const struct page *page) { return folio->index + folio_page_idx(folio, page); } /** * folio_pos - Returns the byte position of this folio in its file. * @folio: The folio. */ static inline loff_t folio_pos(const struct folio *folio) { return ((loff_t)folio->index) * PAGE_SIZE; } /* * Return byte-offset into filesystem object for page. */ static inline loff_t page_offset(struct page *page) { struct folio *folio = page_folio(page); return folio_pos(folio) + folio_page_idx(folio, page) * PAGE_SIZE; } /* * Get the offset in PAGE_SIZE (even for hugetlb folios). */ static inline pgoff_t folio_pgoff(const struct folio *folio) { return folio->index; } static inline pgoff_t linear_page_index(const struct vm_area_struct *vma, const unsigned long address) { pgoff_t pgoff; pgoff = (address - vma->vm_start) >> PAGE_SHIFT; pgoff += vma->vm_pgoff; return pgoff; } struct wait_page_key { struct folio *folio; int bit_nr; int page_match; }; struct wait_page_queue { struct folio *folio; int bit_nr; wait_queue_entry_t wait; }; static inline bool wake_page_match(struct wait_page_queue *wait_page, struct wait_page_key *key) { if (wait_page->folio != key->folio) return false; key->page_match = 1; if (wait_page->bit_nr != key->bit_nr) return false; return true; } void __folio_lock(struct folio *folio); int __folio_lock_killable(struct folio *folio); vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf); void unlock_page(struct page *page); void folio_unlock(struct folio *folio); /** * folio_trylock() - Attempt to lock a folio. * @folio: The folio to attempt to lock. * * Sometimes it is undesirable to wait for a folio to be unlocked (eg * when the locks are being taken in the wrong order, or if making * progress through a batch of folios is more important than processing * them in order). Usually folio_lock() is the correct function to call. * * Context: Any context. * Return: Whether the lock was successfully acquired. */ static inline bool folio_trylock(struct folio *folio) { return likely(!test_and_set_bit_lock(PG_locked, folio_flags(folio, 0))); } /* * Return true if the page was successfully locked */ static inline bool trylock_page(struct page *page) { return folio_trylock(page_folio(page)); } /** * folio_lock() - Lock this folio. * @folio: The folio to lock. * * The folio lock protects against many things, probably more than it * should. It is primarily held while a folio is being brought uptodate, * either from its backing file or from swap. It is also held while a * folio is being truncated from its address_space, so holding the lock * is sufficient to keep folio->mapping stable. * * The folio lock is also held while write() is modifying the page to * provide POSIX atomicity guarantees (as long as the write does not * cross a page boundary). Other modifications to the data in the folio * do not hold the folio lock and can race with writes, eg DMA and stores * to mapped pages. * * Context: May sleep. If you need to acquire the locks of two or * more folios, they must be in order of ascending index, if they are * in the same address_space. If they are in different address_spaces, * acquire the lock of the folio which belongs to the address_space which * has the lowest address in memory first. */ static inline void folio_lock(struct folio *folio) { might_sleep(); if (!folio_trylock(folio)) __folio_lock(folio); } /** * lock_page() - Lock the folio containing this page. * @page: The page to lock. * * See folio_lock() for a description of what the lock protects. * This is a legacy function and new code should probably use folio_lock() * instead. * * Context: May sleep. Pages in the same folio share a lock, so do not * attempt to lock two pages which share a folio. */ static inline void lock_page(struct page *page) { struct folio *folio; might_sleep(); folio = page_folio(page); if (!folio_trylock(folio)) __folio_lock(folio); } /** * folio_lock_killable() - Lock this folio, interruptible by a fatal signal. * @folio: The folio to lock. * * Attempts to lock the folio, like folio_lock(), except that the sleep * to acquire the lock is interruptible by a fatal signal. * * Context: May sleep; see folio_lock(). * Return: 0 if the lock was acquired; -EINTR if a fatal signal was received. */ static inline int folio_lock_killable(struct folio *folio) { might_sleep(); if (!folio_trylock(folio)) return __folio_lock_killable(folio); return 0; } /* * folio_lock_or_retry - Lock the folio, unless this would block and the * caller indicated that it can handle a retry. * * Return value and mmap_lock implications depend on flags; see * __folio_lock_or_retry(). */ static inline vm_fault_t folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf) { might_sleep(); if (!folio_trylock(folio)) return __folio_lock_or_retry(folio, vmf); return 0; } /* * This is exported only for folio_wait_locked/folio_wait_writeback, etc., * and should not be used directly. */ void folio_wait_bit(struct folio *folio, int bit_nr); int folio_wait_bit_killable(struct folio *folio, int bit_nr); /* * Wait for a folio to be unlocked. * * This must be called with the caller "holding" the folio, * ie with increased folio reference count so that the folio won't * go away during the wait. */ static inline void folio_wait_locked(struct folio *folio) { if (folio_test_locked(folio)) folio_wait_bit(folio, PG_locked); } static inline int folio_wait_locked_killable(struct folio *folio) { if (!folio_test_locked(folio)) return 0; return folio_wait_bit_killable(folio, PG_locked); } void folio_end_read(struct folio *folio, bool success); void wait_on_page_writeback(struct page *page); void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); void end_page_writeback(struct page *page); void folio_end_writeback(struct folio *folio); void folio_end_writeback_no_dropbehind(struct folio *folio); void folio_end_dropbehind(struct folio *folio); void folio_wait_stable(struct folio *folio); void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn); void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb); void __folio_cancel_dirty(struct folio *folio); static inline void folio_cancel_dirty(struct folio *folio) { /* Avoid atomic ops, locking, etc. when not actually needed. */ if (folio_test_dirty(folio)) __folio_cancel_dirty(folio); } bool folio_clear_dirty_for_io(struct folio *folio); bool clear_page_dirty_for_io(struct page *page); void folio_invalidate(struct folio *folio, size_t offset, size_t length); bool noop_dirty_folio(struct address_space *mapping, struct folio *folio); #ifdef CONFIG_MIGRATION int filemap_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode); #else #define filemap_migrate_folio NULL #endif void folio_end_private_2(struct folio *folio); void folio_wait_private_2(struct folio *folio); int folio_wait_private_2_killable(struct folio *folio); /* * Fault in userspace address range. */ size_t fault_in_writeable(char __user *uaddr, size_t size); size_t fault_in_subpage_writeable(char __user *uaddr, size_t size); size_t fault_in_safe_writeable(const char __user *uaddr, size_t size); size_t fault_in_readable(const char __user *uaddr, size_t size); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp); int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp); void filemap_remove_folio(struct folio *folio); void __filemap_remove_folio(struct folio *folio, void *shadow); void replace_page_cache_folio(struct folio *old, struct folio *new); void delete_from_page_cache_batch(struct address_space *mapping, struct folio_batch *fbatch); bool filemap_release_folio(struct folio *folio, gfp_t gfp); loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end, int whence); /* Must be non-static for BPF error injection */ int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp); bool filemap_range_has_writeback(struct address_space *mapping, loff_t start_byte, loff_t end_byte); /** * filemap_range_needs_writeback - check if range potentially needs writeback * @mapping: address space within which to check * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Find at least one page in the range supplied, usually used to check if * direct writing in this range will trigger a writeback. Used by O_DIRECT * read/write with IOCB_NOWAIT, to see if the caller needs to do * filemap_write_and_wait_range() before proceeding. * * Return: %true if the caller should do filemap_write_and_wait_range() before * doing O_DIRECT to a page in this range, %false otherwise. */ static inline bool filemap_range_needs_writeback(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { if (!mapping->nrpages) return false; if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) return false; return filemap_range_has_writeback(mapping, start_byte, end_byte); } /** * struct readahead_control - Describes a readahead request. * * A readahead request is for consecutive pages. Filesystems which * implement the ->readahead method should call readahead_folio() or * __readahead_batch() in a loop and attempt to start reads into each * folio in the request. * * Most of the fields in this struct are private and should be accessed * by the functions below. * * @file: The file, used primarily by network filesystems for authentication. * May be NULL if invoked internally by the filesystem. * @mapping: Readahead this filesystem object. * @ra: File readahead state. May be NULL. */ struct readahead_control { struct file *file; struct address_space *mapping; struct file_ra_state *ra; /* private: use the readahead_* accessors instead */ pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; bool dropbehind; bool _workingset; unsigned long _pflags; }; #define DEFINE_READAHEAD(ractl, f, r, m, i) \ struct readahead_control ractl = { \ .file = f, \ .mapping = m, \ .ra = r, \ ._index = i, \ } #define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) void page_cache_ra_unbounded(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_count); void page_cache_sync_ra(struct readahead_control *, unsigned long req_count); void page_cache_async_ra(struct readahead_control *, struct folio *, unsigned long req_count); void readahead_expand(struct readahead_control *ractl, loff_t new_start, size_t new_len); /** * page_cache_sync_readahead - generic file readahead * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @file: Used by the filesystem for authentication. * @index: Index of first page to be read. * @req_count: Total number of pages being read by the caller. * * page_cache_sync_readahead() should be called when a cache miss happened: * it will submit the read. The readahead logic may decide to piggyback more * pages onto the read request if access patterns suggest it will improve * performance. */ static inline void page_cache_sync_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t index, unsigned long req_count) { DEFINE_READAHEAD(ractl, file, ra, mapping, index); page_cache_sync_ra(&ractl, req_count); } /** * page_cache_async_readahead - file readahead for marked pages * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @file: Used by the filesystem for authentication. * @folio: The folio which triggered the readahead call. * @req_count: Total number of pages being read by the caller. * * page_cache_async_readahead() should be called when a page is used which * is marked as PageReadahead; this is a marker to suggest that the application * has used up enough of the readahead window that we should start pulling in * more pages. */ static inline void page_cache_async_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *file, struct folio *folio, unsigned long req_count) { DEFINE_READAHEAD(ractl, file, ra, mapping, folio->index); page_cache_async_ra(&ractl, folio, req_count); } static inline struct folio *__readahead_folio(struct readahead_control *ractl) { struct folio *folio; BUG_ON(ractl->_batch_count > ractl->_nr_pages); ractl->_nr_pages -= ractl->_batch_count; ractl->_index += ractl->_batch_count; if (!ractl->_nr_pages) { ractl->_batch_count = 0; return NULL; } folio = xa_load(&ractl->mapping->i_pages, ractl->_index); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); ractl->_batch_count = folio_nr_pages(folio); return folio; } /** * readahead_folio - Get the next folio to read. * @ractl: The current readahead request. * * Context: The folio is locked. The caller should unlock the folio once * all I/O to that folio has completed. * Return: A pointer to the next folio, or %NULL if we are done. */ static inline struct folio *readahead_folio(struct readahead_control *ractl) { struct folio *folio = __readahead_folio(ractl); if (folio) folio_put(folio); return folio; } static inline unsigned int __readahead_batch(struct readahead_control *rac, struct page **array, unsigned int array_sz) { unsigned int i = 0; XA_STATE(xas, &rac->mapping->i_pages, 0); struct folio *folio; BUG_ON(rac->_batch_count > rac->_nr_pages); rac->_nr_pages -= rac->_batch_count; rac->_index += rac->_batch_count; rac->_batch_count = 0; xas_set(&xas, rac->_index); rcu_read_lock(); xas_for_each(&xas, folio, rac->_index + rac->_nr_pages - 1) { if (xas_retry(&xas, folio)) continue; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); array[i++] = folio_page(folio, 0); rac->_batch_count += folio_nr_pages(folio); if (i == array_sz) break; } rcu_read_unlock(); return i; } /** * readahead_pos - The byte offset into the file of this readahead request. * @rac: The readahead request. */ static inline loff_t readahead_pos(const struct readahead_control *rac) { return (loff_t)rac->_index * PAGE_SIZE; } /** * readahead_length - The number of bytes in this readahead request. * @rac: The readahead request. */ static inline size_t readahead_length(const struct readahead_control *rac) { return rac->_nr_pages * PAGE_SIZE; } /** * readahead_index - The index of the first page in this readahead request. * @rac: The readahead request. */ static inline pgoff_t readahead_index(const struct readahead_control *rac) { return rac->_index; } /** * readahead_count - The number of pages in this readahead request. * @rac: The readahead request. */ static inline unsigned int readahead_count(const struct readahead_control *rac) { return rac->_nr_pages; } /** * readahead_batch_length - The number of bytes in the current batch. * @rac: The readahead request. */ static inline size_t readahead_batch_length(const struct readahead_control *rac) { return rac->_batch_count * PAGE_SIZE; } static inline unsigned long dir_pages(const struct inode *inode) { return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; } /** * folio_mkwrite_check_truncate - check if folio was truncated * @folio: the folio to check * @inode: the inode to check the folio against * * Return: the number of bytes in the folio up to EOF, * or -EFAULT if the folio was truncated. */ static inline ssize_t folio_mkwrite_check_truncate(const struct folio *folio, const struct inode *inode) { loff_t size = i_size_read(inode); pgoff_t index = size >> PAGE_SHIFT; size_t offset = offset_in_folio(folio, size); if (!folio->mapping) return -EFAULT; /* folio is wholly inside EOF */ if (folio_next_index(folio) - 1 < index) return folio_size(folio); /* folio is wholly past EOF */ if (folio->index > index || !offset) return -EFAULT; /* folio is partially inside EOF */ return offset; } /** * i_blocks_per_folio - How many blocks fit in this folio. * @inode: The inode which contains the blocks. * @folio: The folio. * * If the block size is larger than the size of this folio, return zero. * * Context: The caller should hold a refcount on the folio to prevent it * from being split. * Return: The number of filesystem blocks covered by this folio. */ static inline unsigned int i_blocks_per_folio(const struct inode *inode, const struct folio *folio) { return folio_size(folio) >> inode->i_blkbits; } #endif /* _LINUX_PAGEMAP_H */
729 1493 2076 2080 3379 3378 2686 282 1502 1502 33 3380 3384 3379 3377 315 313 3378 314 3382 3384 3382 1521 2684 2683 2684 1517 1517 1521 1520 3 1518 1521 1519 1517 1521 309 1516 1518 1514 1519 1521 1519 1516 306 1520 1517 1521 1519 1521 1517 1520 1520 1520 1494 1488 659 1489 26 213 1518 1520 312 1521 1521 1521 1218 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "SMP alternatives: " fmt #include <linux/mmu_context.h> #include <linux/perf_event.h> #include <linux/vmalloc.h> #include <linux/memory.h> #include <linux/execmem.h> #include <asm/text-patching.h> #include <asm/insn.h> #include <asm/insn-eval.h> #include <asm/ibt.h> #include <asm/set_memory.h> #include <asm/nmi.h> int __read_mostly alternatives_patched; EXPORT_SYMBOL_GPL(alternatives_patched); #define MAX_PATCH_LEN (255-1) #define DA_ALL (~0) #define DA_ALT 0x01 #define DA_RET 0x02 #define DA_RETPOLINE 0x04 #define DA_ENDBR 0x08 #define DA_SMP 0x10 static unsigned int debug_alternative; static int __init debug_alt(char *str) { if (str && *str == '=') str++; if (!str || kstrtouint(str, 0, &debug_alternative)) debug_alternative = DA_ALL; return 1; } __setup("debug-alternative", debug_alt); static int noreplace_smp; static int __init setup_noreplace_smp(char *str) { noreplace_smp = 1; return 1; } __setup("noreplace-smp", setup_noreplace_smp); #define DPRINTK(type, fmt, args...) \ do { \ if (debug_alternative & DA_##type) \ printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \ } while (0) #define DUMP_BYTES(type, buf, len, fmt, args...) \ do { \ if (unlikely(debug_alternative & DA_##type)) { \ int j; \ \ if (!(len)) \ break; \ \ printk(KERN_DEBUG pr_fmt(fmt), ##args); \ for (j = 0; j < (len) - 1; j++) \ printk(KERN_CONT "%02hhx ", buf[j]); \ printk(KERN_CONT "%02hhx\n", buf[j]); \ } \ } while (0) static const unsigned char x86nops[] = { BYTES_NOP1, BYTES_NOP2, BYTES_NOP3, BYTES_NOP4, BYTES_NOP5, BYTES_NOP6, BYTES_NOP7, BYTES_NOP8, #ifdef CONFIG_64BIT BYTES_NOP9, BYTES_NOP10, BYTES_NOP11, #endif }; const unsigned char * const x86_nops[ASM_NOP_MAX+1] = { NULL, x86nops, x86nops + 1, x86nops + 1 + 2, x86nops + 1 + 2 + 3, x86nops + 1 + 2 + 3 + 4, x86nops + 1 + 2 + 3 + 4 + 5, x86nops + 1 + 2 + 3 + 4 + 5 + 6, x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, #ifdef CONFIG_64BIT x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9, x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10, #endif }; #ifdef CONFIG_FINEIBT static bool cfi_paranoid __ro_after_init; #endif #ifdef CONFIG_MITIGATION_ITS #ifdef CONFIG_MODULES static struct module *its_mod; #endif static void *its_page; static unsigned int its_offset; struct its_array its_pages; static void *__its_alloc(struct its_array *pages) { void *page __free(execmem) = execmem_alloc_rw(EXECMEM_MODULE_TEXT, PAGE_SIZE); if (!page) return NULL; void *tmp = krealloc(pages->pages, (pages->num+1) * sizeof(void *), GFP_KERNEL); if (!tmp) return NULL; pages->pages = tmp; pages->pages[pages->num++] = page; return no_free_ptr(page); } /* Initialize a thunk with the "jmp *reg; int3" instructions. */ static void *its_init_thunk(void *thunk, int reg) { u8 *bytes = thunk; int offset = 0; int i = 0; #ifdef CONFIG_FINEIBT if (cfi_paranoid) { /* * When ITS uses indirect branch thunk the fineibt_paranoid * caller sequence doesn't fit in the caller site. So put the * remaining part of the sequence (UDB + JNE) into the ITS * thunk. */ bytes[i++] = 0xd6; /* UDB */ bytes[i++] = 0x75; /* JNE */ bytes[i++] = 0xfd; offset = 1; } #endif if (reg >= 8) { bytes[i++] = 0x41; /* REX.B prefix */ reg -= 8; } bytes[i++] = 0xff; bytes[i++] = 0xe0 + reg; /* JMP *reg */ bytes[i++] = 0xcc; return thunk + offset; } static void its_pages_protect(struct its_array *pages) { for (int i = 0; i < pages->num; i++) { void *page = pages->pages[i]; execmem_restore_rox(page, PAGE_SIZE); } } static void its_fini_core(void) { if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) its_pages_protect(&its_pages); kfree(its_pages.pages); } #ifdef CONFIG_MODULES void its_init_mod(struct module *mod) { if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) return; mutex_lock(&text_mutex); its_mod = mod; its_page = NULL; } void its_fini_mod(struct module *mod) { if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) return; WARN_ON_ONCE(its_mod != mod); its_mod = NULL; its_page = NULL; mutex_unlock(&text_mutex); if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) its_pages_protect(&mod->arch.its_pages); } void its_free_mod(struct module *mod) { if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) return; for (int i = 0; i < mod->arch.its_pages.num; i++) { void *page = mod->arch.its_pages.pages[i]; execmem_free(page); } kfree(mod->arch.its_pages.pages); } #endif /* CONFIG_MODULES */ static void *its_alloc(void) { struct its_array *pages = &its_pages; void *page; #ifdef CONFIG_MODULES if (its_mod) pages = &its_mod->arch.its_pages; #endif page = __its_alloc(pages); if (!page) return NULL; if (pages == &its_pages) set_memory_x((unsigned long)page, 1); return page; } static void *its_allocate_thunk(int reg) { int size = 3 + (reg / 8); void *thunk; #ifdef CONFIG_FINEIBT /* * The ITS thunk contains an indirect jump and an int3 instruction so * its size is 3 or 4 bytes depending on the register used. If CFI * paranoid is used then 3 extra bytes are added in the ITS thunk to * complete the fineibt_paranoid caller sequence. */ if (cfi_paranoid) size += 3; #endif if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) { its_page = its_alloc(); if (!its_page) { pr_err("ITS page allocation failed\n"); return NULL; } memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE); its_offset = 32; } /* * If the indirect branch instruction will be in the lower half * of a cacheline, then update the offset to reach the upper half. */ if ((its_offset + size - 1) % 64 < 32) its_offset = ((its_offset - 1) | 0x3F) + 33; thunk = its_page + its_offset; its_offset += size; return its_init_thunk(thunk, reg); } u8 *its_static_thunk(int reg) { u8 *thunk = __x86_indirect_its_thunk_array[reg]; #ifdef CONFIG_FINEIBT /* Paranoid thunk starts 2 bytes before */ if (cfi_paranoid) return thunk - 2; #endif return thunk; } #else static inline void its_fini_core(void) {} #endif /* CONFIG_MITIGATION_ITS */ /* * Nomenclature for variable names to simplify and clarify this code and ease * any potential staring at it: * * @instr: source address of the original instructions in the kernel text as * generated by the compiler. * * @buf: temporary buffer on which the patching operates. This buffer is * eventually text-poked into the kernel image. * * @replacement/@repl: pointer to the opcodes which are replacing @instr, located * in the .altinstr_replacement section. */ /* * Fill the buffer with a single effective instruction of size @len. * * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info) * for every single-byte NOP, try to generate the maximally available NOP of * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and * *jump* over instead of executing long and daft NOPs. */ static void add_nop(u8 *buf, unsigned int len) { u8 *target = buf + len; if (!len) return; if (len <= ASM_NOP_MAX) { memcpy(buf, x86_nops[len], len); return; } if (len < 128) { __text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE); buf += JMP8_INSN_SIZE; } else { __text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE); buf += JMP32_INSN_SIZE; } for (;buf < target; buf++) *buf = INT3_INSN_OPCODE; } /* * Find the offset of the first non-NOP instruction starting at @offset * but no further than @len. */ static int skip_nops(u8 *buf, int offset, int len) { struct insn insn; for (; offset < len; offset += insn.length) { if (insn_decode_kernel(&insn, &buf[offset])) break; if (!insn_is_nop(&insn)) break; } return offset; } /* * "noinline" to cause control flow change and thus invalidate I$ and * cause refetch after modification. */ static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len) { for (int next, i = 0; i < len; i = next) { struct insn insn; if (insn_decode_kernel(&insn, &buf[i])) return; next = i + insn.length; if (insn_is_nop(&insn)) { int nop = i; /* Has the NOP already been optimized? */ if (i + insn.length == len) return; next = skip_nops(buf, next, len); add_nop(buf + nop, next - nop); DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next); } } } /* * In this context, "source" is where the instructions are placed in the * section .altinstr_replacement, for example during kernel build by the * toolchain. * "Destination" is where the instructions are being patched in by this * machinery. * * The source offset is: * * src_imm = target - src_next_ip (1) * * and the target offset is: * * dst_imm = target - dst_next_ip (2) * * so rework (1) as an expression for target like: * * target = src_imm + src_next_ip (1a) * * and substitute in (2) to get: * * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3) * * Now, since the instruction stream is 'identical' at src and dst (it * is being copied after all) it can be stated that: * * src_next_ip = src + ip_offset * dst_next_ip = dst + ip_offset (4) * * Substitute (4) in (3) and observe ip_offset being cancelled out to * obtain: * * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset) * = src_imm + src - dst + ip_offset - ip_offset * = src_imm + src - dst (5) * * IOW, only the relative displacement of the code block matters. */ #define apply_reloc_n(n_, p_, d_) \ do { \ s32 v = *(s##n_ *)(p_); \ v += (d_); \ BUG_ON((v >> 31) != (v >> (n_-1))); \ *(s##n_ *)(p_) = (s##n_)v; \ } while (0) static __always_inline void apply_reloc(int n, void *ptr, uintptr_t diff) { switch (n) { case 1: apply_reloc_n(8, ptr, diff); break; case 2: apply_reloc_n(16, ptr, diff); break; case 4: apply_reloc_n(32, ptr, diff); break; default: BUG(); } } static __always_inline bool need_reloc(unsigned long offset, u8 *src, size_t src_len) { u8 *target = src + offset; /* * If the target is inside the patched block, it's relative to the * block itself and does not need relocation. */ return (target < src || target > src + src_len); } static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) { for (int next, i = 0; i < instrlen; i = next) { struct insn insn; if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i]))) return; next = i + insn.length; switch (insn.opcode.bytes[0]) { case 0x0f: if (insn.opcode.bytes[1] < 0x80 || insn.opcode.bytes[1] > 0x8f) break; fallthrough; /* Jcc.d32 */ case 0x70 ... 0x7f: /* Jcc.d8 */ case JMP8_INSN_OPCODE: case JMP32_INSN_OPCODE: case CALL_INSN_OPCODE: if (need_reloc(next + insn.immediate.value, repl, repl_len)) { apply_reloc(insn.immediate.nbytes, buf + i + insn_offset_immediate(&insn), repl - instr); } /* * Where possible, convert JMP.d32 into JMP.d8. */ if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) { s32 imm = insn.immediate.value; imm += repl - instr; imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE; if ((imm >> 31) == (imm >> 7)) { buf[i+0] = JMP8_INSN_OPCODE; buf[i+1] = (s8)imm; memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2); } } break; } if (insn_rip_relative(&insn)) { if (need_reloc(next + insn.displacement.value, repl, repl_len)) { apply_reloc(insn.displacement.nbytes, buf + i + insn_offset_displacement(&insn), repl - instr); } } } } void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) { __apply_relocation(buf, instr, instrlen, repl, repl_len); optimize_nops(instr, buf, instrlen); } /* Low-level backend functions usable from alternative code replacements. */ DEFINE_ASM_FUNC(nop_func, "", .entry.text); EXPORT_SYMBOL_GPL(nop_func); noinstr void BUG_func(void) { BUG(); } EXPORT_SYMBOL(BUG_func); #define CALL_RIP_REL_OPCODE 0xff #define CALL_RIP_REL_MODRM 0x15 /* * Rewrite the "call BUG_func" replacement to point to the target of the * indirect pv_ops call "call *disp(%ip)". */ static unsigned int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) { void *target, *bug = &BUG_func; s32 disp; if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) { pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n"); BUG(); } if (a->instrlen != 6 || instr[0] != CALL_RIP_REL_OPCODE || instr[1] != CALL_RIP_REL_MODRM) { pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n"); BUG(); } /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */ disp = *(s32 *)(instr + 2); #ifdef CONFIG_X86_64 /* ff 15 00 00 00 00 call *0x0(%rip) */ /* target address is stored at "next instruction + disp". */ target = *(void **)(instr + a->instrlen + disp); #else /* ff 15 00 00 00 00 call *0x0 */ /* target address is stored at disp. */ target = *(void **)disp; #endif if (!target) target = bug; /* (BUG_func - .) + (target - BUG_func) := target - . */ *(s32 *)(insn_buff + 1) += target - bug; if (target == &nop_func) return 0; return 5; } static inline u8 * instr_va(struct alt_instr *i) { return (u8 *)&i->instr_offset + i->instr_offset; } struct patch_site { u8 *instr; struct alt_instr *alt; u8 buff[MAX_PATCH_LEN]; u8 len; }; static struct alt_instr * __init_or_module analyze_patch_site(struct patch_site *ps, struct alt_instr *start, struct alt_instr *end) { struct alt_instr *alt = start; ps->instr = instr_va(start); /* * In case of nested ALTERNATIVE()s the outer alternative might add * more padding. To ensure consistent patching find the max padding for * all alt_instr entries for this site (nested alternatives result in * consecutive entries). * Find the last alt_instr eligible for patching at the site. */ for (; alt < end && instr_va(alt) == ps->instr; alt++) { ps->len = max(ps->len, alt->instrlen); BUG_ON(alt->cpuid >= (NCAPINTS + NBUGINTS) * 32); /* * Patch if either: * - feature is present * - feature not present but ALT_FLAG_NOT is set to mean, * patch if feature is *NOT* present. */ if (!boot_cpu_has(alt->cpuid) != !(alt->flags & ALT_FLAG_NOT)) ps->alt = alt; } BUG_ON(ps->len > sizeof(ps->buff)); return alt; } static void __init_or_module prep_patch_site(struct patch_site *ps) { struct alt_instr *alt = ps->alt; u8 buff_sz; u8 *repl; if (!alt) { /* Nothing to patch, use original instruction. */ memcpy(ps->buff, ps->instr, ps->len); return; } repl = (u8 *)&alt->repl_offset + alt->repl_offset; DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x", alt->cpuid >> 5, alt->cpuid & 0x1f, ps->instr, ps->instr, ps->len, repl, alt->replacementlen, alt->flags); memcpy(ps->buff, repl, alt->replacementlen); buff_sz = alt->replacementlen; if (alt->flags & ALT_FLAG_DIRECT_CALL) buff_sz = alt_replace_call(ps->instr, ps->buff, alt); for (; buff_sz < ps->len; buff_sz++) ps->buff[buff_sz] = 0x90; __apply_relocation(ps->buff, ps->instr, ps->len, repl, alt->replacementlen); DUMP_BYTES(ALT, ps->instr, ps->len, "%px: old_insn: ", ps->instr); DUMP_BYTES(ALT, repl, alt->replacementlen, "%px: rpl_insn: ", repl); DUMP_BYTES(ALT, ps->buff, ps->len, "%px: final_insn: ", ps->instr); } static void __init_or_module patch_site(struct patch_site *ps) { optimize_nops(ps->instr, ps->buff, ps->len); text_poke_early(ps->instr, ps->buff, ps->len); } /* * Replace instructions with better alternatives for this CPU type. This runs * before SMP is initialized to avoid SMP problems with self modifying code. * This implies that asymmetric systems where APs have less capabilities than * the boot processor are not handled. Tough. Make sure you disable such * features by hand. * * Marked "noinline" to cause control flow change and thus insn cache * to refetch changed I$ lines. */ void __init_or_module noinline apply_alternatives(struct alt_instr *start, struct alt_instr *end) { struct alt_instr *a; DPRINTK(ALT, "alt table %px, -> %px", start, end); /* * KASAN_SHADOW_START is defined using * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here. * During the process, KASAN becomes confused seeing partial LA57 * conversion and triggers a false-positive out-of-bound report. * * Disable KASAN until the patching is complete. */ kasan_disable_current(); /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. * Some kernel functions (e.g. memcpy, memset, etc) use this order to * patch code. * * So be careful if you want to change the scan order to any other * order. */ a = start; while (a < end) { struct patch_site ps = { .alt = NULL, .len = 0 }; a = analyze_patch_site(&ps, a, end); prep_patch_site(&ps); patch_site(&ps); } kasan_enable_current(); } static inline bool is_jcc32(struct insn *insn) { /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80; } #if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL) /* * [CS]{,3} CALL/JMP *%\reg [INT3]* */ static int emit_indirect(int op, int reg, u8 *bytes, int len) { int cs = 0, bp = 0; int i = 0; u8 modrm; /* * Set @len to the excess bytes after writing the instruction. */ len -= 2 + (reg >= 8); WARN_ON_ONCE(len < 0); switch (op) { case CALL_INSN_OPCODE: modrm = 0x10; /* Reg = 2; CALL r/m */ /* * Additional NOP is better than prefix decode penalty. */ if (len <= 3) cs = len; break; case JMP32_INSN_OPCODE: modrm = 0x20; /* Reg = 4; JMP r/m */ bp = len; break; default: WARN_ON_ONCE(1); return -1; } while (cs--) bytes[i++] = 0x2e; /* CS-prefix */ if (reg >= 8) { bytes[i++] = 0x41; /* REX.B prefix */ reg -= 8; } modrm |= 0xc0; /* Mod = 3 */ modrm += reg; bytes[i++] = 0xff; /* opcode */ bytes[i++] = modrm; while (bp--) bytes[i++] = 0xcc; /* INT3 */ return i; } static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes, void *call_dest, void *jmp_dest) { u8 op = insn->opcode.bytes[0]; int i = 0; /* * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional * tail-calls. Deal with them. */ if (is_jcc32(insn)) { bytes[i++] = op; op = insn->opcode.bytes[1]; goto clang_jcc; } if (insn->length == 6) bytes[i++] = 0x2e; /* CS-prefix */ switch (op) { case CALL_INSN_OPCODE: __text_gen_insn(bytes+i, op, addr+i, call_dest, CALL_INSN_SIZE); i += CALL_INSN_SIZE; break; case JMP32_INSN_OPCODE: clang_jcc: __text_gen_insn(bytes+i, op, addr+i, jmp_dest, JMP32_INSN_SIZE); i += JMP32_INSN_SIZE; break; default: WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr); return -1; } WARN_ON_ONCE(i != insn->length); return i; } static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) { return __emit_trampoline(addr, insn, bytes, __x86_indirect_call_thunk_array[reg], __x86_indirect_jump_thunk_array[reg]); } #ifdef CONFIG_MITIGATION_ITS static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) { u8 *thunk = __x86_indirect_its_thunk_array[reg]; u8 *tmp = its_allocate_thunk(reg); if (tmp) thunk = tmp; return __emit_trampoline(addr, insn, bytes, thunk, thunk); } /* Check if an indirect branch is at ITS-unsafe address */ static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) { if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) return false; /* Indirect branch opcode is 2 or 3 bytes depending on reg */ addr += 1 + reg / 8; /* Lower-half of the cacheline? */ return !(addr & 0x20); } #else /* CONFIG_MITIGATION_ITS */ #ifdef CONFIG_FINEIBT static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) { return false; } #endif #endif /* CONFIG_MITIGATION_ITS */ /* * Rewrite the compiler generated retpoline thunk calls. * * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate * indirect instructions, avoiding the extra indirection. * * For example, convert: * * CALL __x86_indirect_thunk_\reg * * into: * * CALL *%\reg * * It also tries to inline spectre_v2=retpoline,lfence when size permits. */ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) { retpoline_thunk_t *target; int reg, ret, i = 0; u8 op, cc; target = addr + insn->length + insn->immediate.value; reg = target - __x86_indirect_thunk_array; if (WARN_ON_ONCE(reg & ~0xf)) return -1; /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */ BUG_ON(reg == 4); if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) return emit_call_track_retpoline(addr, insn, reg, bytes); return -1; } op = insn->opcode.bytes[0]; /* * Convert: * * Jcc.d32 __x86_indirect_thunk_\reg * * into: * * Jncc.d8 1f * [ LFENCE ] * JMP *%\reg * [ NOP ] * 1: */ if (is_jcc32(insn)) { cc = insn->opcode.bytes[1] & 0xf; cc ^= 1; /* invert condition */ bytes[i++] = 0x70 + cc; /* Jcc.d8 */ bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */ /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */ op = JMP32_INSN_OPCODE; } /* * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE. */ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { bytes[i++] = 0x0f; bytes[i++] = 0xae; bytes[i++] = 0xe8; /* LFENCE */ } #ifdef CONFIG_MITIGATION_ITS /* * Check if the address of last byte of emitted-indirect is in * lower-half of the cacheline. Such branches need ITS mitigation. */ if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg)) return emit_its_trampoline(addr, insn, reg, bytes); #endif ret = emit_indirect(op, reg, bytes + i, insn->length - i); if (ret < 0) return ret; i += ret; for (; i < insn->length;) bytes[i++] = BYTES_NOP1; return i; } /* * Generated by 'objtool --retpoline'. */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; struct insn insn; int len, ret; u8 bytes[16]; u8 op1, op2; u8 *dest; ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) continue; op1 = insn.opcode.bytes[0]; op2 = insn.opcode.bytes[1]; switch (op1) { case 0x70 ... 0x7f: /* Jcc.d8 */ /* See cfi_paranoid. */ WARN_ON_ONCE(cfi_mode != CFI_FINEIBT); continue; case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: /* Check for cfi_paranoid + ITS */ dest = addr + insn.length + insn.immediate.value; if (dest[-1] == 0xd6 && (dest[0] & 0xf0) == 0x70) { WARN_ON_ONCE(cfi_mode != CFI_FINEIBT); continue; } break; case 0x0f: /* escape */ if (op2 >= 0x80 && op2 <= 0x8f) break; fallthrough; default: WARN_ON_ONCE(1); continue; } DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS", addr, addr, insn.length, addr + insn.length + insn.immediate.value); len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { optimize_nops(addr, bytes, len); DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(addr, bytes, len); } } } #ifdef CONFIG_MITIGATION_RETHUNK bool cpu_wants_rethunk(void) { return cpu_feature_enabled(X86_FEATURE_RETHUNK); } bool cpu_wants_rethunk_at(void *addr) { if (!cpu_feature_enabled(X86_FEATURE_RETHUNK)) return false; if (x86_return_thunk != its_return_thunk) return true; return !((unsigned long)addr & 0x20); } /* * Rewrite the compiler generated return thunk tail-calls. * * For example, convert: * * JMP __x86_return_thunk * * into: * * RET */ static int patch_return(void *addr, struct insn *insn, u8 *bytes) { int i = 0; /* Patch the custom return thunks... */ if (cpu_wants_rethunk_at(addr)) { i = JMP32_INSN_SIZE; __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); } else { /* ... or patch them out if not needed. */ bytes[i++] = RET_INSN_OPCODE; } for (; i < insn->length;) bytes[i++] = INT3_INSN_OPCODE; return i; } void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; if (cpu_wants_rethunk()) static_call_force_reinit(); for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; struct insn insn; int len, ret; u8 bytes[16]; u8 op; ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) continue; op = insn.opcode.bytes[0]; if (op == JMP32_INSN_OPCODE) dest = addr + insn.length + insn.immediate.value; if (__static_call_fixup(addr, op, dest) || WARN_ONCE(dest != &__x86_return_thunk, "missing return thunk: %pS-%pS: %*ph", addr, dest, 5, addr)) continue; DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS", addr, addr, insn.length, addr + insn.length + insn.immediate.value); len = patch_return(addr, &insn, bytes); if (len == insn.length) { DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(addr, bytes, len); } } } #else /* !CONFIG_MITIGATION_RETHUNK: */ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #endif /* !CONFIG_MITIGATION_RETHUNK */ #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #endif /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT __noendbr bool is_endbr(u32 *val) { u32 endbr; __get_kernel_nofault(&endbr, val, u32, Efault); return __is_endbr(endbr); Efault: return false; } #ifdef CONFIG_FINEIBT static __noendbr bool exact_endbr(u32 *val) { u32 endbr; __get_kernel_nofault(&endbr, val, u32, Efault); return endbr == gen_endbr(); Efault: return false; } #endif static void poison_cfi(void *addr); static void __init_or_module poison_endbr(void *addr) { u32 poison = gen_endbr_poison(); if (WARN_ON_ONCE(!is_endbr(addr))) return; DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr); /* * When we have IBT, the lack of ENDBR will trigger #CP */ DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); text_poke_early(addr, &poison, 4); } /* * Generated by: objtool --ibt * * Seal the functions for indirect calls by clobbering the ENDBR instructions * and the kCFI hash value. */ void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; poison_endbr(addr); if (IS_ENABLED(CONFIG_FINEIBT)) poison_cfi(addr - CFI_OFFSET); } } #else /* !CONFIG_X86_KERNEL_IBT: */ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } #endif /* !CONFIG_X86_KERNEL_IBT */ #ifdef CONFIG_CFI_AUTO_DEFAULT # define __CFI_DEFAULT CFI_AUTO #elif defined(CONFIG_CFI) # define __CFI_DEFAULT CFI_KCFI #else # define __CFI_DEFAULT CFI_OFF #endif enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; static bool cfi_debug __ro_after_init; #ifdef CONFIG_FINEIBT_BHI bool cfi_bhi __ro_after_init = false; #endif #ifdef CONFIG_CFI u32 cfi_get_func_hash(void *func) { u32 hash; func -= cfi_get_offset(); switch (cfi_mode) { case CFI_FINEIBT: func += 7; break; case CFI_KCFI: func += 1; break; default: return 0; } if (get_kernel_nofault(hash, func)) return 0; return hash; } int cfi_get_func_arity(void *func) { bhi_thunk *target; s32 disp; if (cfi_mode != CFI_FINEIBT && !cfi_bhi) return 0; if (get_kernel_nofault(disp, func - 4)) return 0; target = func + disp; return target - __bhi_args; } #endif #ifdef CONFIG_FINEIBT static bool cfi_rand __ro_after_init = true; static u32 cfi_seed __ro_after_init; /* * Re-hash the CFI hash with a boot-time seed while making sure the result is * not a valid ENDBR instruction. */ static u32 cfi_rehash(u32 hash) { hash ^= cfi_seed; while (unlikely(__is_endbr(hash) || __is_endbr(-hash))) { bool lsb = hash & 1; hash >>= 1; if (lsb) hash ^= 0x80200003; } return hash; } static __init int cfi_parse_cmdline(char *str) { if (!str) return -EINVAL; while (str) { char *next = strchr(str, ','); if (next) { *next = 0; next++; } if (!strcmp(str, "auto")) { cfi_mode = CFI_AUTO; } else if (!strcmp(str, "off")) { cfi_mode = CFI_OFF; cfi_rand = false; } else if (!strcmp(str, "debug")) { cfi_debug = true; } else if (!strcmp(str, "kcfi")) { cfi_mode = CFI_KCFI; } else if (!strcmp(str, "fineibt")) { cfi_mode = CFI_FINEIBT; } else if (!strcmp(str, "norand")) { cfi_rand = false; } else if (!strcmp(str, "warn")) { pr_alert("CFI: mismatch non-fatal!\n"); cfi_warn = true; } else if (!strcmp(str, "paranoid")) { if (cfi_mode == CFI_FINEIBT) { cfi_paranoid = true; } else { pr_err("CFI: ignoring paranoid; depends on fineibt.\n"); } } else if (!strcmp(str, "bhi")) { #ifdef CONFIG_FINEIBT_BHI if (cfi_mode == CFI_FINEIBT) { cfi_bhi = true; } else { pr_err("CFI: ignoring bhi; depends on fineibt.\n"); } #else pr_err("CFI: ignoring bhi; depends on FINEIBT_BHI=y.\n"); #endif } else { pr_err("CFI: Ignoring unknown option (%s).", str); } str = next; } return 0; } early_param("cfi", cfi_parse_cmdline); /* * kCFI FineIBT * * __cfi_\func: __cfi_\func: * movl $0x12345678,%eax // 5 endbr64 // 4 * nop subl $0x12345678,%eax // 5 * nop jne.d32,pn \func+3 // 7 * nop * nop * nop * nop * nop * nop * nop * nop * nop * \func: \func: * endbr64 nopl -42(%rax) * * * caller: caller: * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%eax // 5 * addl $-15(%r11),%r10d // 4 lea -0x10(%r11),%r11 // 4 * je 1f // 2 nop5 // 5 * ud2 // 2 * 1: cs call __x86_indirect_thunk_r11 // 6 call *%r11; nop3; // 6 * * * Notably, the FineIBT sequences are crafted such that branches are presumed * non-taken. This is based on Agner Fog's optimization manual, which states: * * "Make conditional jumps most often not taken: The efficiency and throughput * for not-taken branches is better than for taken branches on most * processors. Therefore, it is good to place the most frequent branch first" */ /* * <fineibt_preamble_start>: * 0: f3 0f 1e fa endbr64 * 4: 2d 78 56 34 12 sub $0x12345678, %eax * 9: 2e 0f 85 03 00 00 00 jne,pn 13 <fineibt_preamble_start+0x13> * 10: 0f 1f 40 d6 nopl -0x2a(%rax) * * Note that the JNE target is the 0xD6 byte inside the NOPL, this decodes as * UDB on x86_64 and raises #UD. */ asm( ".pushsection .rodata \n" "fineibt_preamble_start: \n" " endbr64 \n" " subl $0x12345678, %eax \n" "fineibt_preamble_bhi: \n" " cs jne.d32 fineibt_preamble_start+0x13 \n" "#fineibt_func: \n" " nopl -42(%rax) \n" "fineibt_preamble_end: \n" ".popsection\n" ); extern u8 fineibt_preamble_start[]; extern u8 fineibt_preamble_bhi[]; extern u8 fineibt_preamble_end[]; #define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) #define fineibt_preamble_bhi (fineibt_preamble_bhi - fineibt_preamble_start) #define fineibt_preamble_ud 0x13 #define fineibt_preamble_hash 5 #define fineibt_prefix_size (fineibt_preamble_size - ENDBR_INSN_SIZE) /* * <fineibt_caller_start>: * 0: b8 78 56 34 12 mov $0x12345678, %eax * 5: 4d 8d 5b f0 lea -0x10(%r11), %r11 * 9: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) */ asm( ".pushsection .rodata \n" "fineibt_caller_start: \n" " movl $0x12345678, %eax \n" " lea -0x10(%r11), %r11 \n" ASM_NOP5 "fineibt_caller_end: \n" ".popsection \n" ); extern u8 fineibt_caller_start[]; extern u8 fineibt_caller_end[]; #define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start) #define fineibt_caller_hash 1 #define fineibt_caller_jmp (fineibt_caller_size - 2) /* * Since FineIBT does hash validation on the callee side it is prone to * circumvention attacks where a 'naked' ENDBR instruction exists that * is not part of the fineibt_preamble sequence. * * Notably the x86 entry points must be ENDBR and equally cannot be * fineibt_preamble. * * The fineibt_paranoid caller sequence adds additional caller side * hash validation. This stops such circumvention attacks dead, but at the cost * of adding a load. * * <fineibt_paranoid_start>: * 0: b8 78 56 34 12 mov $0x12345678, %eax * 5: 41 3b 43 f5 cmp -0x11(%r11), %eax * 9: 2e 4d 8d 5b <f0> cs lea -0x10(%r11), %r11 * e: 75 fd jne d <fineibt_paranoid_start+0xd> * 10: 41 ff d3 call *%r11 * 13: 90 nop * * Notably LEA does not modify flags and can be reordered with the CMP, * avoiding a dependency. Again, using a non-taken (backwards) branch * for the failure case, abusing LEA's immediate 0xf0 as LOCK prefix for the * Jcc.d8, causing #UD. */ asm( ".pushsection .rodata \n" "fineibt_paranoid_start: \n" " mov $0x12345678, %eax \n" " cmpl -11(%r11), %eax \n" " cs lea -0x10(%r11), %r11 \n" "#fineibt_caller_size: \n" " jne fineibt_paranoid_start+0xd \n" "fineibt_paranoid_ind: \n" " cs call *%r11 \n" "fineibt_paranoid_end: \n" ".popsection \n" ); extern u8 fineibt_paranoid_start[]; extern u8 fineibt_paranoid_ind[]; extern u8 fineibt_paranoid_end[]; #define fineibt_paranoid_size (fineibt_paranoid_end - fineibt_paranoid_start) #define fineibt_paranoid_ind (fineibt_paranoid_ind - fineibt_paranoid_start) #define fineibt_paranoid_ud 0xd static u32 decode_preamble_hash(void *addr, int *reg) { u8 *p = addr; /* b8+reg 78 56 34 12 movl $0x12345678,\reg */ if (p[0] >= 0xb8 && p[0] < 0xc0) { if (reg) *reg = p[0] - 0xb8; return *(u32 *)(addr + 1); } return 0; /* invalid hash value */ } static u32 decode_caller_hash(void *addr) { u8 *p = addr; /* 41 ba 88 a9 cb ed mov $(-0x12345678),%r10d */ if (p[0] == 0x41 && p[1] == 0xba) return -*(u32 *)(addr + 2); /* e8 0c 88 a9 cb ed jmp.d8 +12 */ if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp) return -*(u32 *)(addr + 2); return 0; /* invalid hash value */ } /* .retpoline_sites */ static int cfi_disable_callers(s32 *start, s32 *end) { /* * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate * in tact for later usage. Also see decode_caller_hash() and * cfi_rewrite_callers(). */ const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp }; s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; u32 hash; addr -= fineibt_caller_size; hash = decode_caller_hash(addr); if (!hash) /* nocfi callers */ continue; text_poke_early(addr, jmp, 2); } return 0; } static int cfi_enable_callers(s32 *start, s32 *end) { /* * Re-enable kCFI, undo what cfi_disable_callers() did. */ const u8 mov[] = { 0x41, 0xba }; s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; u32 hash; addr -= fineibt_caller_size; hash = decode_caller_hash(addr); if (!hash) /* nocfi callers */ continue; text_poke_early(addr, mov, 2); } return 0; } /* .cfi_sites */ static int cfi_rand_preamble(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; u32 hash; hash = decode_preamble_hash(addr, NULL); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; hash = cfi_rehash(hash); text_poke_early(addr + 1, &hash, 4); } return 0; } /* * Inline the bhi-arity 1 case: * * __cfi_foo: * 0: f3 0f 1e fa endbr64 * 4: 2d 78 56 34 12 sub $0x12345678, %eax * 9: 49 0f 45 fa cmovne %rax, %rdi * d: 2e 75 03 jne,pn foo+0x3 * * foo: * 10: 0f 1f 40 <d6> nopl -42(%rax) * * Notably, this scheme is incompatible with permissive CFI * because the CMOVcc is unconditional and RDI will have been * clobbered. */ asm( ".pushsection .rodata \n" "fineibt_bhi1_start: \n" " cmovne %rax, %rdi \n" " cs jne fineibt_bhi1_func + 0x3 \n" "fineibt_bhi1_func: \n" " nopl -42(%rax) \n" "fineibt_bhi1_end: \n" ".popsection \n" ); extern u8 fineibt_bhi1_start[]; extern u8 fineibt_bhi1_end[]; #define fineibt_bhi1_size (fineibt_bhi1_end - fineibt_bhi1_start) static void cfi_fineibt_bhi_preamble(void *addr, int arity) { u8 bytes[MAX_INSN_SIZE]; if (!arity) return; if (!cfi_warn && arity == 1) { text_poke_early(addr + fineibt_preamble_bhi, fineibt_bhi1_start, fineibt_bhi1_size); return; } /* * Replace the bytes at fineibt_preamble_bhi with a CALL instruction * that lines up exactly with the end of the preamble, such that the * return address will be foo+0. * * __cfi_foo: * 0: f3 0f 1e fa endbr64 * 4: 2d 78 56 34 12 sub $0x12345678, %eax * 9: 2e 2e e8 DD DD DD DD cs cs call __bhi_args[arity] */ bytes[0] = 0x2e; bytes[1] = 0x2e; __text_gen_insn(bytes + 2, CALL_INSN_OPCODE, addr + fineibt_preamble_bhi + 2, __bhi_args[arity], CALL_INSN_SIZE); text_poke_early(addr + fineibt_preamble_bhi, bytes, 7); } static int cfi_rewrite_preamble(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; int arity; u32 hash; /* * When the function doesn't start with ENDBR the compiler will * have determined there are no indirect calls to it and we * don't need no CFI either. */ if (!is_endbr(addr + CFI_OFFSET)) continue; hash = decode_preamble_hash(addr, &arity); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; /* * FineIBT relies on being at func-16, so if the preamble is * actually larger than that, place it the tail end. * * NOTE: this is possible with things like DEBUG_CALL_THUNKS * and DEBUG_FORCE_FUNCTION_ALIGN_64B. */ addr += CFI_OFFSET - fineibt_prefix_size; text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); text_poke_early(addr + fineibt_preamble_hash, &hash, 4); WARN_ONCE(!IS_ENABLED(CONFIG_FINEIBT_BHI) && arity, "kCFI preamble has wrong register at: %pS %*ph\n", addr, 5, addr); if (cfi_bhi) cfi_fineibt_bhi_preamble(addr, arity); } return 0; } static void cfi_rewrite_endbr(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; if (!exact_endbr(addr + CFI_OFFSET)) continue; poison_endbr(addr + CFI_OFFSET); } } /* .retpoline_sites */ static int cfi_rand_callers(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; u32 hash; addr -= fineibt_caller_size; hash = decode_caller_hash(addr); if (hash) { hash = -cfi_rehash(hash); text_poke_early(addr + 2, &hash, 4); } } return 0; } static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) { u8 *thunk = (void *)__x86_indirect_its_thunk_array[reg] - 2; #ifdef CONFIG_MITIGATION_ITS u8 *tmp = its_allocate_thunk(reg); if (tmp) thunk = tmp; #endif return __emit_trampoline(addr, insn, bytes, thunk, thunk); } static int cfi_rewrite_callers(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; struct insn insn; u8 bytes[20]; u32 hash; int ret; u8 op; addr -= fineibt_caller_size; hash = decode_caller_hash(addr); if (!hash) continue; if (!cfi_paranoid) { text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); text_poke_early(addr + fineibt_caller_hash, &hash, 4); /* rely on apply_retpolines() */ continue; } /* cfi_paranoid */ ret = insn_decode_kernel(&insn, addr + fineibt_caller_size); if (WARN_ON_ONCE(ret < 0)) continue; op = insn.opcode.bytes[0]; if (op != CALL_INSN_OPCODE && op != JMP32_INSN_OPCODE) { WARN_ON_ONCE(1); continue; } memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size); memcpy(bytes + fineibt_caller_hash, &hash, 4); if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) { emit_paranoid_trampoline(addr + fineibt_caller_size, &insn, 11, bytes + fineibt_caller_size); } else { int len = fineibt_paranoid_size - fineibt_paranoid_ind; ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind, len); if (WARN_ON_ONCE(ret != len)) continue; } text_poke_early(addr, bytes, fineibt_paranoid_size); } return 0; } #define pr_cfi_debug(X...) if (cfi_debug) pr_info(X) #define FINEIBT_WARN(_f, _v) \ WARN_ONCE((_f) != (_v), "FineIBT: " #_f " %ld != %d\n", _f, _v) static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi, bool builtin) { int ret; if (FINEIBT_WARN(fineibt_preamble_size, 20) || FINEIBT_WARN(fineibt_preamble_bhi + fineibt_bhi1_size, 20) || FINEIBT_WARN(fineibt_caller_size, 14) || FINEIBT_WARN(fineibt_paranoid_size, 20) || WARN_ON_ONCE(CFI_OFFSET < fineibt_prefix_size)) return; if (cfi_mode == CFI_AUTO) { cfi_mode = CFI_KCFI; if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) { /* * FRED has much saner context on exception entry and * is less easy to take advantage of. */ if (!cpu_feature_enabled(X86_FEATURE_FRED)) cfi_paranoid = true; cfi_mode = CFI_FINEIBT; } } /* * Rewrite the callers to not use the __cfi_ stubs, such that we might * rewrite them. This disables all CFI. If this succeeds but any of the * later stages fails, we're without CFI. */ pr_cfi_debug("CFI: disabling all indirect call checking\n"); ret = cfi_disable_callers(start_retpoline, end_retpoline); if (ret) goto err; if (cfi_rand) { if (builtin) { cfi_seed = get_random_u32(); cfi_bpf_hash = cfi_rehash(cfi_bpf_hash); cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash); } pr_cfi_debug("CFI: cfi_seed: 0x%08x\n", cfi_seed); pr_cfi_debug("CFI: rehashing all preambles\n"); ret = cfi_rand_preamble(start_cfi, end_cfi); if (ret) goto err; pr_cfi_debug("CFI: rehashing all indirect calls\n"); ret = cfi_rand_callers(start_retpoline, end_retpoline); if (ret) goto err; } else { pr_cfi_debug("CFI: rehashing disabled\n"); } switch (cfi_mode) { case CFI_OFF: if (builtin) pr_info("CFI: disabled\n"); return; case CFI_KCFI: pr_cfi_debug("CFI: re-enabling all indirect call checking\n"); ret = cfi_enable_callers(start_retpoline, end_retpoline); if (ret) goto err; if (builtin) pr_info("CFI: Using %sretpoline kCFI\n", cfi_rand ? "rehashed " : ""); return; case CFI_FINEIBT: pr_cfi_debug("CFI: adding FineIBT to all preambles\n"); /* place the FineIBT preamble at func()-16 */ ret = cfi_rewrite_preamble(start_cfi, end_cfi); if (ret) goto err; /* rewrite the callers to target func()-16 */ pr_cfi_debug("CFI: rewriting indirect call sites to use FineIBT\n"); ret = cfi_rewrite_callers(start_retpoline, end_retpoline); if (ret) goto err; /* now that nobody targets func()+0, remove ENDBR there */ pr_cfi_debug("CFI: removing old endbr insns\n"); cfi_rewrite_endbr(start_cfi, end_cfi); if (builtin) { pr_info("Using %sFineIBT%s CFI\n", cfi_paranoid ? "paranoid " : "", cfi_bhi ? "+BHI" : ""); } return; default: break; } err: pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n"); } static inline void poison_hash(void *addr) { *(u32 *)addr = 0; } static void poison_cfi(void *addr) { /* * Compilers manage to be inconsistent with ENDBR vs __cfi prefixes, * some (static) functions for which they can determine the address * is never taken do not get a __cfi prefix, but *DO* get an ENDBR. * * As such, these functions will get sealed, but we need to be careful * to not unconditionally scribble the previous function. */ switch (cfi_mode) { case CFI_FINEIBT: /* * FineIBT preamble is at func-16. */ addr += CFI_OFFSET - fineibt_prefix_size; /* * FineIBT prefix should start with an ENDBR. */ if (!is_endbr(addr)) break; /* * __cfi_\func: * nopl -42(%rax) * sub $0, %eax * jne \func+3 * \func: * nopl -42(%rax) */ poison_endbr(addr); poison_hash(addr + fineibt_preamble_hash); break; case CFI_KCFI: /* * kCFI prefix should start with a valid hash. */ if (!decode_preamble_hash(addr, NULL)) break; /* * __cfi_\func: * movl $0, %eax * .skip 11, 0x90 */ poison_hash(addr + 1); break; default: break; } } /* * When regs->ip points to a 0xD6 byte in the FineIBT preamble, * return true and fill out target and type. * * We check the preamble by checking for the ENDBR instruction relative to the * UDB instruction. */ static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type) { unsigned long addr = regs->ip - fineibt_preamble_ud; u32 hash; if (!exact_endbr((void *)addr)) return false; *target = addr + fineibt_prefix_size; __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault); *type = (u32)regs->ax + hash; /* * Since regs->ip points to the middle of an instruction; it cannot * continue with the normal fixup. */ regs->ip = *target; return true; Efault: return false; } /* * regs->ip points to one of the UD2 in __bhi_args[]. */ static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 *type) { unsigned long addr; u32 hash; if (!cfi_bhi) return false; if (regs->ip < (unsigned long)__bhi_args || regs->ip >= (unsigned long)__bhi_args_end) return false; /* * Fetch the return address from the stack, this points to the * FineIBT preamble. Since the CALL instruction is in the 5 last * bytes of the preamble, the return address is in fact the target * address. */ __get_kernel_nofault(&addr, regs->sp, unsigned long, Efault); *target = addr; addr -= fineibt_prefix_size; if (!exact_endbr((void *)addr)) return false; __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault); *type = (u32)regs->ax + hash; /* * The UD2 sites are constructed with a RET immediately following, * as such the non-fatal case can use the regular fixup. */ return true; Efault: return false; } static bool is_paranoid_thunk(unsigned long addr) { u32 thunk; __get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault); return (thunk & 0x00FFFFFF) == 0xfd75d6; Efault: return false; } /* * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[] * sequence, or to UDB + Jcc.d8 for cfi_paranoid + ITS thunk. */ static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type) { unsigned long addr = regs->ip - fineibt_paranoid_ud; if (!cfi_paranoid) return false; if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) { *target = regs->r11 + fineibt_prefix_size; *type = regs->ax; /* * Since the trapping instruction is the exact, but LOCK prefixed, * Jcc.d8 that got us here, the normal fixup will work. */ return true; } /* * The cfi_paranoid + ITS thunk combination results in: * * 0: b8 78 56 34 12 mov $0x12345678, %eax * 5: 41 3b 43 f7 cmp -11(%r11), %eax * a: 2e 3d 8d 5b f0 cs lea -0x10(%r11), %r11 * e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11 * * Where the paranoid_thunk looks like: * * 1d: <d6> udb * __x86_indirect_paranoid_thunk_r11: * 1e: 75 fd jne 1d * __x86_indirect_its_thunk_r11: * 20: 41 ff eb jmp *%r11 * 23: cc int3 * */ if (is_paranoid_thunk(regs->ip)) { *target = regs->r11 + fineibt_prefix_size; *type = regs->ax; regs->ip = *target; return true; } return false; } bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type) { if (decode_fineibt_paranoid(regs, target, type)) return true; if (decode_fineibt_bhi(regs, target, type)) return true; return decode_fineibt_preamble(regs, target, type); } #else /* !CONFIG_FINEIBT: */ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi, bool builtin) { if (IS_ENABLED(CONFIG_CFI) && builtin) pr_info("CFI: Using standard kCFI\n"); } #ifdef CONFIG_X86_KERNEL_IBT static void poison_cfi(void *addr) { } #endif #endif /* !CONFIG_FINEIBT */ void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi) { return __apply_fineibt(start_retpoline, end_retpoline, start_cfi, end_cfi, /* .builtin = */ false); } #ifdef CONFIG_SMP static void alternatives_smp_lock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) { const s32 *poff; for (poff = start; poff < end; poff++) { u8 *ptr = (u8 *)poff + *poff; if (!*poff || ptr < text || ptr >= text_end) continue; /* turn DS segment override prefix into lock prefix */ if (*ptr == 0x3e) text_poke(ptr, ((unsigned char []){0xf0}), 1); } } static void alternatives_smp_unlock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) { const s32 *poff; for (poff = start; poff < end; poff++) { u8 *ptr = (u8 *)poff + *poff; if (!*poff || ptr < text || ptr >= text_end) continue; /* turn lock prefix into DS segment override prefix */ if (*ptr == 0xf0) text_poke(ptr, ((unsigned char []){0x3E}), 1); } } struct smp_alt_module { /* what is this ??? */ struct module *mod; char *name; /* ptrs to lock prefixes */ const s32 *locks; const s32 *locks_end; /* .text segment, needed to avoid patching init code ;) */ u8 *text; u8 *text_end; struct list_head next; }; static LIST_HEAD(smp_alt_modules); static bool uniproc_patched = false; /* protected by text_mutex */ void __init_or_module alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end) { struct smp_alt_module *smp; mutex_lock(&text_mutex); if (!uniproc_patched) goto unlock; if (num_possible_cpus() == 1) /* Don't bother remembering, we'll never have to undo it. */ goto smp_unlock; smp = kzalloc_obj(*smp); if (NULL == smp) /* we'll run the (safe but slow) SMP code then ... */ goto unlock; smp->mod = mod; smp->name = name; smp->locks = locks; smp->locks_end = locks_end; smp->text = text; smp->text_end = text_end; DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n", smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); list_add_tail(&smp->next, &smp_alt_modules); smp_unlock: alternatives_smp_unlock(locks, locks_end, text, text_end); unlock: mutex_unlock(&text_mutex); } void __init_or_module alternatives_smp_module_del(struct module *mod) { struct smp_alt_module *item; mutex_lock(&text_mutex); list_for_each_entry(item, &smp_alt_modules, next) { if (mod != item->mod) continue; list_del(&item->next); kfree(item); break; } mutex_unlock(&text_mutex); } void alternatives_enable_smp(void) { struct smp_alt_module *mod; /* Why bother if there are no other CPUs? */ BUG_ON(num_possible_cpus() == 1); mutex_lock(&text_mutex); if (uniproc_patched) { pr_info("switching to SMP code\n"); BUG_ON(num_online_cpus() != 1); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_lock(mod->locks, mod->locks_end, mod->text, mod->text_end); uniproc_patched = false; } mutex_unlock(&text_mutex); } /* * Return 1 if the address range is reserved for SMP-alternatives. * Must hold text_mutex. */ int alternatives_text_reserved(void *start, void *end) { struct smp_alt_module *mod; const s32 *poff; u8 *text_start = start; u8 *text_end = end; lockdep_assert_held(&text_mutex); list_for_each_entry(mod, &smp_alt_modules, next) { if (mod->text > text_end || mod->text_end < text_start) continue; for (poff = mod->locks; poff < mod->locks_end; poff++) { const u8 *ptr = (const u8 *)poff + *poff; if (text_start <= ptr && text_end > ptr) return 1; } } return 0; } #endif /* CONFIG_SMP */ /* * Self-test for the INT3 based CALL emulation code. * * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up * properly and that there is a stack gap between the INT3 frame and the * previous context. Without this gap doing a virtual PUSH on the interrupted * stack would corrupt the INT3 IRET frame. * * See entry_{32,64}.S for more details. */ extern void int3_selftest_asm(unsigned int *ptr); asm ( " .pushsection .init.text, \"ax\", @progbits\n" " .type int3_selftest_asm, @function\n" "int3_selftest_asm:\n" ANNOTATE_NOENDBR "\n" /* * INT3 padded with NOP to CALL_INSN_SIZE. The INT3 triggers an * exception, then the int3_exception_nb notifier emulates a call to * int3_selftest_callee(). */ " int3; nop; nop; nop; nop\n" ASM_RET " .size int3_selftest_asm, . - int3_selftest_asm\n" " .popsection\n" ); extern void int3_selftest_callee(unsigned int *ptr); asm ( " .pushsection .init.text, \"ax\", @progbits\n" " .type int3_selftest_callee, @function\n" "int3_selftest_callee:\n" ANNOTATE_NOENDBR "\n" " movl $0x1234, (%" _ASM_ARG1 ")\n" ASM_RET " .size int3_selftest_callee, . - int3_selftest_callee\n" " .popsection\n" ); extern void int3_selftest_ip(void); /* defined in asm below */ static int __init int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) { unsigned long selftest = (unsigned long)&int3_selftest_asm; struct die_args *args = data; struct pt_regs *regs = args->regs; OPTIMIZER_HIDE_VAR(selftest); if (!regs || user_mode(regs)) return NOTIFY_DONE; if (val != DIE_INT3) return NOTIFY_DONE; if (regs->ip - INT3_INSN_SIZE != selftest) return NOTIFY_DONE; int3_emulate_call(regs, (unsigned long)&int3_selftest_callee); return NOTIFY_STOP; } /* Must be noinline to ensure uniqueness of int3_selftest_ip. */ static noinline void __init int3_selftest(void) { static __initdata struct notifier_block int3_exception_nb = { .notifier_call = int3_exception_notify, .priority = INT_MAX-1, /* last */ }; unsigned int val = 0; BUG_ON(register_die_notifier(&int3_exception_nb)); /* * Basically: int3_selftest_callee(&val); but really complicated :-) */ int3_selftest_asm(&val); BUG_ON(val != 0x1234); unregister_die_notifier(&int3_exception_nb); } static __initdata int __alt_reloc_selftest_addr; extern void __init __alt_reloc_selftest(void *arg); __visible noinline void __init __alt_reloc_selftest(void *arg) { WARN_ON(arg != &__alt_reloc_selftest_addr); } static noinline void __init alt_reloc_selftest(void) { /* * Tests text_poke_apply_relocation(). * * This has a relative immediate (CALL) in a place other than the first * instruction and additionally on x86_64 we get a RIP-relative LEA: * * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4 * * Getting this wrong will either crash and burn or tickle the WARN * above. */ asm_inline volatile ( ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS) : ASM_CALL_CONSTRAINT : [mem] "m" (__alt_reloc_selftest_addr) : _ASM_ARG1 ); } void __init alternative_instructions(void) { u64 ibt; int3_selftest(); /* * The patching is not fully atomic, so try to avoid local * interruptions that might execute the to be patched code. * Other CPUs are not running. */ stop_nmi(); /* * Don't stop machine check exceptions while patching. * MCEs only happen when something got corrupted and in this * case we must do something about the corruption. * Ignoring it is worse than an unlikely patching race. * Also machine checks tend to be broadcast and if one CPU * goes into machine check the others follow quickly, so we don't * expect a machine check to cause undue problems during to code * patching. */ /* * Make sure to set (artificial) features depending on used paravirt * functions which can later influence alternative patching. */ paravirt_set_cap(); /* Keep CET-IBT disabled until caller/callee are patched */ ibt = ibt_save(/*disable*/ true); __apply_fineibt(__retpoline_sites, __retpoline_sites_end, __cfi_sites, __cfi_sites_end, true); cfi_debug = false; /* * Rewrite the retpolines, must be done before alternatives since * those can rewrite the retpoline thunks. */ apply_retpolines(__retpoline_sites, __retpoline_sites_end); apply_returns(__return_sites, __return_sites_end); its_fini_core(); /* * Adjust all CALL instructions to point to func()-10, including * those in .altinstr_replacement. */ callthunks_patch_builtin_calls(); apply_alternatives(__alt_instructions, __alt_instructions_end); /* * Seal all functions that do not have their address taken. */ apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); ibt_restore(ibt); #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { uniproc_patched = true; alternatives_smp_module_add(NULL, "core kernel", __smp_locks, __smp_locks_end, _text, _etext); } if (!uniproc_patched || num_possible_cpus() == 1) { free_init_pages("SMP alternatives", (unsigned long)__smp_locks, (unsigned long)__smp_locks_end); } #endif restart_nmi(); alternatives_patched = 1; alt_reloc_selftest(); } /** * text_poke_early - Update instructions on a live kernel at boot time * @addr: address to modify * @opcode: source of the copy * @len: length to copy * * When you use this code to patch more than one byte of an instruction * you need to make sure that other CPUs cannot execute this code in parallel. * Also no thread must be currently preempted in the middle of these * instructions. And on the local CPU you need to be protected against NMI or * MCE handlers seeing an inconsistent instruction while you patch. */ void __init_or_module text_poke_early(void *addr, const void *opcode, size_t len) { unsigned long flags; if (boot_cpu_has(X86_FEATURE_NX) && is_module_text_address((unsigned long)addr)) { /* * Modules text is marked initially as non-executable, so the * code cannot be running and speculative code-fetches are * prevented. Just change the code. */ memcpy(addr, opcode, len); } else { local_irq_save(flags); memcpy(addr, opcode, len); sync_core(); local_irq_restore(flags); /* * Could also do a CLFLUSH here to speed up CPU recovery; but * that causes hangs on some VIA CPUs. */ } } __ro_after_init struct mm_struct *text_poke_mm; __ro_after_init unsigned long text_poke_mm_addr; /* * Text poking creates and uses a mapping in the lower half of the * address space. Relax LASS enforcement when accessing the poking * address. * * objtool enforces a strict policy of "no function calls within AC=1 * regions". Adhere to the policy by using inline versions of * memcpy()/memset() that will never result in a function call. */ static void text_poke_memcpy(void *dst, const void *src, size_t len) { lass_stac(); __inline_memcpy(dst, src, len); lass_clac(); } static void text_poke_memset(void *dst, const void *src, size_t len) { int c = *(const int *)src; lass_stac(); __inline_memset(dst, c, len); lass_clac(); } typedef void text_poke_f(void *dst, const void *src, size_t len); static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len) { bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; struct page *pages[2] = {NULL}; struct mm_struct *prev_mm; unsigned long flags; pte_t pte, *ptep; spinlock_t *ptl; pgprot_t pgprot; /* * While boot memory allocator is running we cannot use struct pages as * they are not yet initialized. There is no way to recover. */ BUG_ON(!after_bootmem); if (!core_kernel_text((unsigned long)addr)) { pages[0] = vmalloc_to_page(addr); if (cross_page_boundary) pages[1] = vmalloc_to_page(addr + PAGE_SIZE); } else { pages[0] = virt_to_page(addr); WARN_ON(!PageReserved(pages[0])); if (cross_page_boundary) pages[1] = virt_to_page(addr + PAGE_SIZE); } /* * If something went wrong, crash and burn since recovery paths are not * implemented. */ BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); /* * Map the page without the global bit, as TLB flushing is done with * flush_tlb_mm_range(), which is intended for non-global PTEs. */ pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL); /* * The lock is not really needed, but this allows to avoid open-coding. */ ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl); /* * This must not fail; preallocated in poking_init(). */ VM_BUG_ON(!ptep); local_irq_save(flags); pte = mk_pte(pages[0], pgprot); set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte); if (cross_page_boundary) { pte = mk_pte(pages[1], pgprot); set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte); } /* * Loading the temporary mm behaves as a compiler barrier, which * guarantees that the PTE will be set at the time memcpy() is done. */ prev_mm = use_temporary_mm(text_poke_mm); kasan_disable_current(); func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len); kasan_enable_current(); /* * Ensure that the PTE is only cleared after the instructions of memcpy * were issued by using a compiler barrier. */ barrier(); pte_clear(text_poke_mm, text_poke_mm_addr, ptep); if (cross_page_boundary) pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1); /* * Loading the previous page-table hierarchy requires a serializing * instruction that already allows the core to see the updated version. * Xen-PV is assumed to serialize execution in a similar manner. */ unuse_temporary_mm(prev_mm); /* * Flushing the TLB might involve IPIs, which would require enabled * IRQs, but not if the mm is not used, as it is in this point. */ flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, PAGE_SHIFT, false); if (func == text_poke_memcpy) { /* * If the text does not match what we just wrote then something is * fundamentally screwy; there's nothing we can really do about that. */ BUG_ON(memcmp(addr, src, len)); } local_irq_restore(flags); pte_unmap_unlock(ptep, ptl); return addr; } /** * text_poke - Update instructions on a live kernel * @addr: address to modify * @opcode: source of the copy * @len: length to copy * * Only atomic text poke/set should be allowed when not doing early patching. * It means the size must be writable atomically and the address must be aligned * in a way that permits an atomic write. It also makes sure we fit on a single * page. * * Note that the caller must ensure that if the modified code is part of a * module, the module would not be removed during poking. This can be achieved * by registering a module notifier, and ordering module removal and patching * through a mutex. */ void *text_poke(void *addr, const void *opcode, size_t len) { lockdep_assert_held(&text_mutex); return __text_poke(text_poke_memcpy, addr, opcode, len); } /** * text_poke_kgdb - Update instructions on a live kernel by kgdb * @addr: address to modify * @opcode: source of the copy * @len: length to copy * * Only atomic text poke/set should be allowed when not doing early patching. * It means the size must be writable atomically and the address must be aligned * in a way that permits an atomic write. It also makes sure we fit on a single * page. * * Context: should only be used by kgdb, which ensures no other core is running, * despite the fact it does not hold the text_mutex. */ void *text_poke_kgdb(void *addr, const void *opcode, size_t len) { return __text_poke(text_poke_memcpy, addr, opcode, len); } void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok) { unsigned long start = (unsigned long)addr; size_t patched = 0; if (WARN_ON_ONCE(!core_ok && core_kernel_text(start))) return NULL; while (patched < len) { unsigned long ptr = start + patched; size_t s; s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s); patched += s; } return addr; } /** * text_poke_copy - Copy instructions into (an unused part of) RX memory * @addr: address to modify * @opcode: source of the copy * @len: length to copy, could be more than 2x PAGE_SIZE * * Not safe against concurrent execution; useful for JITs to dump * new code blocks into unused regions of RX memory. Can be used in * conjunction with synchronize_rcu_tasks() to wait for existing * execution to quiesce after having made sure no existing functions * pointers are live. */ void *text_poke_copy(void *addr, const void *opcode, size_t len) { mutex_lock(&text_mutex); addr = text_poke_copy_locked(addr, opcode, len, false); mutex_unlock(&text_mutex); return addr; } /** * text_poke_set - memset into (an unused part of) RX memory * @addr: address to modify * @c: the byte to fill the area with * @len: length to copy, could be more than 2x PAGE_SIZE * * This is useful to overwrite unused regions of RX memory with illegal * instructions. */ void *text_poke_set(void *addr, int c, size_t len) { unsigned long start = (unsigned long)addr; size_t patched = 0; if (WARN_ON_ONCE(core_kernel_text(start))) return NULL; mutex_lock(&text_mutex); while (patched < len) { unsigned long ptr = start + patched; size_t s; s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s); patched += s; } mutex_unlock(&text_mutex); return addr; } static void do_sync_core(void *info) { sync_core(); } void smp_text_poke_sync_each_cpu(void) { on_each_cpu(do_sync_core, NULL, 1); } /* * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of * this thing. When len == 6 everything is prefixed with 0x0f and we map * opcode to Jcc.d8, using len to distinguish. */ struct smp_text_poke_loc { /* addr := _stext + rel_addr */ s32 rel_addr; s32 disp; u8 len; u8 opcode; const u8 text[TEXT_POKE_MAX_OPCODE_SIZE]; /* see smp_text_poke_batch_finish() */ u8 old; }; #define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc)) static struct smp_text_poke_array { struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX]; int nr_entries; } text_poke_array; static DEFINE_PER_CPU(atomic_t, text_poke_array_refs); /* * These four __always_inline annotations imply noinstr, necessary * due to smp_text_poke_int3_handler() being noinstr: */ static __always_inline bool try_get_text_poke_array(void) { atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); if (!raw_atomic_inc_not_zero(refs)) return false; return true; } static __always_inline void put_text_poke_array(void) { atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); smp_mb__before_atomic(); raw_atomic_dec(refs); } static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl) { return _stext + tpl->rel_addr; } static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b) { if (tpl_a < text_poke_addr(tpl_b)) return -1; if (tpl_a > text_poke_addr(tpl_b)) return 1; return 0; } noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) { struct smp_text_poke_loc *tpl; int ret = 0; void *ip; if (user_mode(regs)) return 0; /* * Having observed our INT3 instruction, we now must observe * text_poke_array with non-zero refcount: * * text_poke_array_refs = 1 INT3 * WMB RMB * write INT3 if (text_poke_array_refs != 0) */ smp_rmb(); if (!try_get_text_poke_array()) return 0; /* * Discount the INT3. See smp_text_poke_batch_finish(). */ ip = (void *) regs->ip - INT3_INSN_SIZE; /* * Skip the binary search if there is a single member in the vector. */ if (unlikely(text_poke_array.nr_entries > 1)) { tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries, sizeof(struct smp_text_poke_loc), patch_cmp); if (!tpl) goto out_put; } else { tpl = text_poke_array.vec; if (text_poke_addr(tpl) != ip) goto out_put; } ip += tpl->len; switch (tpl->opcode) { case INT3_INSN_OPCODE: /* * Someone poked an explicit INT3, they'll want to handle it, * do not consume. */ goto out_put; case RET_INSN_OPCODE: int3_emulate_ret(regs); break; case CALL_INSN_OPCODE: int3_emulate_call(regs, (long)ip + tpl->disp); break; case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: int3_emulate_jmp(regs, (long)ip + tpl->disp); break; case 0x70 ... 0x7f: /* Jcc */ int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp); break; default: BUG(); } ret = 1; out_put: put_text_poke_array(); return ret; } /** * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP * * Input state: * text_poke_array.vec: vector of instructions to patch * text_poke_array.nr_entries: number of entries in the vector * * Modify multi-byte instructions by using INT3 breakpoints on SMP. * We completely avoid using stop_machine() here, and achieve the * synchronization using INT3 breakpoints and SMP cross-calls. * * The way it is done: * - For each entry in the vector: * - add an INT3 trap to the address that will be patched * - SMP sync all CPUs * - For each entry in the vector: * - update all but the first byte of the patched range * - SMP sync all CPUs * - For each entry in the vector: * - replace the first byte (INT3) by the first byte of the * replacing opcode * - SMP sync all CPUs */ void smp_text_poke_batch_finish(void) { unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; int do_sync; if (!text_poke_array.nr_entries) return; lockdep_assert_held(&text_mutex); /* * Corresponds to the implicit memory barrier in try_get_text_poke_array() to * ensure reading a non-zero refcount provides up to date text_poke_array data. */ for_each_possible_cpu(i) atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1); /* * Function tracing can enable thousands of places that need to be * updated. This can take quite some time, and with full kernel debugging * enabled, this could cause the softlockup watchdog to trigger. * This function gets called every 256 entries added to be patched. * Call cond_resched() here to make sure that other tasks can get scheduled * while processing all the functions being patched. */ cond_resched(); /* * Corresponding read barrier in INT3 notifier for making sure the * text_poke_array.nr_entries and handler are correctly ordered wrt. patching. */ smp_wmb(); /* * First step: add a INT3 trap to the address that will be patched. */ for (i = 0; i < text_poke_array.nr_entries; i++) { text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]); text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE); } smp_text_poke_sync_each_cpu(); /* * Second step: update all but the first byte of the patched range. */ for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, }; u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1]; const u8 *new = text_poke_array.vec[i].text; int len = text_poke_array.vec[i].len; if (len - INT3_INSN_SIZE > 0) { memcpy(old + INT3_INSN_SIZE, text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE, len - INT3_INSN_SIZE); if (len == 6) { _new[0] = 0x0f; memcpy(_new + 1, new, 5); new = _new; } text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE, new + INT3_INSN_SIZE, len - INT3_INSN_SIZE); do_sync++; } /* * Emit a perf event to record the text poke, primarily to * support Intel PT decoding which must walk the executable code * to reconstruct the trace. The flow up to here is: * - write INT3 byte * - IPI-SYNC * - write instruction tail * At this point the actual control flow will be through the * INT3 and handler and not hit the old or new instruction. * Intel PT outputs FUP/TIP packets for the INT3, so the flow * can still be decoded. Subsequently: * - emit RECORD_TEXT_POKE with the new instruction * - IPI-SYNC * - write first byte * - IPI-SYNC * So before the text poke event timestamp, the decoder will see * either the old instruction flow or FUP/TIP of INT3. After the * text poke event timestamp, the decoder will see either the * new instruction flow or FUP/TIP of INT3. Thus decoders can * use the timestamp as the point at which to modify the * executable code. * The old instruction is recorded so that the event can be * processed forwards or backwards. */ perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len); } if (do_sync) { /* * According to Intel, this core syncing is very likely * not necessary and we'd be safe even without it. But * better safe than sorry (plus there's not only Intel). */ smp_text_poke_sync_each_cpu(); } /* * Third step: replace the first byte (INT3) by the first byte of the * replacing opcode. */ for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { u8 byte = text_poke_array.vec[i].text[0]; if (text_poke_array.vec[i].len == 6) byte = 0x0f; if (byte == INT3_INSN_OPCODE) continue; text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE); do_sync++; } if (do_sync) smp_text_poke_sync_each_cpu(); /* * Remove and wait for refs to be zero. * * Notably, if after step-3 above the INT3 got removed, then the * smp_text_poke_sync_each_cpu() will have serialized against any running INT3 * handlers and the below spin-wait will not happen. * * IOW. unless the replacement instruction is INT3, this case goes * unused. */ for_each_possible_cpu(i) { atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i); if (unlikely(!atomic_dec_and_test(refs))) atomic_cond_read_acquire(refs, !VAL); } /* They are all completed: */ text_poke_array.nr_entries = 0; } static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { struct smp_text_poke_loc *tpl; struct insn insn; int ret, i = 0; tpl = &text_poke_array.vec[text_poke_array.nr_entries++]; if (len == 6) i = 1; memcpy((void *)tpl->text, opcode+i, len-i); if (!emulate) emulate = opcode; ret = insn_decode_kernel(&insn, emulate); BUG_ON(ret < 0); tpl->rel_addr = addr - (void *)_stext; tpl->len = len; tpl->opcode = insn.opcode.bytes[0]; if (is_jcc32(&insn)) { /* * Map Jcc.d32 onto Jcc.d8 and use len to distinguish. */ tpl->opcode = insn.opcode.bytes[1] - 0x10; } switch (tpl->opcode) { case RET_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: /* * Control flow instructions without implied execution of the * next instruction can be padded with INT3. */ for (i = insn.length; i < len; i++) BUG_ON(tpl->text[i] != INT3_INSN_OPCODE); break; default: BUG_ON(len != insn.length); } switch (tpl->opcode) { case INT3_INSN_OPCODE: case RET_INSN_OPCODE: break; case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: case 0x70 ... 0x7f: /* Jcc */ tpl->disp = insn.immediate.value; break; default: /* assume NOP */ switch (len) { case 2: /* NOP2 -- emulate as JMP8+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); tpl->opcode = JMP8_INSN_OPCODE; tpl->disp = 0; break; case 5: /* NOP5 -- emulate as JMP32+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); tpl->opcode = JMP32_INSN_OPCODE; tpl->disp = 0; break; default: /* unknown instruction */ BUG(); } break; } } /* * We hard rely on the text_poke_array.vec being ordered; ensure this is so by flushing * early if needed. */ static bool text_poke_addr_ordered(void *addr) { WARN_ON_ONCE(!addr); if (!text_poke_array.nr_entries) return true; /* * If the last current entry's address is higher than the * new entry's address we'd like to add, then ordering * is violated and we must first flush all pending patching * requests: */ if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr) return false; return true; } /** * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched * @addr: address to patch * @opcode: opcode of new instruction * @len: length to copy * @emulate: instruction to be emulated * * Add a new instruction to the current queue of to-be-patched instructions * the kernel maintains. The patching request will not be executed immediately, * but becomes part of an array of patching requests, optimized for batched * execution. All pending patching requests will be executed on the next * smp_text_poke_batch_finish() call. */ void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr)) smp_text_poke_batch_finish(); __smp_text_poke_batch_add(addr, opcode, len, emulate); } /** * smp_text_poke_single() -- update instruction on live kernel on SMP immediately * @addr: address to patch * @opcode: opcode of new instruction * @len: length to copy * @emulate: instruction to be emulated * * Update a single instruction with the vector in the stack, avoiding * dynamically allocated memory. This function should be used when it is * not possible to allocate memory for a vector. The single instruction * is patched in immediately. */ void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate) { smp_text_poke_batch_add(addr, opcode, len, emulate); smp_text_poke_batch_finish(); }
10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 /* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. * Copyright (c) 2005-2017 Mellanox Technologies. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef RDMA_CORE_H #define RDMA_CORE_H #include <linux/idr.h> #include <rdma/uverbs_types.h> #include <rdma/uverbs_ioctl.h> #include <rdma/ib_verbs.h> #include <linux/mutex.h> struct ib_uverbs_device; void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile, enum rdma_remove_reason reason); int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs); /* * Get an ib_uobject that corresponds to the given id from ufile, assuming * the object is from the given type. Lock it to the required access when * applicable. * This function could create (access == NEW), destroy (access == DESTROY) * or unlock (access == READ || access == WRITE) objects if required. * The action will be finalized only when uverbs_finalize_object or * uverbs_finalize_objects are called. */ struct ib_uobject * uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access, s64 id, struct uverbs_attr_bundle *attrs); void uverbs_finalize_object(struct ib_uobject *uobj, enum uverbs_obj_access access, bool hw_obj_valid, bool commit, struct uverbs_attr_bundle *attrs); int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx); void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile); void release_ufile_idr_uobject(struct ib_uverbs_file *ufile); struct ib_udata *uverbs_get_cleared_udata(struct uverbs_attr_bundle *attrs); /* * This is the runtime description of the uverbs API, used by the syscall * machinery to validate and dispatch calls. */ /* * Depending on ID the slot pointer in the radix tree points at one of these * structs. */ struct uverbs_api_ioctl_method { int(__rcu *handler)(struct uverbs_attr_bundle *attrs); DECLARE_BITMAP(attr_mandatory, UVERBS_API_ATTR_BKEY_LEN); u16 bundle_size; u8 use_stack:1; u8 driver_method:1; u8 disabled:1; u8 has_udata:1; u8 key_bitmap_len; u8 destroy_bkey; }; struct uverbs_api_write_method { int (*handler)(struct uverbs_attr_bundle *attrs); u8 disabled:1; u8 is_ex:1; u8 has_udata:1; u8 has_resp:1; u8 req_size; u8 resp_size; }; struct uverbs_api_attr { struct uverbs_attr_spec spec; }; struct uverbs_api { /* radix tree contains struct uverbs_api_* pointers */ struct radix_tree_root radix; enum rdma_driver_id driver_id; unsigned int num_write; unsigned int num_write_ex; struct uverbs_api_write_method notsupp_method; const struct uverbs_api_write_method **write_methods; const struct uverbs_api_write_method **write_ex_methods; }; /* * Get an uverbs_api_object that corresponds to the given object_id. * Note: * -ENOMSG means that any object is allowed to match during lookup. */ static inline const struct uverbs_api_object * uapi_get_object(struct uverbs_api *uapi, u16 object_id) { const struct uverbs_api_object *res; if (object_id == UVERBS_IDR_ANY_OBJECT) return ERR_PTR(-ENOMSG); res = radix_tree_lookup(&uapi->radix, uapi_key_obj(object_id)); if (!res) return ERR_PTR(-ENOENT); return res; } char *uapi_key_format(char *S, unsigned int key); struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev); void uverbs_disassociate_api_pre(struct ib_uverbs_device *uverbs_dev); void uverbs_disassociate_api(struct uverbs_api *uapi); void uverbs_destroy_api(struct uverbs_api *uapi); void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm, unsigned int num_attrs); void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile); extern const struct uapi_definition uverbs_def_obj_async_fd[]; extern const struct uapi_definition uverbs_def_obj_counters[]; extern const struct uapi_definition uverbs_def_obj_cq[]; extern const struct uapi_definition uverbs_def_obj_device[]; extern const struct uapi_definition uverbs_def_obj_dm[]; extern const struct uapi_definition uverbs_def_obj_dmabuf[]; extern const struct uapi_definition uverbs_def_obj_dmah[]; extern const struct uapi_definition uverbs_def_obj_flow_action[]; extern const struct uapi_definition uverbs_def_obj_intf[]; extern const struct uapi_definition uverbs_def_obj_mr[]; extern const struct uapi_definition uverbs_def_obj_qp[]; extern const struct uapi_definition uverbs_def_obj_srq[]; extern const struct uapi_definition uverbs_def_obj_wq[]; extern const struct uapi_definition uverbs_def_write_intf[]; static inline const struct uverbs_api_write_method * uapi_get_method(const struct uverbs_api *uapi, u32 command) { u32 cmd_idx = command & IB_USER_VERBS_CMD_COMMAND_MASK; if (command & ~(u32)(IB_USER_VERBS_CMD_FLAG_EXTENDED | IB_USER_VERBS_CMD_COMMAND_MASK)) return ERR_PTR(-EINVAL); if (command & IB_USER_VERBS_CMD_FLAG_EXTENDED) { if (cmd_idx >= uapi->num_write_ex) return ERR_PTR(-EOPNOTSUPP); return uapi->write_ex_methods[cmd_idx]; } if (cmd_idx >= uapi->num_write) return ERR_PTR(-EOPNOTSUPP); return uapi->write_methods[cmd_idx]; } void uverbs_fill_udata(struct uverbs_attr_bundle *bundle, struct ib_udata *udata, unsigned int attr_in, unsigned int attr_out); #endif /* RDMA_CORE_H */
447 577 580 577 4768 4621 665 803 655 4764 4623 4630 225 4776 671 671 800 4774 6 6 3677 9 3677 465 464 463 448 464 464 579 579 577 577 6 446 451 448 6 447 448 674 447 675 4764 675 4774 4759 6 213 3675 3990 506 134 465 135 134 16 15 221 221 9 671 672 655 671 4623 4636 4634 4583 3534 4638 4638 4614 4626 1 6 6 6 6 6 6 213 472 448 671 672 669 671 671 8 8 8 8 8 9 9 9 9 9 9 9 9 9 8 8 664 212 212 212 212 657 447 212 9 6 14 663 673 663 9 6 675 6 671 664 662 8 672 237 236 7 231 212 19 225 221 6 217 6 213 217 217 217 217 217 217 217 132 16 115 119 118 119 119 119 16 151 152 133 135 136 151 118 150 152 151 16 465 16 466 466 467 465 466 465 466 467 463 16 1579 1581 1583 2 462 463 1 1582 467 1579 462 1583 1586 1582 466 461 213 213 213 213 213 212 213 213 213 579 578 656 213 217 217 6 213 213 213 213 213 448 447 448 447 449 16 16 16 457 675 8 16 16 16 16 159 8 150 16 16 158 8 150 465 8 8 8 471 466 464 471 475 8 8 8 8 8 8 225 8 8 8 8 8 217 217 8 448 448 448 448 446 448 449 448 449 448 448 449 449 447 4298 4294 4291 4300 14 1223 4297 14 4294 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 // SPDX-License-Identifier: GPL-2.0+ /* * Maple Tree implementation * Copyright (c) 2018-2022 Oracle Corporation * Authors: Liam R. Howlett <Liam.Howlett@oracle.com> * Matthew Wilcox <willy@infradead.org> * Copyright (c) 2023 ByteDance * Author: Peng Zhang <zhangpeng.00@bytedance.com> */ /* * DOC: Interesting implementation details of the Maple Tree * * Each node type has a number of slots for entries and a number of slots for * pivots. In the case of dense nodes, the pivots are implied by the position * and are simply the slot index + the minimum of the node. * * In regular B-Tree terms, pivots are called keys. The term pivot is used to * indicate that the tree is specifying ranges. Pivots may appear in the * subtree with an entry attached to the value whereas keys are unique to a * specific position of a B-tree. Pivot values are inclusive of the slot with * the same index. * * * The following illustrates the layout of a range64 nodes slots and pivots. * * * Slots -> | 0 | 1 | 2 | ... | 12 | 13 | 14 | 15 | * ┬ ┬ ┬ ┬ ┬ ┬ ┬ ┬ ┬ * │ │ │ │ │ │ │ │ └─ Implied maximum * │ │ │ │ │ │ │ └─ Pivot 14 * │ │ │ │ │ │ └─ Pivot 13 * │ │ │ │ │ └─ Pivot 12 * │ │ │ │ └─ Pivot 11 * │ │ │ └─ Pivot 2 * │ │ └─ Pivot 1 * │ └─ Pivot 0 * └─ Implied minimum * * Slot contents: * Internal (non-leaf) nodes contain pointers to other nodes. * Leaf nodes contain entries. * * The location of interest is often referred to as an offset. All offsets have * a slot, but the last offset has an implied pivot from the node above (or * UINT_MAX for the root node. * * Ranges complicate certain write activities. When modifying any of * the B-tree variants, it is known that one entry will either be added or * deleted. When modifying the Maple Tree, one store operation may overwrite * the entire data set, or one half of the tree, or the middle half of the tree. * */ #include <linux/maple_tree.h> #include <linux/xarray.h> #include <linux/types.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/limits.h> #include <asm/barrier.h> #define CREATE_TRACE_POINTS #include <trace/events/maple_tree.h> #define TP_FCT tracepoint_string(__func__) /* * Kernel pointer hashing renders much of the maple tree dump useless as tagged * pointers get hashed to arbitrary values. * * If CONFIG_DEBUG_VM_MAPLE_TREE is set we are in a debug mode where it is * permissible to bypass this. Otherwise remain cautious and retain the hashing. * * Userland doesn't know about %px so also use %p there. */ #if defined(__KERNEL__) && defined(CONFIG_DEBUG_VM_MAPLE_TREE) #define PTR_FMT "%px" #else #define PTR_FMT "%p" #endif #define MA_ROOT_PARENT 1 /* * Maple state flags * * MA_STATE_PREALLOC - Preallocated nodes, WARN_ON allocation */ #define MA_STATE_PREALLOC 1 #define ma_parent_ptr(x) ((struct maple_pnode *)(x)) #define mas_tree_parent(x) ((unsigned long)(x->tree) | MA_ROOT_PARENT) #define ma_mnode_ptr(x) ((struct maple_node *)(x)) #define ma_enode_ptr(x) ((struct maple_enode *)(x)) static struct kmem_cache *maple_node_cache; #ifdef CONFIG_DEBUG_MAPLE_TREE static const unsigned long mt_max[] = { [maple_dense] = MAPLE_NODE_SLOTS, [maple_leaf_64] = ULONG_MAX, [maple_range_64] = ULONG_MAX, [maple_arange_64] = ULONG_MAX, }; #define mt_node_max(x) mt_max[mte_node_type(x)] #endif static const unsigned char mt_slots[] = { [maple_dense] = MAPLE_NODE_SLOTS, [maple_leaf_64] = MAPLE_RANGE64_SLOTS, [maple_range_64] = MAPLE_RANGE64_SLOTS, [maple_arange_64] = MAPLE_ARANGE64_SLOTS, }; #define mt_slot_count(x) mt_slots[mte_node_type(x)] static const unsigned char mt_pivots[] = { [maple_dense] = 0, [maple_leaf_64] = MAPLE_RANGE64_SLOTS - 1, [maple_range_64] = MAPLE_RANGE64_SLOTS - 1, [maple_arange_64] = MAPLE_ARANGE64_SLOTS - 1, }; #define mt_pivot_count(x) mt_pivots[mte_node_type(x)] static const unsigned char mt_min_slots[] = { [maple_dense] = MAPLE_NODE_SLOTS / 2, [maple_leaf_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_range_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_arange_64] = (MAPLE_ARANGE64_SLOTS / 2) - 1, }; #define mt_min_slot_count(x) mt_min_slots[mte_node_type(x)] #define MAPLE_BIG_NODE_SLOTS (MAPLE_RANGE64_SLOTS * 2 + 2) #define MAPLE_BIG_NODE_GAPS (MAPLE_ARANGE64_SLOTS * 2 + 1) struct maple_big_node { unsigned long pivot[MAPLE_BIG_NODE_SLOTS - 1]; union { struct maple_enode *slot[MAPLE_BIG_NODE_SLOTS]; struct { unsigned long padding[MAPLE_BIG_NODE_GAPS]; unsigned long gap[MAPLE_BIG_NODE_GAPS]; }; }; unsigned char b_end; enum maple_type type; }; /* * The maple_subtree_state is used to build a tree to replace a segment of an * existing tree in a more atomic way. Any walkers of the older tree will hit a * dead node and restart on updates. */ struct maple_subtree_state { struct ma_state *orig_l; /* Original left side of subtree */ struct ma_state *orig_r; /* Original right side of subtree */ struct ma_state *l; /* New left side of subtree */ struct ma_state *m; /* New middle of subtree (rare) */ struct ma_state *r; /* New right side of subtree */ struct ma_topiary *free; /* nodes to be freed */ struct ma_topiary *destroy; /* Nodes to be destroyed (walked and freed) */ struct maple_big_node *bn; }; #ifdef CONFIG_KASAN_STACK /* Prevent mas_wr_bnode() from exceeding the stack frame limit */ #define noinline_for_kasan noinline_for_stack #else #define noinline_for_kasan inline #endif /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { return kmem_cache_alloc(maple_node_cache, gfp); } static inline void mt_free_bulk(size_t size, void __rcu **nodes) { kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); } static void mt_return_sheaf(struct slab_sheaf *sheaf) { kmem_cache_return_sheaf(maple_node_cache, GFP_NOWAIT, sheaf); } static struct slab_sheaf *mt_get_sheaf(gfp_t gfp, int count) { return kmem_cache_prefill_sheaf(maple_node_cache, gfp, count); } static int mt_refill_sheaf(gfp_t gfp, struct slab_sheaf **sheaf, unsigned int size) { return kmem_cache_refill_sheaf(maple_node_cache, gfp, sheaf, size); } /* * ma_free_rcu() - Use rcu callback to free a maple node * @node: The node to free * * The maple tree uses the parent pointer to indicate this node is no longer in * use and will be freed. */ static void ma_free_rcu(struct maple_node *node) { WARN_ON(node->parent != ma_parent_ptr(node)); kfree_rcu(node, rcu); } static void mt_set_height(struct maple_tree *mt, unsigned char height) { unsigned int new_flags = mt->ma_flags; new_flags &= ~MT_FLAGS_HEIGHT_MASK; MT_BUG_ON(mt, height > MAPLE_HEIGHT_MAX); new_flags |= height << MT_FLAGS_HEIGHT_OFFSET; mt->ma_flags = new_flags; } static unsigned int mas_mt_height(struct ma_state *mas) { return mt_height(mas->tree); } static inline unsigned int mt_attr(struct maple_tree *mt) { return mt->ma_flags & ~MT_FLAGS_HEIGHT_MASK; } static __always_inline enum maple_type mte_node_type( const struct maple_enode *entry) { return ((unsigned long)entry >> MAPLE_NODE_TYPE_SHIFT) & MAPLE_NODE_TYPE_MASK; } static __always_inline bool ma_is_dense(const enum maple_type type) { return type < maple_leaf_64; } static __always_inline bool ma_is_leaf(const enum maple_type type) { return type < maple_range_64; } static __always_inline bool mte_is_leaf(const struct maple_enode *entry) { return ma_is_leaf(mte_node_type(entry)); } /* * We also reserve values with the bottom two bits set to '10' which are * below 4096 */ static __always_inline bool mt_is_reserved(const void *entry) { return ((unsigned long)entry < MAPLE_RESERVED_RANGE) && xa_is_internal(entry); } static __always_inline void mas_set_err(struct ma_state *mas, long err) { mas->node = MA_ERROR(err); mas->status = ma_error; } static __always_inline bool mas_is_ptr(const struct ma_state *mas) { return mas->status == ma_root; } static __always_inline bool mas_is_start(const struct ma_state *mas) { return mas->status == ma_start; } static __always_inline bool mas_is_none(const struct ma_state *mas) { return mas->status == ma_none; } static __always_inline bool mas_is_paused(const struct ma_state *mas) { return mas->status == ma_pause; } static __always_inline bool mas_is_overflow(struct ma_state *mas) { return mas->status == ma_overflow; } static inline bool mas_is_underflow(struct ma_state *mas) { return mas->status == ma_underflow; } static __always_inline struct maple_node *mte_to_node( const struct maple_enode *entry) { return (struct maple_node *)((unsigned long)entry & ~MAPLE_NODE_MASK); } /* * mte_to_mat() - Convert a maple encoded node to a maple topiary node. * @entry: The maple encoded node * * Return: a maple topiary pointer */ static inline struct maple_topiary *mte_to_mat(const struct maple_enode *entry) { return (struct maple_topiary *) ((unsigned long)entry & ~MAPLE_NODE_MASK); } /* * mas_mn() - Get the maple state node. * @mas: The maple state * * Return: the maple node (not encoded - bare pointer). */ static inline struct maple_node *mas_mn(const struct ma_state *mas) { return mte_to_node(mas->node); } /* * mte_set_node_dead() - Set a maple encoded node as dead. * @mn: The maple encoded node. */ static inline void mte_set_node_dead(struct maple_enode *mn) { mte_to_node(mn)->parent = ma_parent_ptr(mte_to_node(mn)); smp_wmb(); /* Needed for RCU */ } /* Bit 1 indicates the root is a node */ #define MAPLE_ROOT_NODE 0x02 /* maple_type stored bit 3-6 */ #define MAPLE_ENODE_TYPE_SHIFT 0x03 /* Bit 2 means a NULL somewhere below */ #define MAPLE_ENODE_NULL 0x04 static inline struct maple_enode *mt_mk_node(const struct maple_node *node, enum maple_type type) { return (void *)((unsigned long)node | (type << MAPLE_ENODE_TYPE_SHIFT) | MAPLE_ENODE_NULL); } static inline void *mte_mk_root(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ROOT_NODE); } static inline void *mte_safe_root(const struct maple_enode *node) { return (void *)((unsigned long)node & ~MAPLE_ROOT_NODE); } static inline void __maybe_unused *mte_set_full(const struct maple_enode *node) { return (void *)((unsigned long)node & ~MAPLE_ENODE_NULL); } static inline void __maybe_unused *mte_clear_full(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ENODE_NULL); } static inline bool __maybe_unused mte_has_null(const struct maple_enode *node) { return (unsigned long)node & MAPLE_ENODE_NULL; } static __always_inline bool ma_is_root(struct maple_node *node) { return ((unsigned long)node->parent & MA_ROOT_PARENT); } static __always_inline bool mte_is_root(const struct maple_enode *node) { return ma_is_root(mte_to_node(node)); } static inline bool mas_is_root_limits(const struct ma_state *mas) { return !mas->min && mas->max == ULONG_MAX; } static __always_inline bool mt_is_alloc(struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_ALLOC_RANGE); } /* * The Parent Pointer * Excluding root, the parent pointer is 256B aligned like all other tree nodes. * When storing a 32 or 64 bit values, the offset can fit into 5 bits. The 16 * bit values need an extra bit to store the offset. This extra bit comes from * a reuse of the last bit in the node type. This is possible by using bit 1 to * indicate if bit 2 is part of the type or the slot. * * Node types: * 0b??1 = Root * 0b?00 = 16 bit nodes * 0b010 = 32 bit nodes * 0b110 = 64 bit nodes * * Slot size and alignment * 0b??1 : Root * 0b?00 : 16 bit values, type in 0-1, slot in 2-7 * 0b010 : 32 bit values, type in 0-2, slot in 3-7 * 0b110 : 64 bit values, type in 0-2, slot in 3-7 */ #define MAPLE_PARENT_ROOT 0x01 #define MAPLE_PARENT_SLOT_SHIFT 0x03 #define MAPLE_PARENT_SLOT_MASK 0xF8 #define MAPLE_PARENT_16B_SLOT_SHIFT 0x02 #define MAPLE_PARENT_16B_SLOT_MASK 0xFC #define MAPLE_PARENT_RANGE64 0x06 #define MAPLE_PARENT_RANGE32 0x02 #define MAPLE_PARENT_NOT_RANGE16 0x02 /* * mte_parent_shift() - Get the parent shift for the slot storage. * @parent: The parent pointer cast as an unsigned long * Return: The shift into that pointer to the star to of the slot */ static inline unsigned long mte_parent_shift(unsigned long parent) { /* Note bit 1 == 0 means 16B */ if (likely(parent & MAPLE_PARENT_NOT_RANGE16)) return MAPLE_PARENT_SLOT_SHIFT; return MAPLE_PARENT_16B_SLOT_SHIFT; } /* * mte_parent_slot_mask() - Get the slot mask for the parent. * @parent: The parent pointer cast as an unsigned long. * Return: The slot mask for that parent. */ static inline unsigned long mte_parent_slot_mask(unsigned long parent) { /* Note bit 1 == 0 means 16B */ if (likely(parent & MAPLE_PARENT_NOT_RANGE16)) return MAPLE_PARENT_SLOT_MASK; return MAPLE_PARENT_16B_SLOT_MASK; } /* * mas_parent_type() - Return the maple_type of the parent from the stored * parent type. * @mas: The maple state * @enode: The maple_enode to extract the parent's enum * Return: The node->parent maple_type */ static inline enum maple_type mas_parent_type(struct ma_state *mas, struct maple_enode *enode) { unsigned long p_type; p_type = (unsigned long)mte_to_node(enode)->parent; if (WARN_ON(p_type & MAPLE_PARENT_ROOT)) return 0; p_type &= MAPLE_NODE_MASK; p_type &= ~mte_parent_slot_mask(p_type); switch (p_type) { case MAPLE_PARENT_RANGE64: /* or MAPLE_PARENT_ARANGE64 */ if (mt_is_alloc(mas->tree)) return maple_arange_64; return maple_range_64; } return 0; } /* * mas_set_parent() - Set the parent node and encode the slot * @mas: The maple state * @enode: The encoded maple node. * @parent: The encoded maple node that is the parent of @enode. * @slot: The slot that @enode resides in @parent. * * Slot number is encoded in the enode->parent bit 3-6 or 2-6, depending on the * parent type. */ static inline void mas_set_parent(struct ma_state *mas, struct maple_enode *enode, const struct maple_enode *parent, unsigned char slot) { unsigned long val = (unsigned long)parent; unsigned long shift; unsigned long type; enum maple_type p_type = mte_node_type(parent); MAS_BUG_ON(mas, p_type == maple_dense); MAS_BUG_ON(mas, p_type == maple_leaf_64); switch (p_type) { case maple_range_64: case maple_arange_64: shift = MAPLE_PARENT_SLOT_SHIFT; type = MAPLE_PARENT_RANGE64; break; default: case maple_dense: case maple_leaf_64: shift = type = 0; break; } val &= ~MAPLE_NODE_MASK; /* Clear all node metadata in parent */ val |= (slot << shift) | type; mte_to_node(enode)->parent = ma_parent_ptr(val); } /* * mte_parent_slot() - get the parent slot of @enode. * @enode: The encoded maple node. * * Return: The slot in the parent node where @enode resides. */ static __always_inline unsigned int mte_parent_slot(const struct maple_enode *enode) { unsigned long val = (unsigned long)mte_to_node(enode)->parent; if (unlikely(val & MA_ROOT_PARENT)) return 0; /* * Okay to use MAPLE_PARENT_16B_SLOT_MASK as the last bit will be lost * by shift if the parent shift is MAPLE_PARENT_SLOT_SHIFT */ return (val & MAPLE_PARENT_16B_SLOT_MASK) >> mte_parent_shift(val); } /* * mte_parent() - Get the parent of @node. * @enode: The encoded maple node. * * Return: The parent maple node. */ static __always_inline struct maple_node *mte_parent(const struct maple_enode *enode) { return (void *)((unsigned long) (mte_to_node(enode)->parent) & ~MAPLE_NODE_MASK); } /* * ma_dead_node() - check if the @enode is dead. * @enode: The encoded maple node * * Return: true if dead, false otherwise. */ static __always_inline bool ma_dead_node(const struct maple_node *node) { struct maple_node *parent; /* Do not reorder reads from the node prior to the parent check */ smp_rmb(); parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK); return (parent == node); } /* * mte_dead_node() - check if the @enode is dead. * @enode: The encoded maple node * * Return: true if dead, false otherwise. */ static __always_inline bool mte_dead_node(const struct maple_enode *enode) { struct maple_node *node; node = mte_to_node(enode); return ma_dead_node(node); } /* * ma_pivots() - Get a pointer to the maple node pivots. * @node: the maple node * @type: the node type * * In the event of a dead node, this array may be %NULL * * Return: A pointer to the maple node pivots */ static inline unsigned long *ma_pivots(struct maple_node *node, enum maple_type type) { switch (type) { case maple_arange_64: return node->ma64.pivot; case maple_range_64: case maple_leaf_64: return node->mr64.pivot; case maple_dense: return NULL; } return NULL; } /* * ma_gaps() - Get a pointer to the maple node gaps. * @node: the maple node * @type: the node type * * Return: A pointer to the maple node gaps */ static inline unsigned long *ma_gaps(struct maple_node *node, enum maple_type type) { switch (type) { case maple_arange_64: return node->ma64.gap; case maple_range_64: case maple_leaf_64: case maple_dense: return NULL; } return NULL; } /* * mas_safe_pivot() - get the pivot at @piv or mas->max. * @mas: The maple state * @pivots: The pointer to the maple node pivots * @piv: The pivot to fetch * @type: The maple node type * * Return: The pivot at @piv within the limit of the @pivots array, @mas->max * otherwise. */ static __always_inline unsigned long mas_safe_pivot(const struct ma_state *mas, unsigned long *pivots, unsigned char piv, enum maple_type type) { if (piv >= mt_pivots[type]) return mas->max; return pivots[piv]; } /* * mas_safe_min() - Return the minimum for a given offset. * @mas: The maple state * @pivots: The pointer to the maple node pivots * @offset: The offset into the pivot array * * Return: The minimum range value that is contained in @offset. */ static inline unsigned long mas_safe_min(struct ma_state *mas, unsigned long *pivots, unsigned char offset) { if (likely(offset)) return pivots[offset - 1] + 1; return mas->min; } /* * mte_set_pivot() - Set a pivot to a value in an encoded maple node. * @mn: The encoded maple node * @piv: The pivot offset * @val: The value of the pivot */ static inline void mte_set_pivot(struct maple_enode *mn, unsigned char piv, unsigned long val) { struct maple_node *node = mte_to_node(mn); enum maple_type type = mte_node_type(mn); BUG_ON(piv >= mt_pivots[type]); switch (type) { case maple_range_64: case maple_leaf_64: node->mr64.pivot[piv] = val; break; case maple_arange_64: node->ma64.pivot[piv] = val; break; case maple_dense: break; } } /* * ma_slots() - Get a pointer to the maple node slots. * @mn: The maple node * @mt: The maple node type * * Return: A pointer to the maple node slots */ static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt) { switch (mt) { case maple_arange_64: return mn->ma64.slot; case maple_range_64: case maple_leaf_64: return mn->mr64.slot; case maple_dense: return mn->slot; } return NULL; } static inline bool mt_write_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? mt_write_lock_is_held(mt) : lockdep_is_held(&mt->ma_lock); } static __always_inline bool mt_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? mt_lock_is_held(mt) : lockdep_is_held(&mt->ma_lock); } static __always_inline void *mt_slot(const struct maple_tree *mt, void __rcu **slots, unsigned char offset) { return rcu_dereference_check(slots[offset], mt_locked(mt)); } static __always_inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots, unsigned char offset) { return rcu_dereference_protected(slots[offset], mt_write_locked(mt)); } /* * mas_slot_locked() - Get the slot value when holding the maple tree lock. * @mas: The maple state * @slots: The pointer to the slots * @offset: The offset into the slots array to fetch * * Return: The entry stored in @slots at the @offset. */ static __always_inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots, unsigned char offset) { return mt_slot_locked(mas->tree, slots, offset); } /* * mas_slot() - Get the slot value when not holding the maple tree lock. * @mas: The maple state * @slots: The pointer to the slots * @offset: The offset into the slots array to fetch * * Return: The entry stored in @slots at the @offset */ static __always_inline void *mas_slot(struct ma_state *mas, void __rcu **slots, unsigned char offset) { return mt_slot(mas->tree, slots, offset); } /* * mas_root() - Get the maple tree root. * @mas: The maple state. * * Return: The pointer to the root of the tree */ static __always_inline void *mas_root(struct ma_state *mas) { return rcu_dereference_check(mas->tree->ma_root, mt_locked(mas->tree)); } static inline void *mt_root_locked(struct maple_tree *mt) { return rcu_dereference_protected(mt->ma_root, mt_write_locked(mt)); } /* * mas_root_locked() - Get the maple tree root when holding the maple tree lock. * @mas: The maple state. * * Return: The pointer to the root of the tree */ static inline void *mas_root_locked(struct ma_state *mas) { return mt_root_locked(mas->tree); } static inline struct maple_metadata *ma_meta(struct maple_node *mn, enum maple_type mt) { switch (mt) { case maple_arange_64: return &mn->ma64.meta; default: return &mn->mr64.meta; } } /* * ma_set_meta() - Set the metadata information of a node. * @mn: The maple node * @mt: The maple node type * @offset: The offset of the highest sub-gap in this node. * @end: The end of the data in this node. */ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt, unsigned char offset, unsigned char end) { struct maple_metadata *meta = ma_meta(mn, mt); meta->gap = offset; meta->end = end; } /* * mt_clear_meta() - clear the metadata information of a node, if it exists * @mt: The maple tree * @mn: The maple node * @type: The maple node type */ static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn, enum maple_type type) { struct maple_metadata *meta; unsigned long *pivots; void __rcu **slots; void *next; switch (type) { case maple_range_64: pivots = mn->mr64.pivot; if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) { slots = mn->mr64.slot; next = mt_slot_locked(mt, slots, MAPLE_RANGE64_SLOTS - 1); if (unlikely((mte_to_node(next) && mte_node_type(next)))) return; /* no metadata, could be node */ } fallthrough; case maple_arange_64: meta = ma_meta(mn, type); break; default: return; } meta->gap = 0; meta->end = 0; } /* * ma_meta_end() - Get the data end of a node from the metadata * @mn: The maple node * @mt: The maple node type */ static inline unsigned char ma_meta_end(struct maple_node *mn, enum maple_type mt) { struct maple_metadata *meta = ma_meta(mn, mt); return meta->end; } /* * ma_meta_gap() - Get the largest gap location of a node from the metadata * @mn: The maple node */ static inline unsigned char ma_meta_gap(struct maple_node *mn) { return mn->ma64.meta.gap; } /* * ma_set_meta_gap() - Set the largest gap location in a nodes metadata * @mn: The maple node * @mt: The maple node type * @offset: The location of the largest gap. */ static inline void ma_set_meta_gap(struct maple_node *mn, enum maple_type mt, unsigned char offset) { struct maple_metadata *meta = ma_meta(mn, mt); meta->gap = offset; } /* * mat_add() - Add a @dead_enode to the ma_topiary of a list of dead nodes. * @mat: the ma_topiary, a linked list of dead nodes. * @dead_enode: the node to be marked as dead and added to the tail of the list * * Add the @dead_enode to the linked list in @mat. */ static inline void mat_add(struct ma_topiary *mat, struct maple_enode *dead_enode) { mte_set_node_dead(dead_enode); mte_to_mat(dead_enode)->next = NULL; if (!mat->tail) { mat->tail = mat->head = dead_enode; return; } mte_to_mat(mat->tail)->next = dead_enode; mat->tail = dead_enode; } static void mt_free_walk(struct rcu_head *head); static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free); /* * mas_mat_destroy() - Free all nodes and subtrees in a dead list. * @mas: the maple state * @mat: the ma_topiary linked list of dead nodes to free. * * Destroy walk a dead list. */ static void mas_mat_destroy(struct ma_state *mas, struct ma_topiary *mat) { struct maple_enode *next; struct maple_node *node; bool in_rcu = mt_in_rcu(mas->tree); while (mat->head) { next = mte_to_mat(mat->head)->next; node = mte_to_node(mat->head); mt_destroy_walk(mat->head, mas->tree, !in_rcu); if (in_rcu) call_rcu(&node->rcu, mt_free_walk); mat->head = next; } } /* * mas_descend() - Descend into the slot stored in the ma_state. * @mas: the maple state. * * Note: Not RCU safe, only use in write side or debug code. */ static inline void mas_descend(struct ma_state *mas) { enum maple_type type; unsigned long *pivots; struct maple_node *node; void __rcu **slots; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); slots = ma_slots(node, type); if (mas->offset) mas->min = pivots[mas->offset - 1] + 1; mas->max = mas_safe_pivot(mas, pivots, mas->offset, type); mas->node = mas_slot(mas, slots, mas->offset); } /* * mas_ascend() - Walk up a level of the tree. * @mas: The maple state * * Sets the @mas->max and @mas->min for the parent node of mas->node. This * may cause several levels of walking up to find the correct min and max. * May find a dead node which will cause a premature return. * Return: 1 on dead node, 0 otherwise */ static int mas_ascend(struct ma_state *mas) { struct maple_enode *p_enode; /* parent enode. */ struct maple_enode *a_enode; /* ancestor enode. */ struct maple_node *a_node; /* ancestor node. */ struct maple_node *p_node; /* parent node. */ unsigned char a_slot; enum maple_type a_type; unsigned long min, max; unsigned long *pivots; bool set_max = false, set_min = false; a_node = mas_mn(mas); if (ma_is_root(a_node)) { mas->offset = 0; return 0; } p_node = mte_parent(mas->node); if (unlikely(a_node == p_node)) return 1; a_type = mas_parent_type(mas, mas->node); mas->offset = mte_parent_slot(mas->node); a_enode = mt_mk_node(p_node, a_type); /* Check to make sure all parent information is still accurate */ if (p_node != mte_parent(mas->node)) return 1; mas->node = a_enode; if (mte_is_root(a_enode)) { mas->max = ULONG_MAX; mas->min = 0; return 0; } min = 0; max = ULONG_MAX; /* * !mas->offset implies that parent node min == mas->min. * mas->offset > 0 implies that we need to walk up to find the * implied pivot min. */ if (!mas->offset) { min = mas->min; set_min = true; } if (mas->max == ULONG_MAX) set_max = true; do { p_enode = a_enode; a_type = mas_parent_type(mas, p_enode); a_node = mte_parent(p_enode); a_slot = mte_parent_slot(p_enode); a_enode = mt_mk_node(a_node, a_type); pivots = ma_pivots(a_node, a_type); if (unlikely(ma_dead_node(a_node))) return 1; if (!set_min && a_slot) { set_min = true; min = pivots[a_slot - 1] + 1; } if (!set_max && a_slot < mt_pivots[a_type]) { set_max = true; max = pivots[a_slot]; } if (unlikely(ma_dead_node(a_node))) return 1; if (unlikely(ma_is_root(a_node))) break; } while (!set_min || !set_max); mas->max = max; mas->min = min; return 0; } /* * mas_pop_node() - Get a previously allocated maple node from the maple state. * @mas: The maple state * * Return: A pointer to a maple node. */ static __always_inline struct maple_node *mas_pop_node(struct ma_state *mas) { struct maple_node *ret; if (mas->alloc) { ret = mas->alloc; mas->alloc = NULL; goto out; } if (WARN_ON_ONCE(!mas->sheaf)) return NULL; ret = kmem_cache_alloc_from_sheaf(maple_node_cache, GFP_NOWAIT, mas->sheaf); out: memset(ret, 0, sizeof(*ret)); return ret; } /* * mas_alloc_nodes() - Allocate nodes into a maple state * @mas: The maple state * @gfp: The GFP Flags */ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { if (!mas->node_request) return; if (mas->node_request == 1) { if (mas->sheaf) goto use_sheaf; if (mas->alloc) return; mas->alloc = mt_alloc_one(gfp); if (!mas->alloc) goto error; mas->node_request = 0; return; } use_sheaf: if (unlikely(mas->alloc)) { kfree(mas->alloc); mas->alloc = NULL; } if (mas->sheaf) { unsigned long refill; refill = mas->node_request; if (kmem_cache_sheaf_size(mas->sheaf) >= refill) { mas->node_request = 0; return; } if (mt_refill_sheaf(gfp, &mas->sheaf, refill)) goto error; mas->node_request = 0; return; } mas->sheaf = mt_get_sheaf(gfp, mas->node_request); if (likely(mas->sheaf)) { mas->node_request = 0; return; } error: mas_set_err(mas, -ENOMEM); } static inline void mas_empty_nodes(struct ma_state *mas) { mas->node_request = 0; if (mas->sheaf) { mt_return_sheaf(mas->sheaf); mas->sheaf = NULL; } if (mas->alloc) { kfree(mas->alloc); mas->alloc = NULL; } } /* * mas_free() - Free an encoded maple node * @mas: The maple state * @used: The encoded maple node to free. * * Uses rcu free if necessary, pushes @used back on the maple state allocations * otherwise. */ static inline void mas_free(struct ma_state *mas, struct maple_enode *used) { ma_free_rcu(mte_to_node(used)); } /* * mas_start() - Sets up maple state for operations. * @mas: The maple state. * * If mas->status == ma_start, then set the min, max and depth to * defaults. * * Return: * - If mas->node is an error or not mas_start, return NULL. * - If it's an empty tree: NULL & mas->status == ma_none * - If it's a single entry: The entry & mas->status == ma_root * - If it's a tree: NULL & mas->status == ma_active */ static inline struct maple_enode *mas_start(struct ma_state *mas) { if (likely(mas_is_start(mas))) { struct maple_enode *root; mas->min = 0; mas->max = ULONG_MAX; retry: mas->depth = 0; root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 0; mas->status = ma_active; mas->node = mte_safe_root(root); mas->offset = 0; if (mte_dead_node(mas->node)) goto retry; return NULL; } mas->node = NULL; /* empty tree */ if (unlikely(!root)) { mas->status = ma_none; mas->offset = MAPLE_NODE_SLOTS; return NULL; } /* Single entry tree */ mas->status = ma_root; mas->offset = MAPLE_NODE_SLOTS; /* Single entry tree. */ if (mas->index > 0) return NULL; return root; } return NULL; } /* * ma_data_end() - Find the end of the data in a node. * @node: The maple node * @type: The maple node type * @pivots: The array of pivots in the node * @max: The maximum value in the node * * Uses metadata to find the end of the data when possible. * Return: The zero indexed last slot with data (may be null). */ static __always_inline unsigned char ma_data_end(struct maple_node *node, enum maple_type type, unsigned long *pivots, unsigned long max) { unsigned char offset; if (!pivots) return 0; if (type == maple_arange_64) return ma_meta_end(node, type); offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); if (likely(pivots[offset] == max)) return offset; return mt_pivots[type]; } /* * mas_data_end() - Find the end of the data (slot). * @mas: the maple state * * This method is optimized to check the metadata of a node if the node type * supports data end metadata. * * Return: The zero indexed last slot with data (may be null). */ static inline unsigned char mas_data_end(struct ma_state *mas) { enum maple_type type; struct maple_node *node; unsigned char offset; unsigned long *pivots; type = mte_node_type(mas->node); node = mas_mn(mas); if (type == maple_arange_64) return ma_meta_end(node, type); pivots = ma_pivots(node, type); if (unlikely(ma_dead_node(node))) return 0; offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); if (likely(pivots[offset] == mas->max)) return offset; return mt_pivots[type]; } /* * mas_leaf_max_gap() - Returns the largest gap in a leaf node * @mas: the maple state * * Return: The maximum gap in the leaf. */ static unsigned long mas_leaf_max_gap(struct ma_state *mas) { enum maple_type mt; unsigned long pstart, gap, max_gap; struct maple_node *mn; unsigned long *pivots; void __rcu **slots; unsigned char i; unsigned char max_piv; mt = mte_node_type(mas->node); mn = mas_mn(mas); slots = ma_slots(mn, mt); max_gap = 0; if (unlikely(ma_is_dense(mt))) { gap = 0; for (i = 0; i < mt_slots[mt]; i++) { if (slots[i]) { if (gap > max_gap) max_gap = gap; gap = 0; } else { gap++; } } if (gap > max_gap) max_gap = gap; return max_gap; } /* * Check the first implied pivot optimizes the loop below and slot 1 may * be skipped if there is a gap in slot 0. */ pivots = ma_pivots(mn, mt); if (likely(!slots[0])) { max_gap = pivots[0] - mas->min + 1; i = 2; } else { i = 1; } /* reduce max_piv as the special case is checked before the loop */ max_piv = ma_data_end(mn, mt, pivots, mas->max) - 1; /* * Check end implied pivot which can only be a gap on the right most * node. */ if (unlikely(mas->max == ULONG_MAX) && !slots[max_piv + 1]) { gap = ULONG_MAX - pivots[max_piv]; if (gap > max_gap) max_gap = gap; if (max_gap > pivots[max_piv] - mas->min) return max_gap; } for (; i <= max_piv; i++) { /* data == no gap. */ if (likely(slots[i])) continue; pstart = pivots[i - 1]; gap = pivots[i] - pstart; if (gap > max_gap) max_gap = gap; /* There cannot be two gaps in a row. */ i++; } return max_gap; } /* * ma_max_gap() - Get the maximum gap in a maple node (non-leaf) * @node: The maple node * @gaps: The pointer to the gaps * @mt: The maple node type * @off: Pointer to store the offset location of the gap. * * Uses the metadata data end to scan backwards across set gaps. * * Return: The maximum gap value */ static inline unsigned long ma_max_gap(struct maple_node *node, unsigned long *gaps, enum maple_type mt, unsigned char *off) { unsigned char offset, i; unsigned long max_gap = 0; i = offset = ma_meta_end(node, mt); do { if (gaps[i] > max_gap) { max_gap = gaps[i]; offset = i; } } while (i--); *off = offset; return max_gap; } /* * mas_max_gap() - find the largest gap in a non-leaf node and set the slot. * @mas: The maple state. * * Return: The gap value. */ static inline unsigned long mas_max_gap(struct ma_state *mas) { unsigned long *gaps; unsigned char offset; enum maple_type mt; struct maple_node *node; mt = mte_node_type(mas->node); if (ma_is_leaf(mt)) return mas_leaf_max_gap(mas); node = mas_mn(mas); MAS_BUG_ON(mas, mt != maple_arange_64); offset = ma_meta_gap(node); gaps = ma_gaps(node, mt); return gaps[offset]; } /* * mas_parent_gap() - Set the parent gap and any gaps above, as needed * @mas: The maple state * @offset: The gap offset in the parent to set * @new: The new gap value. * * Set the parent gap then continue to set the gap upwards, using the metadata * of the parent to see if it is necessary to check the node above. */ static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset, unsigned long new) { unsigned long meta_gap = 0; struct maple_node *pnode; struct maple_enode *penode; unsigned long *pgaps; unsigned char meta_offset; enum maple_type pmt; pnode = mte_parent(mas->node); pmt = mas_parent_type(mas, mas->node); penode = mt_mk_node(pnode, pmt); pgaps = ma_gaps(pnode, pmt); ascend: MAS_BUG_ON(mas, pmt != maple_arange_64); meta_offset = ma_meta_gap(pnode); meta_gap = pgaps[meta_offset]; pgaps[offset] = new; if (meta_gap == new) return; if (offset != meta_offset) { if (meta_gap > new) return; ma_set_meta_gap(pnode, pmt, offset); } else if (new < meta_gap) { new = ma_max_gap(pnode, pgaps, pmt, &meta_offset); ma_set_meta_gap(pnode, pmt, meta_offset); } if (ma_is_root(pnode)) return; /* Go to the parent node. */ pnode = mte_parent(penode); pmt = mas_parent_type(mas, penode); pgaps = ma_gaps(pnode, pmt); offset = mte_parent_slot(penode); penode = mt_mk_node(pnode, pmt); goto ascend; } /* * mas_update_gap() - Update a nodes gaps and propagate up if necessary. * @mas: the maple state. */ static inline void mas_update_gap(struct ma_state *mas) { unsigned char pslot; unsigned long p_gap; unsigned long max_gap; if (!mt_is_alloc(mas->tree)) return; if (mte_is_root(mas->node)) return; max_gap = mas_max_gap(mas); pslot = mte_parent_slot(mas->node); p_gap = ma_gaps(mte_parent(mas->node), mas_parent_type(mas, mas->node))[pslot]; if (p_gap != max_gap) mas_parent_gap(mas, pslot, max_gap); } /* * mas_adopt_children() - Set the parent pointer of all nodes in @parent to * @parent with the slot encoded. * @mas: the maple state (for the tree) * @parent: the maple encoded node containing the children. */ static inline void mas_adopt_children(struct ma_state *mas, struct maple_enode *parent) { enum maple_type type = mte_node_type(parent); struct maple_node *node = mte_to_node(parent); void __rcu **slots = ma_slots(node, type); unsigned long *pivots = ma_pivots(node, type); struct maple_enode *child; unsigned char offset; offset = ma_data_end(node, type, pivots, mas->max); do { child = mas_slot_locked(mas, slots, offset); mas_set_parent(mas, child, parent, offset); } while (offset--); } /* * mas_put_in_tree() - Put a new node in the tree, smp_wmb(), and mark the old * node as dead. * @mas: the maple state with the new node * @old_enode: The old maple encoded node to replace. * @new_height: if we are inserting a root node, update the height of the tree */ static inline void mas_put_in_tree(struct ma_state *mas, struct maple_enode *old_enode, char new_height) __must_hold(mas->tree->ma_lock) { unsigned char offset; void __rcu **slots; if (mte_is_root(mas->node)) { mas_mn(mas)->parent = ma_parent_ptr(mas_tree_parent(mas)); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); mt_set_height(mas->tree, new_height); } else { offset = mte_parent_slot(mas->node); slots = ma_slots(mte_parent(mas->node), mas_parent_type(mas, mas->node)); rcu_assign_pointer(slots[offset], mas->node); } mte_set_node_dead(old_enode); } /* * mas_replace_node() - Replace a node by putting it in the tree, marking it * dead, and freeing it. * the parent encoding to locate the maple node in the tree. * @mas: the ma_state with @mas->node pointing to the new node. * @old_enode: The old maple encoded node. * @new_height: The new height of the tree as a result of the operation */ static inline void mas_replace_node(struct ma_state *mas, struct maple_enode *old_enode, unsigned char new_height) __must_hold(mas->tree->ma_lock) { mas_put_in_tree(mas, old_enode, new_height); mas_free(mas, old_enode); } /* * mas_find_child() - Find a child who has the parent @mas->node. * @mas: the maple state with the parent. * @child: the maple state to store the child. */ static inline bool mas_find_child(struct ma_state *mas, struct ma_state *child) __must_hold(mas->tree->ma_lock) { enum maple_type mt; unsigned char offset; unsigned char end; unsigned long *pivots; struct maple_enode *entry; struct maple_node *node; void __rcu **slots; mt = mte_node_type(mas->node); node = mas_mn(mas); slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); end = ma_data_end(node, mt, pivots, mas->max); for (offset = mas->offset; offset <= end; offset++) { entry = mas_slot_locked(mas, slots, offset); if (mte_parent(entry) == node) { *child = *mas; mas->offset = offset + 1; child->offset = offset; mas_descend(child); child->offset = 0; return true; } } return false; } /* * mab_shift_right() - Shift the data in mab right. Note, does not clean out the * old data or set b_node->b_end. * @b_node: the maple_big_node * @shift: the shift count */ static inline void mab_shift_right(struct maple_big_node *b_node, unsigned char shift) { unsigned long size = b_node->b_end * sizeof(unsigned long); memmove(b_node->pivot + shift, b_node->pivot, size); memmove(b_node->slot + shift, b_node->slot, size); if (b_node->type == maple_arange_64) memmove(b_node->gap + shift, b_node->gap, size); } /* * mab_middle_node() - Check if a middle node is needed (unlikely) * @b_node: the maple_big_node that contains the data. * @split: the potential split location * @slot_count: the size that can be stored in a single node being considered. * * Return: true if a middle node is required. */ static inline bool mab_middle_node(struct maple_big_node *b_node, int split, unsigned char slot_count) { unsigned char size = b_node->b_end; if (size >= 2 * slot_count) return true; if (!b_node->slot[split] && (size >= 2 * slot_count - 1)) return true; return false; } /* * mab_no_null_split() - ensure the split doesn't fall on a NULL * @b_node: the maple_big_node with the data * @split: the suggested split location * @slot_count: the number of slots in the node being considered. * * Return: the split location. */ static inline int mab_no_null_split(struct maple_big_node *b_node, unsigned char split, unsigned char slot_count) { if (!b_node->slot[split]) { /* * If the split is less than the max slot && the right side will * still be sufficient, then increment the split on NULL. */ if ((split < slot_count - 1) && (b_node->b_end - split) > (mt_min_slots[b_node->type])) split++; else split--; } return split; } /* * mab_calc_split() - Calculate the split location and if there needs to be two * splits. * @mas: The maple state * @bn: The maple_big_node with the data * @mid_split: The second split, if required. 0 otherwise. * * Return: The first split location. The middle split is set in @mid_split. */ static inline int mab_calc_split(struct ma_state *mas, struct maple_big_node *bn, unsigned char *mid_split) { unsigned char b_end = bn->b_end; int split = b_end / 2; /* Assume equal split. */ unsigned char slot_count = mt_slots[bn->type]; /* * To support gap tracking, all NULL entries are kept together and a node cannot * end on a NULL entry, with the exception of the left-most leaf. The * limitation means that the split of a node must be checked for this condition * and be able to put more data in one direction or the other. * * Although extremely rare, it is possible to enter what is known as the 3-way * split scenario. The 3-way split comes about by means of a store of a range * that overwrites the end and beginning of two full nodes. The result is a set * of entries that cannot be stored in 2 nodes. Sometimes, these two nodes can * also be located in different parent nodes which are also full. This can * carry upwards all the way to the root in the worst case. */ if (unlikely(mab_middle_node(bn, split, slot_count))) { split = b_end / 3; *mid_split = split * 2; } else { *mid_split = 0; } /* Avoid ending a node on a NULL entry */ split = mab_no_null_split(bn, split, slot_count); if (unlikely(*mid_split)) *mid_split = mab_no_null_split(bn, *mid_split, slot_count); return split; } /* * mas_mab_cp() - Copy data from a maple state inclusively to a maple_big_node * and set @b_node->b_end to the next free slot. * @mas: The maple state * @mas_start: The starting slot to copy * @mas_end: The end slot to copy (inclusively) * @b_node: The maple_big_node to place the data * @mab_start: The starting location in maple_big_node to store the data. */ static inline void mas_mab_cp(struct ma_state *mas, unsigned char mas_start, unsigned char mas_end, struct maple_big_node *b_node, unsigned char mab_start) { enum maple_type mt; struct maple_node *node; void __rcu **slots; unsigned long *pivots, *gaps; int i = mas_start, j = mab_start; unsigned char piv_end; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); if (!i) { b_node->pivot[j] = pivots[i++]; if (unlikely(i > mas_end)) goto complete; j++; } piv_end = min(mas_end, mt_pivots[mt]); for (; i < piv_end; i++, j++) { b_node->pivot[j] = pivots[i]; if (unlikely(!b_node->pivot[j])) goto complete; if (unlikely(mas->max == b_node->pivot[j])) goto complete; } b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt); complete: b_node->b_end = ++j; j -= mab_start; slots = ma_slots(node, mt); memcpy(b_node->slot + mab_start, slots + mas_start, sizeof(void *) * j); if (!ma_is_leaf(mt) && mt_is_alloc(mas->tree)) { gaps = ma_gaps(node, mt); memcpy(b_node->gap + mab_start, gaps + mas_start, sizeof(unsigned long) * j); } } /* * mas_leaf_set_meta() - Set the metadata of a leaf if possible. * @node: The maple node * @mt: The maple type * @end: The node end */ static inline void mas_leaf_set_meta(struct maple_node *node, enum maple_type mt, unsigned char end) { if (end < mt_slots[mt] - 1) ma_set_meta(node, mt, 0, end); } /* * mab_mas_cp() - Copy data from maple_big_node to a maple encoded node. * @b_node: the maple_big_node that has the data * @mab_start: the start location in @b_node. * @mab_end: The end location in @b_node (inclusively) * @mas: The maple state with the maple encoded node. */ static inline void mab_mas_cp(struct maple_big_node *b_node, unsigned char mab_start, unsigned char mab_end, struct ma_state *mas, bool new_max) { int i, j = 0; enum maple_type mt = mte_node_type(mas->node); struct maple_node *node = mte_to_node(mas->node); void __rcu **slots = ma_slots(node, mt); unsigned long *pivots = ma_pivots(node, mt); unsigned long *gaps = NULL; unsigned char end; if (mab_end - mab_start > mt_pivots[mt]) mab_end--; if (!pivots[mt_pivots[mt] - 1]) slots[mt_pivots[mt]] = NULL; i = mab_start; do { pivots[j++] = b_node->pivot[i++]; } while (i <= mab_end && likely(b_node->pivot[i])); memcpy(slots, b_node->slot + mab_start, sizeof(void *) * (i - mab_start)); if (new_max) mas->max = b_node->pivot[i - 1]; end = j - 1; if (likely(!ma_is_leaf(mt) && mt_is_alloc(mas->tree))) { unsigned long max_gap = 0; unsigned char offset = 0; gaps = ma_gaps(node, mt); do { gaps[--j] = b_node->gap[--i]; if (gaps[j] > max_gap) { offset = j; max_gap = gaps[j]; } } while (j); ma_set_meta(node, mt, offset, end); } else { mas_leaf_set_meta(node, mt, end); } } /* * mas_store_b_node() - Store an @entry into the b_node while also copying the * data from a maple encoded node. * @wr_mas: the maple write state * @b_node: the maple_big_node to fill with data * @offset_end: the offset to end copying * * Return: The actual end of the data stored in @b_node */ static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char offset_end) { unsigned char slot; unsigned char b_end; /* Possible underflow of piv will wrap back to 0 before use. */ unsigned long piv; struct ma_state *mas = wr_mas->mas; b_node->type = wr_mas->type; b_end = 0; slot = mas->offset; if (slot) { /* Copy start data up to insert. */ mas_mab_cp(mas, 0, slot - 1, b_node, 0); b_end = b_node->b_end; piv = b_node->pivot[b_end - 1]; } else piv = mas->min - 1; if (piv + 1 < mas->index) { /* Handle range starting after old range */ b_node->slot[b_end] = wr_mas->content; if (!wr_mas->content) b_node->gap[b_end] = mas->index - 1 - piv; b_node->pivot[b_end++] = mas->index - 1; } /* Store the new entry. */ mas->offset = b_end; b_node->slot[b_end] = wr_mas->entry; b_node->pivot[b_end] = mas->last; /* Appended. */ if (mas->last >= mas->max) goto b_end; /* Handle new range ending before old range ends */ piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type); if (piv > mas->last) { if (offset_end != slot) wr_mas->content = mas_slot_locked(mas, wr_mas->slots, offset_end); b_node->slot[++b_end] = wr_mas->content; if (!wr_mas->content) b_node->gap[b_end] = piv - mas->last + 1; b_node->pivot[b_end] = piv; } slot = offset_end + 1; if (slot > mas->end) goto b_end; /* Copy end data to the end of the node. */ mas_mab_cp(mas, slot, mas->end + 1, b_node, ++b_end); b_node->b_end--; return; b_end: b_node->b_end = b_end; } /* * mas_prev_sibling() - Find the previous node with the same parent. * @mas: the maple state * * Return: True if there is a previous sibling, false otherwise. */ static inline bool mas_prev_sibling(struct ma_state *mas) { unsigned int p_slot = mte_parent_slot(mas->node); /* For root node, p_slot is set to 0 by mte_parent_slot(). */ if (!p_slot) return false; mas_ascend(mas); mas->offset = p_slot - 1; mas_descend(mas); return true; } /* * mas_next_sibling() - Find the next node with the same parent. * @mas: the maple state * * Return: true if there is a next sibling, false otherwise. */ static inline bool mas_next_sibling(struct ma_state *mas) { MA_STATE(parent, mas->tree, mas->index, mas->last); if (mte_is_root(mas->node)) return false; parent = *mas; mas_ascend(&parent); parent.offset = mte_parent_slot(mas->node) + 1; if (parent.offset > mas_data_end(&parent)) return false; *mas = parent; mas_descend(mas); return true; } /* * mas_node_or_none() - Set the enode and state. * @mas: the maple state * @enode: The encoded maple node. * * Set the node to the enode and the status. */ static inline void mas_node_or_none(struct ma_state *mas, struct maple_enode *enode) { if (enode) { mas->node = enode; mas->status = ma_active; } else { mas->node = NULL; mas->status = ma_none; } } /* * mas_wr_node_walk() - Find the correct offset for the index in the @mas. * If @mas->index cannot be found within the containing * node, we traverse to the last entry in the node. * @wr_mas: The maple write state * * Uses mas_slot_locked() and does not need to worry about dead nodes. */ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char count, offset; if (unlikely(ma_is_dense(wr_mas->type))) { wr_mas->r_max = wr_mas->r_min = mas->index; mas->offset = mas->index = mas->min; return; } wr_mas->node = mas_mn(wr_mas->mas); wr_mas->pivots = ma_pivots(wr_mas->node, wr_mas->type); count = mas->end = ma_data_end(wr_mas->node, wr_mas->type, wr_mas->pivots, mas->max); offset = mas->offset; while (offset < count && mas->index > wr_mas->pivots[offset]) offset++; wr_mas->r_max = offset < count ? wr_mas->pivots[offset] : mas->max; wr_mas->r_min = mas_safe_min(mas, wr_mas->pivots, offset); wr_mas->offset_end = mas->offset = offset; } /* * mast_rebalance_next() - Rebalance against the next node * @mast: The maple subtree state */ static inline void mast_rebalance_next(struct maple_subtree_state *mast) { unsigned char b_end = mast->bn->b_end; mas_mab_cp(mast->orig_r, 0, mt_slot_count(mast->orig_r->node), mast->bn, b_end); mast->orig_r->last = mast->orig_r->max; } /* * mast_rebalance_prev() - Rebalance against the previous node * @mast: The maple subtree state */ static inline void mast_rebalance_prev(struct maple_subtree_state *mast) { unsigned char end = mas_data_end(mast->orig_l) + 1; unsigned char b_end = mast->bn->b_end; mab_shift_right(mast->bn, end); mas_mab_cp(mast->orig_l, 0, end - 1, mast->bn, 0); mast->l->min = mast->orig_l->min; mast->orig_l->index = mast->orig_l->min; mast->bn->b_end = end + b_end; mast->l->offset += end; } /* * mast_spanning_rebalance() - Rebalance nodes with nearest neighbour favouring * the node to the right. Checking the nodes to the right then the left at each * level upwards until root is reached. * Data is copied into the @mast->bn. * @mast: The maple_subtree_state. */ static inline bool mast_spanning_rebalance(struct maple_subtree_state *mast) { struct ma_state r_tmp = *mast->orig_r; struct ma_state l_tmp = *mast->orig_l; unsigned char depth = 0; do { mas_ascend(mast->orig_r); mas_ascend(mast->orig_l); depth++; if (mast->orig_r->offset < mas_data_end(mast->orig_r)) { mast->orig_r->offset++; do { mas_descend(mast->orig_r); mast->orig_r->offset = 0; } while (--depth); mast_rebalance_next(mast); *mast->orig_l = l_tmp; return true; } else if (mast->orig_l->offset != 0) { mast->orig_l->offset--; do { mas_descend(mast->orig_l); mast->orig_l->offset = mas_data_end(mast->orig_l); } while (--depth); mast_rebalance_prev(mast); *mast->orig_r = r_tmp; return true; } } while (!mte_is_root(mast->orig_r->node)); *mast->orig_r = r_tmp; *mast->orig_l = l_tmp; return false; } /* * mast_ascend() - Ascend the original left and right maple states. * @mast: the maple subtree state. * * Ascend the original left and right sides. Set the offsets to point to the * data already in the new tree (@mast->l and @mast->r). */ static inline void mast_ascend(struct maple_subtree_state *mast) { MA_WR_STATE(wr_mas, mast->orig_r, NULL); mas_ascend(mast->orig_l); mas_ascend(mast->orig_r); mast->orig_r->offset = 0; mast->orig_r->index = mast->r->max; /* last should be larger than or equal to index */ if (mast->orig_r->last < mast->orig_r->index) mast->orig_r->last = mast->orig_r->index; wr_mas.type = mte_node_type(mast->orig_r->node); mas_wr_node_walk(&wr_mas); /* Set up the left side of things */ mast->orig_l->offset = 0; mast->orig_l->index = mast->l->min; wr_mas.mas = mast->orig_l; wr_mas.type = mte_node_type(mast->orig_l->node); mas_wr_node_walk(&wr_mas); mast->bn->type = wr_mas.type; } /* * mas_new_ma_node() - Create and return a new maple node. Helper function. * @mas: the maple state with the allocations. * @b_node: the maple_big_node with the type encoding. * * Use the node type from the maple_big_node to allocate a new node from the * ma_state. This function exists mainly for code readability. * * Return: A new maple encoded node */ static inline struct maple_enode *mas_new_ma_node(struct ma_state *mas, struct maple_big_node *b_node) { return mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), b_node->type); } /* * mas_mab_to_node() - Set up right and middle nodes * * @mas: the maple state that contains the allocations. * @b_node: the node which contains the data. * @left: The pointer which will have the left node * @right: The pointer which may have the right node * @middle: the pointer which may have the middle node (rare) * @mid_split: the split location for the middle node * * Return: the split of left. */ static inline unsigned char mas_mab_to_node(struct ma_state *mas, struct maple_big_node *b_node, struct maple_enode **left, struct maple_enode **right, struct maple_enode **middle, unsigned char *mid_split) { unsigned char split = 0; unsigned char slot_count = mt_slots[b_node->type]; *left = mas_new_ma_node(mas, b_node); *right = NULL; *middle = NULL; *mid_split = 0; if (b_node->b_end < slot_count) { split = b_node->b_end; } else { split = mab_calc_split(mas, b_node, mid_split); *right = mas_new_ma_node(mas, b_node); } if (*mid_split) *middle = mas_new_ma_node(mas, b_node); return split; } /* * mab_set_b_end() - Add entry to b_node at b_node->b_end and increment the end * pointer. * @b_node: the big node to add the entry * @mas: the maple state to get the pivot (mas->max) * @entry: the entry to add, if NULL nothing happens. */ static inline void mab_set_b_end(struct maple_big_node *b_node, struct ma_state *mas, void *entry) { if (!entry) return; b_node->slot[b_node->b_end] = entry; if (mt_is_alloc(mas->tree)) b_node->gap[b_node->b_end] = mas_max_gap(mas); b_node->pivot[b_node->b_end++] = mas->max; } /* * mas_set_split_parent() - combine_then_separate helper function. Sets the parent * of @mas->node to either @left or @right, depending on @slot and @split * * @mas: the maple state with the node that needs a parent * @left: possible parent 1 * @right: possible parent 2 * @slot: the slot the mas->node was placed * @split: the split location between @left and @right */ static inline void mas_set_split_parent(struct ma_state *mas, struct maple_enode *left, struct maple_enode *right, unsigned char *slot, unsigned char split) { if (mas_is_none(mas)) return; if ((*slot) <= split) mas_set_parent(mas, mas->node, left, *slot); else if (right) mas_set_parent(mas, mas->node, right, (*slot) - split - 1); (*slot)++; } /* * mte_mid_split_check() - Check if the next node passes the mid-split * @l: Pointer to left encoded maple node. * @m: Pointer to middle encoded maple node. * @r: Pointer to right encoded maple node. * @slot: The offset * @split: The split location. * @mid_split: The middle split. */ static inline void mte_mid_split_check(struct maple_enode **l, struct maple_enode **r, struct maple_enode *right, unsigned char slot, unsigned char *split, unsigned char mid_split) { if (*r == right) return; if (slot < mid_split) return; *l = *r; *r = right; *split = mid_split; } /* * mast_set_split_parents() - Helper function to set three nodes parents. Slot * is taken from @mast->l. * @mast: the maple subtree state * @left: the left node * @right: the right node * @split: the split location. */ static inline void mast_set_split_parents(struct maple_subtree_state *mast, struct maple_enode *left, struct maple_enode *middle, struct maple_enode *right, unsigned char split, unsigned char mid_split) { unsigned char slot; struct maple_enode *l = left; struct maple_enode *r = right; if (mas_is_none(mast->l)) return; if (middle) r = middle; slot = mast->l->offset; mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->l, l, r, &slot, split); mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->m, l, r, &slot, split); mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->r, l, r, &slot, split); } /* * mas_topiary_node() - Dispose of a single node * @mas: The maple state for pushing nodes * @in_rcu: If the tree is in rcu mode * * The node will either be RCU freed or pushed back on the maple state. */ static inline void mas_topiary_node(struct ma_state *mas, struct ma_state *tmp_mas, bool in_rcu) { struct maple_node *tmp; struct maple_enode *enode; if (mas_is_none(tmp_mas)) return; enode = tmp_mas->node; tmp = mte_to_node(enode); mte_set_node_dead(enode); ma_free_rcu(tmp); } /* * mas_topiary_replace() - Replace the data with new data, then repair the * parent links within the new tree. Iterate over the dead sub-tree and collect * the dead subtrees and topiary the nodes that are no longer of use. * * The new tree will have up to three children with the correct parent. Keep * track of the new entries as they need to be followed to find the next level * of new entries. * * The old tree will have up to three children with the old parent. Keep track * of the old entries as they may have more nodes below replaced. Nodes within * [index, last] are dead subtrees, others need to be freed and followed. * * @mas: The maple state pointing at the new data * @old_enode: The maple encoded node being replaced * @new_height: The new height of the tree as a result of the operation * */ static inline void mas_topiary_replace(struct ma_state *mas, struct maple_enode *old_enode, unsigned char new_height) { struct ma_state tmp[3], tmp_next[3]; MA_TOPIARY(subtrees, mas->tree); bool in_rcu; int i, n; /* Place data in tree & then mark node as old */ mas_put_in_tree(mas, old_enode, new_height); /* Update the parent pointers in the tree */ tmp[0] = *mas; tmp[0].offset = 0; tmp[1].status = ma_none; tmp[2].status = ma_none; while (!mte_is_leaf(tmp[0].node)) { n = 0; for (i = 0; i < 3; i++) { if (mas_is_none(&tmp[i])) continue; while (n < 3) { if (!mas_find_child(&tmp[i], &tmp_next[n])) break; n++; } mas_adopt_children(&tmp[i], tmp[i].node); } if (MAS_WARN_ON(mas, n == 0)) break; while (n < 3) tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) tmp[i] = tmp_next[i]; } /* Collect the old nodes that need to be discarded */ if (mte_is_leaf(old_enode)) return mas_free(mas, old_enode); tmp[0] = *mas; tmp[0].offset = 0; tmp[0].node = old_enode; tmp[1].status = ma_none; tmp[2].status = ma_none; in_rcu = mt_in_rcu(mas->tree); do { n = 0; for (i = 0; i < 3; i++) { if (mas_is_none(&tmp[i])) continue; while (n < 3) { if (!mas_find_child(&tmp[i], &tmp_next[n])) break; if ((tmp_next[n].min >= tmp_next->index) && (tmp_next[n].max <= tmp_next->last)) { mat_add(&subtrees, tmp_next[n].node); tmp_next[n].status = ma_none; } else { n++; } } } if (MAS_WARN_ON(mas, n == 0)) break; while (n < 3) tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) { mas_topiary_node(mas, &tmp[i], in_rcu); tmp[i] = tmp_next[i]; } } while (!mte_is_leaf(tmp[0].node)); for (i = 0; i < 3; i++) mas_topiary_node(mas, &tmp[i], in_rcu); mas_mat_destroy(mas, &subtrees); } /* * mas_wmb_replace() - Write memory barrier and replace * @mas: The maple state * @old_enode: The old maple encoded node that is being replaced. * @new_height: The new height of the tree as a result of the operation * * Updates gap as necessary. */ static inline void mas_wmb_replace(struct ma_state *mas, struct maple_enode *old_enode, unsigned char new_height) { /* Insert the new data in the tree */ mas_topiary_replace(mas, old_enode, new_height); if (mte_is_leaf(mas->node)) return; mas_update_gap(mas); } /* * mast_cp_to_nodes() - Copy data out to nodes. * @mast: The maple subtree state * @left: The left encoded maple node * @middle: The middle encoded maple node * @right: The right encoded maple node * @split: The location to split between left and (middle ? middle : right) * @mid_split: The location to split between middle and right. */ static inline void mast_cp_to_nodes(struct maple_subtree_state *mast, struct maple_enode *left, struct maple_enode *middle, struct maple_enode *right, unsigned char split, unsigned char mid_split) { bool new_lmax = true; mas_node_or_none(mast->l, left); mas_node_or_none(mast->m, middle); mas_node_or_none(mast->r, right); mast->l->min = mast->orig_l->min; if (split == mast->bn->b_end) { mast->l->max = mast->orig_r->max; new_lmax = false; } mab_mas_cp(mast->bn, 0, split, mast->l, new_lmax); if (middle) { mab_mas_cp(mast->bn, 1 + split, mid_split, mast->m, true); mast->m->min = mast->bn->pivot[split] + 1; split = mid_split; } mast->r->max = mast->orig_r->max; if (right) { mab_mas_cp(mast->bn, 1 + split, mast->bn->b_end, mast->r, false); mast->r->min = mast->bn->pivot[split] + 1; } } /* * mast_combine_cp_left - Copy in the original left side of the tree into the * combined data set in the maple subtree state big node. * @mast: The maple subtree state */ static inline void mast_combine_cp_left(struct maple_subtree_state *mast) { unsigned char l_slot = mast->orig_l->offset; if (!l_slot) return; mas_mab_cp(mast->orig_l, 0, l_slot - 1, mast->bn, 0); } /* * mast_combine_cp_right: Copy in the original right side of the tree into the * combined data set in the maple subtree state big node. * @mast: The maple subtree state */ static inline void mast_combine_cp_right(struct maple_subtree_state *mast) { if (mast->bn->pivot[mast->bn->b_end - 1] >= mast->orig_r->max) return; mas_mab_cp(mast->orig_r, mast->orig_r->offset + 1, mt_slot_count(mast->orig_r->node), mast->bn, mast->bn->b_end); mast->orig_r->last = mast->orig_r->max; } /* * mast_sufficient: Check if the maple subtree state has enough data in the big * node to create at least one sufficient node * @mast: the maple subtree state */ static inline bool mast_sufficient(struct maple_subtree_state *mast) { if (mast->bn->b_end > mt_min_slot_count(mast->orig_l->node)) return true; return false; } /* * mast_overflow: Check if there is too much data in the subtree state for a * single node. * @mast: The maple subtree state */ static inline bool mast_overflow(struct maple_subtree_state *mast) { if (mast->bn->b_end > mt_slot_count(mast->orig_l->node)) return true; return false; } static inline void *mtree_range_walk(struct ma_state *mas) { unsigned long *pivots; unsigned char offset; struct maple_node *node; struct maple_enode *next, *last; enum maple_type type; void __rcu **slots; unsigned char end; unsigned long max, min; unsigned long prev_max, prev_min; next = mas->node; min = mas->min; max = mas->max; do { last = next; node = mte_to_node(next); type = mte_node_type(next); pivots = ma_pivots(node, type); end = ma_data_end(node, type, pivots, max); prev_min = min; prev_max = max; if (pivots[0] >= mas->index) { offset = 0; max = pivots[0]; goto next; } offset = 1; while (offset < end) { if (pivots[offset] >= mas->index) { max = pivots[offset]; break; } offset++; } min = pivots[offset - 1] + 1; next: slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) goto dead_node; } while (!ma_is_leaf(type)); mas->end = end; mas->offset = offset; mas->index = min; mas->last = max; mas->min = prev_min; mas->max = prev_max; mas->node = last; return (void *)next; dead_node: mas_reset(mas); return NULL; } /* * mas_spanning_rebalance() - Rebalance across two nodes which may not be peers. * @mas: The starting maple state * @mast: The maple_subtree_state, keeps track of 4 maple states. * @count: The estimated count of iterations needed. * * Follow the tree upwards from @l_mas and @r_mas for @count, or until the root * is hit. First @b_node is split into two entries which are inserted into the * next iteration of the loop. @b_node is returned populated with the final * iteration. @mas is used to obtain allocations. orig_l_mas keeps track of the * nodes that will remain active by using orig_l_mas->index and orig_l_mas->last * to account of what has been copied into the new sub-tree. The update of * orig_l_mas->last is used in mas_consume to find the slots that will need to * be either freed or destroyed. orig_l_mas->depth keeps track of the height of * the new sub-tree in case the sub-tree becomes the full tree. */ static void mas_spanning_rebalance(struct ma_state *mas, struct maple_subtree_state *mast, unsigned char count) { unsigned char split, mid_split; unsigned char slot = 0; unsigned char new_height = 0; /* used if node is a new root */ struct maple_enode *left = NULL, *middle = NULL, *right = NULL; struct maple_enode *old_enode; MA_STATE(l_mas, mas->tree, mas->index, mas->index); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(m_mas, mas->tree, mas->index, mas->index); /* * The tree needs to be rebalanced and leaves need to be kept at the same level. * Rebalancing is done by use of the ``struct maple_topiary``. */ mast->l = &l_mas; mast->m = &m_mas; mast->r = &r_mas; l_mas.status = r_mas.status = m_mas.status = ma_none; /* Check if this is not root and has sufficient data. */ if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) && unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) mast_spanning_rebalance(mast); /* * Each level of the tree is examined and balanced, pushing data to the left or * right, or rebalancing against left or right nodes is employed to avoid * rippling up the tree to limit the amount of churn. Once a new sub-section of * the tree is created, there may be a mix of new and old nodes. The old nodes * will have the incorrect parent pointers and currently be in two trees: the * original tree and the partially new tree. To remedy the parent pointers in * the old tree, the new data is swapped into the active tree and a walk down * the tree is performed and the parent pointers are updated. * See mas_topiary_replace() for more information. */ while (count--) { mast->bn->b_end--; mast->bn->type = mte_node_type(mast->orig_l->node); split = mas_mab_to_node(mas, mast->bn, &left, &right, &middle, &mid_split); mast_set_split_parents(mast, left, middle, right, split, mid_split); mast_cp_to_nodes(mast, left, middle, right, split, mid_split); new_height++; /* * Copy data from next level in the tree to mast->bn from next * iteration */ memset(mast->bn, 0, sizeof(struct maple_big_node)); mast->bn->type = mte_node_type(left); /* Root already stored in l->node. */ if (mas_is_root_limits(mast->l)) goto new_root; mast_ascend(mast); mast_combine_cp_left(mast); l_mas.offset = mast->bn->b_end; mab_set_b_end(mast->bn, &l_mas, left); mab_set_b_end(mast->bn, &m_mas, middle); mab_set_b_end(mast->bn, &r_mas, right); /* Copy anything necessary out of the right node. */ mast_combine_cp_right(mast); mast->orig_l->last = mast->orig_l->max; if (mast_sufficient(mast)) { if (mast_overflow(mast)) continue; if (mast->orig_l->node == mast->orig_r->node) { /* * The data in b_node should be stored in one * node and in the tree */ slot = mast->l->offset; break; } continue; } /* May be a new root stored in mast->bn */ if (mas_is_root_limits(mast->orig_l)) break; mast_spanning_rebalance(mast); /* rebalancing from other nodes may require another loop. */ if (!count) count++; } l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), mte_node_type(mast->orig_l->node)); mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true); new_height++; mas_set_parent(mas, left, l_mas.node, slot); if (middle) mas_set_parent(mas, middle, l_mas.node, ++slot); if (right) mas_set_parent(mas, right, l_mas.node, ++slot); if (mas_is_root_limits(mast->l)) { new_root: mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas)); while (!mte_is_root(mast->orig_l->node)) mast_ascend(mast); } else { mas_mn(&l_mas)->parent = mas_mn(mast->orig_l)->parent; } old_enode = mast->orig_l->node; mas->depth = l_mas.depth; mas->node = l_mas.node; mas->min = l_mas.min; mas->max = l_mas.max; mas->offset = l_mas.offset; mas_wmb_replace(mas, old_enode, new_height); mtree_range_walk(mas); return; } /* * mas_rebalance() - Rebalance a given node. * @mas: The maple state * @b_node: The big maple node. * * Rebalance two nodes into a single node or two new nodes that are sufficient. * Continue upwards until tree is sufficient. */ static inline void mas_rebalance(struct ma_state *mas, struct maple_big_node *b_node) { char empty_count = mas_mt_height(mas); struct maple_subtree_state mast; unsigned char shift, b_end = ++b_node->b_end; MA_STATE(l_mas, mas->tree, mas->index, mas->last); MA_STATE(r_mas, mas->tree, mas->index, mas->last); trace_ma_op(TP_FCT, mas); /* * Rebalancing occurs if a node is insufficient. Data is rebalanced * against the node to the right if it exists, otherwise the node to the * left of this node is rebalanced against this node. If rebalancing * causes just one node to be produced instead of two, then the parent * is also examined and rebalanced if it is insufficient. Every level * tries to combine the data in the same way. If one node contains the * entire range of the tree, then that node is used as a new root node. */ mast.orig_l = &l_mas; mast.orig_r = &r_mas; mast.bn = b_node; mast.bn->type = mte_node_type(mas->node); l_mas = r_mas = *mas; if (mas_next_sibling(&r_mas)) { mas_mab_cp(&r_mas, 0, mt_slot_count(r_mas.node), b_node, b_end); r_mas.last = r_mas.index = r_mas.max; } else { mas_prev_sibling(&l_mas); shift = mas_data_end(&l_mas) + 1; mab_shift_right(b_node, shift); mas->offset += shift; mas_mab_cp(&l_mas, 0, shift - 1, b_node, 0); b_node->b_end = shift + b_end; l_mas.index = l_mas.last = l_mas.min; } return mas_spanning_rebalance(mas, &mast, empty_count); } /* * mas_split_final_node() - Split the final node in a subtree operation. * @mast: the maple subtree state * @mas: The maple state */ static inline void mas_split_final_node(struct maple_subtree_state *mast, struct ma_state *mas) { struct maple_enode *ancestor; if (mte_is_root(mas->node)) { if (mt_is_alloc(mas->tree)) mast->bn->type = maple_arange_64; else mast->bn->type = maple_range_64; } /* * Only a single node is used here, could be root. * The Big_node data should just fit in a single node. */ ancestor = mas_new_ma_node(mas, mast->bn); mas_set_parent(mas, mast->l->node, ancestor, mast->l->offset); mas_set_parent(mas, mast->r->node, ancestor, mast->r->offset); mte_to_node(ancestor)->parent = mas_mn(mas)->parent; mast->l->node = ancestor; mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, mast->l, true); mas->offset = mast->bn->b_end - 1; } /* * mast_fill_bnode() - Copy data into the big node in the subtree state * @mast: The maple subtree state * @mas: the maple state * @skip: The number of entries to skip for new nodes insertion. */ static inline void mast_fill_bnode(struct maple_subtree_state *mast, struct ma_state *mas, unsigned char skip) { bool cp = true; unsigned char split; memset(mast->bn, 0, sizeof(struct maple_big_node)); if (mte_is_root(mas->node)) { cp = false; } else { mas_ascend(mas); mas->offset = mte_parent_slot(mas->node); } if (cp && mast->l->offset) mas_mab_cp(mas, 0, mast->l->offset - 1, mast->bn, 0); split = mast->bn->b_end; mab_set_b_end(mast->bn, mast->l, mast->l->node); mast->r->offset = mast->bn->b_end; mab_set_b_end(mast->bn, mast->r, mast->r->node); if (mast->bn->pivot[mast->bn->b_end - 1] == mas->max) cp = false; if (cp) mas_mab_cp(mas, split + skip, mt_slot_count(mas->node) - 1, mast->bn, mast->bn->b_end); mast->bn->b_end--; mast->bn->type = mte_node_type(mas->node); } /* * mast_split_data() - Split the data in the subtree state big node into regular * nodes. * @mast: The maple subtree state * @mas: The maple state * @split: The location to split the big node */ static inline void mast_split_data(struct maple_subtree_state *mast, struct ma_state *mas, unsigned char split) { unsigned char p_slot; mab_mas_cp(mast->bn, 0, split, mast->l, true); mte_set_pivot(mast->r->node, 0, mast->r->max); mab_mas_cp(mast->bn, split + 1, mast->bn->b_end, mast->r, false); mast->l->offset = mte_parent_slot(mas->node); mast->l->max = mast->bn->pivot[split]; mast->r->min = mast->l->max + 1; if (mte_is_leaf(mas->node)) return; p_slot = mast->orig_l->offset; mas_set_split_parent(mast->orig_l, mast->l->node, mast->r->node, &p_slot, split); mas_set_split_parent(mast->orig_r, mast->l->node, mast->r->node, &p_slot, split); } /* * mas_push_data() - Instead of splitting a node, it is beneficial to push the * data to the right or left node if there is room. * @mas: The maple state * @mast: The maple subtree state * @left: Push left or not. * * Keeping the height of the tree low means faster lookups. * * Return: True if pushed, false otherwise. */ static inline bool mas_push_data(struct ma_state *mas, struct maple_subtree_state *mast, bool left) { unsigned char slot_total = mast->bn->b_end; unsigned char end, space, split; MA_STATE(tmp_mas, mas->tree, mas->index, mas->last); tmp_mas = *mas; tmp_mas.depth = mast->l->depth; if (left && !mas_prev_sibling(&tmp_mas)) return false; else if (!left && !mas_next_sibling(&tmp_mas)) return false; end = mas_data_end(&tmp_mas); slot_total += end; space = 2 * mt_slot_count(mas->node) - 2; /* -2 instead of -1 to ensure there isn't a triple split */ if (ma_is_leaf(mast->bn->type)) space--; if (mas->max == ULONG_MAX) space--; if (slot_total >= space) return false; /* Get the data; Fill mast->bn */ mast->bn->b_end++; if (left) { mab_shift_right(mast->bn, end + 1); mas_mab_cp(&tmp_mas, 0, end, mast->bn, 0); mast->bn->b_end = slot_total + 1; } else { mas_mab_cp(&tmp_mas, 0, end, mast->bn, mast->bn->b_end); } /* Configure mast for splitting of mast->bn */ split = mt_slots[mast->bn->type] - 2; if (left) { /* Switch mas to prev node */ *mas = tmp_mas; /* Start using mast->l for the left side. */ tmp_mas.node = mast->l->node; *mast->l = tmp_mas; } else { tmp_mas.node = mast->r->node; *mast->r = tmp_mas; split = slot_total - split; } split = mab_no_null_split(mast->bn, split, mt_slots[mast->bn->type]); /* Update parent slot for split calculation. */ if (left) mast->orig_l->offset += end + 1; mast_split_data(mast, mas, split); mast_fill_bnode(mast, mas, 2); mas_split_final_node(mast, mas); return true; } /* * mas_split() - Split data that is too big for one node into two. * @mas: The maple state * @b_node: The maple big node */ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) { struct maple_subtree_state mast; int height = 0; unsigned int orig_height = mas_mt_height(mas); unsigned char mid_split, split = 0; struct maple_enode *old; /* * Splitting is handled differently from any other B-tree; the Maple * Tree splits upwards. Splitting up means that the split operation * occurs when the walk of the tree hits the leaves and not on the way * down. The reason for splitting up is that it is impossible to know * how much space will be needed until the leaf is (or leaves are) * reached. Since overwriting data is allowed and a range could * overwrite more than one range or result in changing one entry into 3 * entries, it is impossible to know if a split is required until the * data is examined. * * Splitting is a balancing act between keeping allocations to a minimum * and avoiding a 'jitter' event where a tree is expanded to make room * for an entry followed by a contraction when the entry is removed. To * accomplish the balance, there are empty slots remaining in both left * and right nodes after a split. */ MA_STATE(l_mas, mas->tree, mas->index, mas->last); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last); trace_ma_op(TP_FCT, mas); mast.l = &l_mas; mast.r = &r_mas; mast.orig_l = &prev_l_mas; mast.orig_r = &prev_r_mas; mast.bn = b_node; while (height++ <= orig_height) { if (mt_slots[b_node->type] > b_node->b_end) { mas_split_final_node(&mast, mas); break; } l_mas = r_mas = *mas; l_mas.node = mas_new_ma_node(mas, b_node); r_mas.node = mas_new_ma_node(mas, b_node); /* * Another way that 'jitter' is avoided is to terminate a split up early if the * left or right node has space to spare. This is referred to as "pushing left" * or "pushing right" and is similar to the B* tree, except the nodes left or * right can rarely be reused due to RCU, but the ripple upwards is halted which * is a significant savings. */ /* Try to push left. */ if (mas_push_data(mas, &mast, true)) { height++; break; } /* Try to push right. */ if (mas_push_data(mas, &mast, false)) { height++; break; } split = mab_calc_split(mas, b_node, &mid_split); mast_split_data(&mast, mas, split); /* * Usually correct, mab_mas_cp in the above call overwrites * r->max. */ mast.r->max = mas->max; mast_fill_bnode(&mast, mas, 1); prev_l_mas = *mast.l; prev_r_mas = *mast.r; } /* Set the original node as dead */ old = mas->node; mas->node = l_mas.node; mas_wmb_replace(mas, old, height); mtree_range_walk(mas); return; } /* * mas_commit_b_node() - Commit the big node into the tree. * @wr_mas: The maple write state * @b_node: The maple big node */ static noinline_for_kasan void mas_commit_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node) { enum store_type type = wr_mas->mas->store_type; WARN_ON_ONCE(type != wr_rebalance && type != wr_split_store); if (type == wr_rebalance) return mas_rebalance(wr_mas->mas, b_node); return mas_split(wr_mas->mas, b_node); } /* * mas_root_expand() - Expand a root to a node * @mas: The maple state * @entry: The entry to store into the tree */ static inline void mas_root_expand(struct ma_state *mas, void *entry) { void *contents = mas_root_locked(mas); enum maple_type type = maple_leaf_64; struct maple_node *node; void __rcu **slots; unsigned long *pivots; int slot = 0; node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); mas->status = ma_active; if (mas->index) { if (contents) { rcu_assign_pointer(slots[slot], contents); if (likely(mas->index > 1)) slot++; } pivots[slot++] = mas->index - 1; } rcu_assign_pointer(slots[slot], entry); mas->offset = slot; pivots[slot] = mas->last; if (mas->last != ULONG_MAX) pivots[++slot] = ULONG_MAX; mt_set_height(mas->tree, 1); ma_set_meta(node, maple_leaf_64, 0, slot); /* swap the new root into the tree */ rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); return; } /* * mas_store_root() - Storing value into root. * @mas: The maple state * @entry: The entry to store. * * There is no root node now and we are storing a value into the root - this * function either assigns the pointer or expands into a node. */ static inline void mas_store_root(struct ma_state *mas, void *entry) { if (!entry) { if (!mas->index) rcu_assign_pointer(mas->tree->ma_root, NULL); } else if (likely((mas->last != 0) || (mas->index != 0))) mas_root_expand(mas, entry); else if (((unsigned long) (entry) & 3) == 2) mas_root_expand(mas, entry); else { rcu_assign_pointer(mas->tree->ma_root, entry); mas->status = ma_start; } } /* * mas_is_span_wr() - Check if the write needs to be treated as a write that * spans the node. * @wr_mas: The maple write state * * Spanning writes are writes that start in one node and end in another OR if * the write of a %NULL will cause the node to end with a %NULL. * * Return: True if this is a spanning write, false otherwise. */ static bool mas_is_span_wr(struct ma_wr_state *wr_mas) { unsigned long max = wr_mas->r_max; unsigned long last = wr_mas->mas->last; enum maple_type type = wr_mas->type; void *entry = wr_mas->entry; /* Contained in this pivot, fast path */ if (last < max) return false; if (ma_is_leaf(type)) { max = wr_mas->mas->max; if (last < max) return false; } if (last == max) { /* * The last entry of leaf node cannot be NULL unless it is the * rightmost node (writing ULONG_MAX), otherwise it spans slots. */ if (entry || last == ULONG_MAX) return false; } trace_ma_write(TP_FCT, wr_mas->mas, wr_mas->r_max, entry); return true; } static inline void mas_wr_walk_descend(struct ma_wr_state *wr_mas) { wr_mas->type = mte_node_type(wr_mas->mas->node); mas_wr_node_walk(wr_mas); wr_mas->slots = ma_slots(wr_mas->node, wr_mas->type); } static inline void mas_wr_walk_traverse(struct ma_wr_state *wr_mas) { wr_mas->mas->max = wr_mas->r_max; wr_mas->mas->min = wr_mas->r_min; wr_mas->mas->node = wr_mas->content; wr_mas->mas->offset = 0; wr_mas->mas->depth++; } /* * mas_wr_walk() - Walk the tree for a write. * @wr_mas: The maple write state * * Uses mas_slot_locked() and does not need to worry about dead nodes. * * Return: True if it's contained in a node, false on spanning write. */ static bool mas_wr_walk(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; while (true) { mas_wr_walk_descend(wr_mas); if (unlikely(mas_is_span_wr(wr_mas))) return false; wr_mas->content = mas_slot_locked(mas, wr_mas->slots, mas->offset); if (ma_is_leaf(wr_mas->type)) return true; if (mas->end < mt_slots[wr_mas->type] - 1) wr_mas->vacant_height = mas->depth + 1; if (ma_is_root(mas_mn(mas))) { /* root needs more than 2 entries to be sufficient + 1 */ if (mas->end > 2) wr_mas->sufficient_height = 1; } else if (mas->end > mt_min_slots[wr_mas->type] + 1) wr_mas->sufficient_height = mas->depth + 1; mas_wr_walk_traverse(wr_mas); } return true; } static void mas_wr_walk_index(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; while (true) { mas_wr_walk_descend(wr_mas); wr_mas->content = mas_slot_locked(mas, wr_mas->slots, mas->offset); if (ma_is_leaf(wr_mas->type)) return; mas_wr_walk_traverse(wr_mas); } } /* * mas_extend_spanning_null() - Extend a store of a %NULL to include surrounding %NULLs. * @l_wr_mas: The left maple write state * @r_wr_mas: The right maple write state */ static inline void mas_extend_spanning_null(struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas) { struct ma_state *r_mas = r_wr_mas->mas; struct ma_state *l_mas = l_wr_mas->mas; unsigned char l_slot; l_slot = l_mas->offset; if (!l_wr_mas->content) l_mas->index = l_wr_mas->r_min; if ((l_mas->index == l_wr_mas->r_min) && (l_slot && !mas_slot_locked(l_mas, l_wr_mas->slots, l_slot - 1))) { if (l_slot > 1) l_mas->index = l_wr_mas->pivots[l_slot - 2] + 1; else l_mas->index = l_mas->min; l_mas->offset = l_slot - 1; } if (!r_wr_mas->content) { if (r_mas->last < r_wr_mas->r_max) r_mas->last = r_wr_mas->r_max; r_mas->offset++; } else if ((r_mas->last == r_wr_mas->r_max) && (r_mas->last < r_mas->max) && !mas_slot_locked(r_mas, r_wr_mas->slots, r_mas->offset + 1)) { r_mas->last = mas_safe_pivot(r_mas, r_wr_mas->pivots, r_wr_mas->type, r_mas->offset + 1); r_mas->offset++; } } static inline void *mas_state_walk(struct ma_state *mas) { void *entry; entry = mas_start(mas); if (mas_is_none(mas)) return NULL; if (mas_is_ptr(mas)) return entry; return mtree_range_walk(mas); } /* * mtree_lookup_walk() - Internal quick lookup that does not keep maple state up * to date. * * @mas: The maple state. * * Note: Leaves mas in undesirable state. * Return: The entry for @mas->index or %NULL on dead node. */ static inline void *mtree_lookup_walk(struct ma_state *mas) { unsigned long *pivots; unsigned char offset; struct maple_node *node; struct maple_enode *next; enum maple_type type; void __rcu **slots; unsigned char end; next = mas->node; do { node = mte_to_node(next); type = mte_node_type(next); pivots = ma_pivots(node, type); end = mt_pivots[type]; offset = 0; do { if (pivots[offset] >= mas->index) break; } while (++offset < end); slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) goto dead_node; } while (!ma_is_leaf(type)); return (void *)next; dead_node: mas_reset(mas); return NULL; } static void mte_destroy_walk(struct maple_enode *, struct maple_tree *); /* * mas_new_root() - Create a new root node that only contains the entry passed * in. * @mas: The maple state * @entry: The entry to store. * * Only valid when the index == 0 and the last == ULONG_MAX */ static inline void mas_new_root(struct ma_state *mas, void *entry) { struct maple_enode *root = mas_root_locked(mas); enum maple_type type = maple_leaf_64; struct maple_node *node; void __rcu **slots; unsigned long *pivots; WARN_ON_ONCE(mas->index || mas->last != ULONG_MAX); if (!entry) { mt_set_height(mas->tree, 0); rcu_assign_pointer(mas->tree->ma_root, entry); mas->status = ma_start; goto done; } node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); mas->status = ma_active; rcu_assign_pointer(slots[0], entry); pivots[0] = mas->last; mt_set_height(mas->tree, 1); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); done: if (xa_is_node(root)) mte_destroy_walk(root, mas->tree); return; } /* * mas_wr_spanning_store() - Create a subtree with the store operation completed * and new nodes where necessary, then place the sub-tree in the actual tree. * Note that mas is expected to point to the node which caused the store to * span. * @wr_mas: The maple write state */ static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas) { struct maple_subtree_state mast; struct maple_big_node b_node; struct ma_state *mas; unsigned char height; /* Left and Right side of spanning store */ MA_STATE(l_mas, NULL, 0, 0); MA_STATE(r_mas, NULL, 0, 0); MA_WR_STATE(r_wr_mas, &r_mas, wr_mas->entry); MA_WR_STATE(l_wr_mas, &l_mas, wr_mas->entry); /* * A store operation that spans multiple nodes is called a spanning * store and is handled early in the store call stack by the function * mas_is_span_wr(). When a spanning store is identified, the maple * state is duplicated. The first maple state walks the left tree path * to ``index``, the duplicate walks the right tree path to ``last``. * The data in the two nodes are combined into a single node, two nodes, * or possibly three nodes (see the 3-way split above). A ``NULL`` * written to the last entry of a node is considered a spanning store as * a rebalance is required for the operation to complete and an overflow * of data may happen. */ mas = wr_mas->mas; trace_ma_op(TP_FCT, mas); if (unlikely(!mas->index && mas->last == ULONG_MAX)) return mas_new_root(mas, wr_mas->entry); /* * Node rebalancing may occur due to this store, so there may be three new * entries per level plus a new root. */ height = mas_mt_height(mas); /* * Set up right side. Need to get to the next offset after the spanning * store to ensure it's not NULL and to combine both the next node and * the node with the start together. */ r_mas = *mas; /* Avoid overflow, walk to next slot in the tree. */ if (r_mas.last + 1) r_mas.last++; r_mas.index = r_mas.last; mas_wr_walk_index(&r_wr_mas); r_mas.last = r_mas.index = mas->last; /* Set up left side. */ l_mas = *mas; mas_wr_walk_index(&l_wr_mas); if (!wr_mas->entry) { mas_extend_spanning_null(&l_wr_mas, &r_wr_mas); mas->offset = l_mas.offset; mas->index = l_mas.index; mas->last = l_mas.last = r_mas.last; } /* expanding NULLs may make this cover the entire range */ if (!l_mas.index && r_mas.last == ULONG_MAX) { mas_set_range(mas, 0, ULONG_MAX); return mas_new_root(mas, wr_mas->entry); } memset(&b_node, 0, sizeof(struct maple_big_node)); /* Copy l_mas and store the value in b_node. */ mas_store_b_node(&l_wr_mas, &b_node, l_mas.end); /* Copy r_mas into b_node if there is anything to copy. */ if (r_mas.max > r_mas.last) mas_mab_cp(&r_mas, r_mas.offset, r_mas.end, &b_node, b_node.b_end + 1); else b_node.b_end++; /* Stop spanning searches by searching for just index. */ l_mas.index = l_mas.last = mas->index; mast.bn = &b_node; mast.orig_l = &l_mas; mast.orig_r = &r_mas; /* Combine l_mas and r_mas and split them up evenly again. */ return mas_spanning_rebalance(mas, &mast, height + 1); } /* * mas_wr_node_store() - Attempt to store the value in a node * @wr_mas: The maple write state * * Attempts to reuse the node, but may allocate. */ static inline void mas_wr_node_store(struct ma_wr_state *wr_mas, unsigned char new_end) { struct ma_state *mas = wr_mas->mas; void __rcu **dst_slots; unsigned long *dst_pivots; unsigned char dst_offset, offset_end = wr_mas->offset_end; struct maple_node reuse, *newnode; unsigned char copy_size, node_pivots = mt_pivots[wr_mas->type]; bool in_rcu = mt_in_rcu(mas->tree); unsigned char height = mas_mt_height(mas); if (mas->last == wr_mas->end_piv) offset_end++; /* don't copy this offset */ /* set up node. */ if (in_rcu) { newnode = mas_pop_node(mas); } else { memset(&reuse, 0, sizeof(struct maple_node)); newnode = &reuse; } newnode->parent = mas_mn(mas)->parent; dst_pivots = ma_pivots(newnode, wr_mas->type); dst_slots = ma_slots(newnode, wr_mas->type); /* Copy from start to insert point */ memcpy(dst_pivots, wr_mas->pivots, sizeof(unsigned long) * mas->offset); memcpy(dst_slots, wr_mas->slots, sizeof(void *) * mas->offset); /* Handle insert of new range starting after old range */ if (wr_mas->r_min < mas->index) { rcu_assign_pointer(dst_slots[mas->offset], wr_mas->content); dst_pivots[mas->offset++] = mas->index - 1; } /* Store the new entry and range end. */ if (mas->offset < node_pivots) dst_pivots[mas->offset] = mas->last; rcu_assign_pointer(dst_slots[mas->offset], wr_mas->entry); /* * this range wrote to the end of the node or it overwrote the rest of * the data */ if (offset_end > mas->end) goto done; dst_offset = mas->offset + 1; /* Copy to the end of node if necessary. */ copy_size = mas->end - offset_end + 1; memcpy(dst_slots + dst_offset, wr_mas->slots + offset_end, sizeof(void *) * copy_size); memcpy(dst_pivots + dst_offset, wr_mas->pivots + offset_end, sizeof(unsigned long) * (copy_size - 1)); if (new_end < node_pivots) dst_pivots[new_end] = mas->max; done: mas_leaf_set_meta(newnode, maple_leaf_64, new_end); if (in_rcu) { struct maple_enode *old_enode = mas->node; mas->node = mt_mk_node(newnode, wr_mas->type); mas_replace_node(mas, old_enode, height); } else { memcpy(wr_mas->node, newnode, sizeof(struct maple_node)); } trace_ma_write(TP_FCT, mas, 0, wr_mas->entry); mas_update_gap(mas); mas->end = new_end; return; } /* * mas_wr_slot_store: Attempt to store a value in a slot. * @wr_mas: the maple write state */ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char offset = mas->offset; void __rcu **slots = wr_mas->slots; bool gap = false; gap |= !mt_slot_locked(mas->tree, slots, offset); gap |= !mt_slot_locked(mas->tree, slots, offset + 1); if (wr_mas->offset_end - offset == 1) { if (mas->index == wr_mas->r_min) { /* Overwriting the range and a part of the next one */ rcu_assign_pointer(slots[offset], wr_mas->entry); wr_mas->pivots[offset] = mas->last; } else { /* Overwriting a part of the range and the next one */ rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; mas->offset++; /* Keep mas accurate. */ } } else { WARN_ON_ONCE(mt_in_rcu(mas->tree)); /* * Expand the range, only partially overwriting the previous and * next ranges */ gap |= !mt_slot_locked(mas->tree, slots, offset + 2); rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; wr_mas->pivots[offset + 1] = mas->last; mas->offset++; /* Keep mas accurate. */ } trace_ma_write(TP_FCT, mas, 0, wr_mas->entry); /* * Only update gap when the new entry is empty or there is an empty * entry in the original two ranges. */ if (!wr_mas->entry || gap) mas_update_gap(mas); return; } static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; if (!wr_mas->slots[wr_mas->offset_end]) { /* If this one is null, the next and prev are not */ mas->last = wr_mas->end_piv; } else { /* Check next slot(s) if we are overwriting the end */ if ((mas->last == wr_mas->end_piv) && (mas->end != wr_mas->offset_end) && !wr_mas->slots[wr_mas->offset_end + 1]) { wr_mas->offset_end++; if (wr_mas->offset_end == mas->end) mas->last = mas->max; else mas->last = wr_mas->pivots[wr_mas->offset_end]; wr_mas->end_piv = mas->last; } } if (!wr_mas->content) { /* If this one is null, the next and prev are not */ mas->index = wr_mas->r_min; } else { /* Check prev slot if we are overwriting the start */ if (mas->index == wr_mas->r_min && mas->offset && !wr_mas->slots[mas->offset - 1]) { mas->offset--; wr_mas->r_min = mas->index = mas_safe_min(mas, wr_mas->pivots, mas->offset); wr_mas->r_max = wr_mas->pivots[mas->offset]; } } } static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) { while ((wr_mas->offset_end < wr_mas->mas->end) && (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end])) wr_mas->offset_end++; if (wr_mas->offset_end < wr_mas->mas->end) wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; else wr_mas->end_piv = wr_mas->mas->max; } static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end = mas->end + 2; new_end -= wr_mas->offset_end - mas->offset; if (wr_mas->r_min == mas->index) new_end--; if (wr_mas->end_piv == mas->last) new_end--; return new_end; } /* * mas_wr_append: Attempt to append * @wr_mas: the maple write state * @new_end: The end of the node after the modification * * This is currently unsafe in rcu mode since the end of the node may be cached * by readers while the node contents may be updated which could result in * inaccurate information. */ static inline void mas_wr_append(struct ma_wr_state *wr_mas, unsigned char new_end) { struct ma_state *mas = wr_mas->mas; void __rcu **slots; unsigned char end = mas->end; if (new_end < mt_pivots[wr_mas->type]) { wr_mas->pivots[new_end] = wr_mas->pivots[end]; ma_set_meta(wr_mas->node, wr_mas->type, 0, new_end); } slots = wr_mas->slots; if (new_end == end + 1) { if (mas->last == wr_mas->r_max) { /* Append to end of range */ rcu_assign_pointer(slots[new_end], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = new_end; } else { /* Append to start of range */ rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end] = mas->last; rcu_assign_pointer(slots[end], wr_mas->entry); } } else { /* Append to the range without touching any boundaries. */ rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end + 1] = mas->last; rcu_assign_pointer(slots[end + 1], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = end + 1; } if (!wr_mas->content || !wr_mas->entry) mas_update_gap(mas); mas->end = new_end; trace_ma_write(TP_FCT, mas, new_end, wr_mas->entry); return; } /* * mas_wr_bnode() - Slow path for a modification. * @wr_mas: The write maple state * * This is where split, rebalance end up. */ static void mas_wr_bnode(struct ma_wr_state *wr_mas) { struct maple_big_node b_node; trace_ma_write(TP_FCT, wr_mas->mas, 0, wr_mas->entry); memset(&b_node, 0, sizeof(struct maple_big_node)); mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); mas_commit_b_node(wr_mas, &b_node); } /* * mas_wr_store_entry() - Internal call to store a value * @wr_mas: The maple write state */ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end = mas_wr_new_end(wr_mas); switch (mas->store_type) { case wr_exact_fit: rcu_assign_pointer(wr_mas->slots[mas->offset], wr_mas->entry); if (!!wr_mas->entry ^ !!wr_mas->content) mas_update_gap(mas); break; case wr_append: mas_wr_append(wr_mas, new_end); break; case wr_slot_store: mas_wr_slot_store(wr_mas); break; case wr_node_store: mas_wr_node_store(wr_mas, new_end); break; case wr_spanning_store: mas_wr_spanning_store(wr_mas); break; case wr_split_store: case wr_rebalance: mas_wr_bnode(wr_mas); break; case wr_new_root: mas_new_root(mas, wr_mas->entry); break; case wr_store_root: mas_store_root(mas, wr_mas->entry); break; case wr_invalid: MT_BUG_ON(mas->tree, 1); } return; } static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; if (!mas_is_active(mas)) { if (mas_is_start(mas)) goto set_content; if (unlikely(mas_is_paused(mas))) goto reset; if (unlikely(mas_is_none(mas))) goto reset; if (unlikely(mas_is_overflow(mas))) goto reset; if (unlikely(mas_is_underflow(mas))) goto reset; } /* * A less strict version of mas_is_span_wr() where we allow spanning * writes within this node. This is to stop partial walks in * mas_prealloc() from being reset. */ if (mas->last > mas->max) goto reset; if (wr_mas->entry) goto set_content; if (mte_is_leaf(mas->node) && mas->last == mas->max) goto reset; goto set_content; reset: mas_reset(mas); set_content: wr_mas->content = mas_start(mas); } /** * mas_prealloc_calc() - Calculate number of nodes needed for a * given store oepration * @wr_mas: The maple write state * @entry: The entry to store into the tree * * Return: Number of nodes required for preallocation. */ static inline void mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry) { struct ma_state *mas = wr_mas->mas; unsigned char height = mas_mt_height(mas); int ret = height * 3 + 1; unsigned char delta = height - wr_mas->vacant_height; switch (mas->store_type) { case wr_exact_fit: case wr_append: case wr_slot_store: ret = 0; break; case wr_spanning_store: if (wr_mas->sufficient_height < wr_mas->vacant_height) ret = (height - wr_mas->sufficient_height) * 3 + 1; else ret = delta * 3 + 1; break; case wr_split_store: ret = delta * 2 + 1; break; case wr_rebalance: if (wr_mas->sufficient_height < wr_mas->vacant_height) ret = (height - wr_mas->sufficient_height) * 2 + 1; else ret = delta * 2 + 1; break; case wr_node_store: ret = mt_in_rcu(mas->tree) ? 1 : 0; break; case wr_new_root: ret = 1; break; case wr_store_root: if (likely((mas->last != 0) || (mas->index != 0))) ret = 1; else if (((unsigned long) (entry) & 3) == 2) ret = 1; else ret = 0; break; case wr_invalid: WARN_ON_ONCE(1); } mas->node_request = ret; } /* * mas_wr_store_type() - Determine the store type for a given * store operation. * @wr_mas: The maple write state * * Return: the type of store needed for the operation */ static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end; if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) return wr_store_root; if (unlikely(!mas_wr_walk(wr_mas))) return wr_spanning_store; /* At this point, we are at the leaf node that needs to be altered. */ mas_wr_end_piv(wr_mas); if (!wr_mas->entry) mas_wr_extend_null(wr_mas); if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) return wr_exact_fit; if (unlikely(!mas->index && mas->last == ULONG_MAX)) return wr_new_root; new_end = mas_wr_new_end(wr_mas); /* Potential spanning rebalance collapsing a node */ if (new_end < mt_min_slots[wr_mas->type]) { if (!mte_is_root(mas->node)) return wr_rebalance; return wr_node_store; } if (new_end >= mt_slots[wr_mas->type]) return wr_split_store; if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end)) return wr_append; if ((new_end == mas->end) && (!mt_in_rcu(mas->tree) || (wr_mas->offset_end - mas->offset == 1))) return wr_slot_store; return wr_node_store; } /** * mas_wr_preallocate() - Preallocate enough nodes for a store operation * @wr_mas: The maple write state * @entry: The entry that will be stored * */ static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) { struct ma_state *mas = wr_mas->mas; mas_wr_prealloc_setup(wr_mas); mas->store_type = mas_wr_store_type(wr_mas); mas_prealloc_calc(wr_mas, entry); if (!mas->node_request) return; mas_alloc_nodes(mas, GFP_NOWAIT); } /** * mas_insert() - Internal call to insert a value * @mas: The maple state * @entry: The entry to store * * Return: %NULL or the contents that already exists at the requested index * otherwise. The maple state needs to be checked for error conditions. */ static inline void *mas_insert(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); /* * Inserting a new range inserts either 0, 1, or 2 pivots within the * tree. If the insert fits exactly into an existing gap with a value * of NULL, then the slot only needs to be written with the new value. * If the range being inserted is adjacent to another range, then only a * single pivot needs to be inserted (as well as writing the entry). If * the new range is within a gap but does not touch any other ranges, * then two pivots need to be inserted: the start - 1, and the end. As * usual, the entry must be written. Most operations require a new node * to be allocated and replace an existing node to ensure RCU safety, * when in RCU mode. The exception to requiring a newly allocated node * is when inserting at the end of a node (appending). When done * carefully, appending can reuse the node in place. */ wr_mas.content = mas_start(mas); if (wr_mas.content) goto exists; mas_wr_preallocate(&wr_mas, entry); if (mas_is_err(mas)) return NULL; /* spanning writes always overwrite something */ if (mas->store_type == wr_spanning_store) goto exists; /* At this point, we are at the leaf node that needs to be altered. */ if (mas->store_type != wr_new_root && mas->store_type != wr_store_root) { wr_mas.offset_end = mas->offset; wr_mas.end_piv = wr_mas.r_max; if (wr_mas.content || (mas->last > wr_mas.r_max)) goto exists; } mas_wr_store_entry(&wr_mas); return wr_mas.content; exists: mas_set_err(mas, -EEXIST); return wr_mas.content; } /** * mas_alloc_cyclic() - Internal call to find somewhere to store an entry * @mas: The maple state. * @startp: Pointer to ID. * @range_lo: Lower bound of range to search. * @range_hi: Upper bound of range to search. * @entry: The entry to store. * @next: Pointer to next ID to allocate. * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 if the allocation succeeded without wrapping, 1 if the * allocation succeeded after wrapping, or -EBUSY if there are no * free entries. */ int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp) { unsigned long min = range_lo; int ret = 0; range_lo = max(min, *next); ret = mas_empty_area(mas, range_lo, range_hi, 1); if ((mas->tree->ma_flags & MT_FLAGS_ALLOC_WRAPPED) && ret == 0) { mas->tree->ma_flags &= ~MT_FLAGS_ALLOC_WRAPPED; ret = 1; } if (ret < 0 && range_lo > min) { mas_reset(mas); ret = mas_empty_area(mas, min, range_hi, 1); if (ret == 0) ret = 1; } if (ret < 0) return ret; do { mas_insert(mas, entry); } while (mas_nomem(mas, gfp)); if (mas_is_err(mas)) return xa_err(mas->node); *startp = mas->index; *next = *startp + 1; if (*next == 0) mas->tree->ma_flags |= MT_FLAGS_ALLOC_WRAPPED; mas_destroy(mas); return ret; } EXPORT_SYMBOL(mas_alloc_cyclic); static __always_inline void mas_rewalk(struct ma_state *mas, unsigned long index) { retry: mas_set(mas, index); mas_state_walk(mas); if (mas_is_start(mas)) goto retry; } static __always_inline bool mas_rewalk_if_dead(struct ma_state *mas, struct maple_node *node, const unsigned long index) { if (unlikely(ma_dead_node(node))) { mas_rewalk(mas, index); return true; } return false; } /* * mas_prev_node() - Find the prev non-null entry at the same level in the * tree. The prev value will be mas->node[mas->offset] or the status will be * ma_none. * @mas: The maple state * @min: The lower limit to search * * The prev node value will be mas->node[mas->offset] or the status will be * ma_none. * Return: 1 if the node is dead, 0 otherwise. */ static int mas_prev_node(struct ma_state *mas, unsigned long min) { enum maple_type mt; int offset, level; void __rcu **slots; struct maple_node *node; unsigned long *pivots; unsigned long max; node = mas_mn(mas); if (!mas->min) goto no_entry; max = mas->min - 1; if (max < min) goto no_entry; level = 0; do { if (ma_is_root(node)) goto no_entry; /* Walk up. */ if (unlikely(mas_ascend(mas))) return 1; offset = mas->offset; level++; node = mas_mn(mas); } while (!offset); offset--; mt = mte_node_type(mas->node); while (level > 1) { level--; slots = ma_slots(node, mt); mas->node = mas_slot(mas, slots, offset); if (unlikely(ma_dead_node(node))) return 1; mt = mte_node_type(mas->node); node = mas_mn(mas); pivots = ma_pivots(node, mt); offset = ma_data_end(node, mt, pivots, max); if (unlikely(ma_dead_node(node))) return 1; } slots = ma_slots(node, mt); mas->node = mas_slot(mas, slots, offset); pivots = ma_pivots(node, mt); if (unlikely(ma_dead_node(node))) return 1; if (likely(offset)) mas->min = pivots[offset - 1] + 1; mas->max = max; mas->offset = mas_data_end(mas); if (unlikely(mte_dead_node(mas->node))) return 1; mas->end = mas->offset; return 0; no_entry: if (unlikely(ma_dead_node(node))) return 1; mas->status = ma_underflow; return 0; } /* * mas_prev_slot() - Get the entry in the previous slot * * @mas: The maple state * @min: The minimum starting range * @empty: Can be empty * * Return: The entry in the previous slot which is possibly NULL */ static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty) { void *entry; void __rcu **slots; unsigned long pivot; enum maple_type type; unsigned long *pivots; struct maple_node *node; unsigned long save_point = mas->index; retry: node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (mas->min <= min) { pivot = mas_safe_min(mas, pivots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (pivot <= min) goto underflow; } again: if (likely(mas->offset)) { mas->offset--; mas->last = mas->index - 1; mas->index = mas_safe_min(mas, pivots, mas->offset); } else { if (mas->index <= min) goto underflow; if (mas_prev_node(mas, min)) { mas_rewalk(mas, save_point); goto retry; } if (WARN_ON_ONCE(mas_is_underflow(mas))) return NULL; mas->last = mas->max; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); mas->index = pivots[mas->offset - 1] + 1; } slots = ma_slots(node, type); entry = mas_slot(mas, slots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (likely(entry)) return entry; if (!empty) { if (mas->index <= min) goto underflow; goto again; } return entry; underflow: mas->status = ma_underflow; return NULL; } /* * mas_next_node() - Get the next node at the same level in the tree. * @mas: The maple state * @node: The maple node * @max: The maximum pivot value to check. * * The next value will be mas->node[mas->offset] or the status will have * overflowed. * Return: 1 on dead node, 0 otherwise. */ static int mas_next_node(struct ma_state *mas, struct maple_node *node, unsigned long max) { unsigned long min; unsigned long *pivots; struct maple_enode *enode; struct maple_node *tmp; int level = 0; unsigned char node_end; enum maple_type mt; void __rcu **slots; if (mas->max >= max) goto overflow; min = mas->max + 1; level = 0; do { if (ma_is_root(node)) goto overflow; /* Walk up. */ if (unlikely(mas_ascend(mas))) return 1; level++; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); node_end = ma_data_end(node, mt, pivots, mas->max); if (unlikely(ma_dead_node(node))) return 1; } while (unlikely(mas->offset == node_end)); slots = ma_slots(node, mt); mas->offset++; enode = mas_slot(mas, slots, mas->offset); if (unlikely(ma_dead_node(node))) return 1; if (level > 1) mas->offset = 0; while (unlikely(level > 1)) { level--; mas->node = enode; node = mas_mn(mas); mt = mte_node_type(mas->node); slots = ma_slots(node, mt); enode = mas_slot(mas, slots, 0); if (unlikely(ma_dead_node(node))) return 1; } if (!mas->offset) pivots = ma_pivots(node, mt); mas->max = mas_safe_pivot(mas, pivots, mas->offset, mt); tmp = mte_to_node(enode); mt = mte_node_type(enode); pivots = ma_pivots(tmp, mt); mas->end = ma_data_end(tmp, mt, pivots, mas->max); if (unlikely(ma_dead_node(node))) return 1; mas->node = enode; mas->min = min; return 0; overflow: if (unlikely(ma_dead_node(node))) return 1; mas->status = ma_overflow; return 0; } /* * mas_next_slot() - Get the entry in the next slot * * @mas: The maple state * @max: The maximum starting range * @empty: Can be empty * * Return: The entry in the next slot which is possibly NULL */ static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty) { void __rcu **slots; unsigned long *pivots; unsigned long pivot; enum maple_type type; struct maple_node *node; unsigned long save_point = mas->last; void *entry; retry: node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (mas->max >= max) { if (likely(mas->offset < mas->end)) pivot = pivots[mas->offset]; else pivot = mas->max; if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (pivot >= max) { /* Was at the limit, next will extend beyond */ mas->status = ma_overflow; return NULL; } } if (likely(mas->offset < mas->end)) { mas->index = pivots[mas->offset] + 1; again: mas->offset++; if (likely(mas->offset < mas->end)) mas->last = pivots[mas->offset]; else mas->last = mas->max; } else { if (mas->last >= max) { mas->status = ma_overflow; return NULL; } if (mas_next_node(mas, node, max)) { mas_rewalk(mas, save_point); goto retry; } if (WARN_ON_ONCE(mas_is_overflow(mas))) return NULL; mas->offset = 0; mas->index = mas->min; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); mas->last = pivots[0]; } slots = ma_slots(node, type); entry = mt_slot(mas->tree, slots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (entry) return entry; if (!empty) { if (mas->last >= max) { mas->status = ma_overflow; return NULL; } mas->index = mas->last + 1; goto again; } return entry; } /* * mas_rev_awalk() - Internal function. Reverse allocation walk. Find the * highest gap address of a given size in a given node and descend. * @mas: The maple state * @size: The needed size. * * Return: True if found in a leaf, false otherwise. * */ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size, unsigned long *gap_min, unsigned long *gap_max) { enum maple_type type = mte_node_type(mas->node); struct maple_node *node = mas_mn(mas); unsigned long *pivots, *gaps; void __rcu **slots; unsigned long gap = 0; unsigned long max, min; unsigned char offset; if (unlikely(mas_is_err(mas))) return true; if (ma_is_dense(type)) { /* dense nodes. */ mas->offset = (unsigned char)(mas->index - mas->min); return true; } pivots = ma_pivots(node, type); slots = ma_slots(node, type); gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); /* Skip out of bounds. */ while (mas->last < min) min = mas_safe_min(mas, pivots, --offset); max = mas_safe_pivot(mas, pivots, offset, type); while (mas->index <= max) { gap = 0; if (gaps) gap = gaps[offset]; else if (!mas_slot(mas, slots, offset)) gap = max - min + 1; if (gap) { if ((size <= gap) && (size <= mas->last - min + 1)) break; if (!gaps) { /* Skip the next slot, it cannot be a gap. */ if (offset < 2) goto ascend; offset -= 2; max = pivots[offset]; min = mas_safe_min(mas, pivots, offset); continue; } } if (!offset) goto ascend; offset--; max = min - 1; min = mas_safe_min(mas, pivots, offset); } if (unlikely((mas->index > max) || (size - 1 > max - mas->index))) goto no_space; if (unlikely(ma_is_leaf(type))) { mas->offset = offset; *gap_min = min; *gap_max = min + gap - 1; return true; } /* descend, only happens under lock. */ mas->node = mas_slot(mas, slots, offset); mas->min = min; mas->max = max; mas->offset = mas_data_end(mas); return false; ascend: if (!mte_is_root(mas->node)) return false; no_space: mas_set_err(mas, -EBUSY); return false; } static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) { enum maple_type type = mte_node_type(mas->node); unsigned long pivot, min, gap = 0; unsigned char offset, data_end; unsigned long *gaps, *pivots; void __rcu **slots; struct maple_node *node; bool found = false; if (ma_is_dense(type)) { mas->offset = (unsigned char)(mas->index - mas->min); return true; } node = mas_mn(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); data_end = ma_data_end(node, type, pivots, mas->max); for (; offset <= data_end; offset++) { pivot = mas_safe_pivot(mas, pivots, offset, type); /* Not within lower bounds */ if (mas->index > pivot) goto next_slot; if (gaps) gap = gaps[offset]; else if (!mas_slot(mas, slots, offset)) gap = min(pivot, mas->last) - max(mas->index, min) + 1; else goto next_slot; if (gap >= size) { if (ma_is_leaf(type)) { found = true; break; } mas->node = mas_slot(mas, slots, offset); mas->min = min; mas->max = pivot; offset = 0; break; } next_slot: min = pivot + 1; if (mas->last <= pivot) { mas_set_err(mas, -EBUSY); return true; } } mas->offset = offset; return found; } /** * mas_walk() - Search for @mas->index in the tree. * @mas: The maple state. * * mas->index and mas->last will be set to the range if there is a value. If * mas->status is ma_none, reset to ma_start * * Return: the entry at the location or %NULL. */ void *mas_walk(struct ma_state *mas) { void *entry; if (!mas_is_active(mas) && !mas_is_start(mas)) mas->status = ma_start; retry: entry = mas_state_walk(mas); if (mas_is_start(mas)) { goto retry; } else if (mas_is_none(mas)) { mas->index = 0; mas->last = ULONG_MAX; } else if (mas_is_ptr(mas)) { if (!mas->index) { mas->last = 0; return entry; } mas->index = 1; mas->last = ULONG_MAX; mas->status = ma_none; return NULL; } return entry; } EXPORT_SYMBOL_GPL(mas_walk); static inline bool mas_rewind_node(struct ma_state *mas) { unsigned char slot; do { if (mte_is_root(mas->node)) { slot = mas->offset; if (!slot) return false; } else { mas_ascend(mas); slot = mas->offset; } } while (!slot); mas->offset = --slot; return true; } /* * mas_skip_node() - Internal function. Skip over a node. * @mas: The maple state. * * Return: true if there is another node, false otherwise. */ static inline bool mas_skip_node(struct ma_state *mas) { if (mas_is_err(mas)) return false; do { if (mte_is_root(mas->node)) { if (mas->offset >= mas_data_end(mas)) { mas_set_err(mas, -EBUSY); return false; } } else { mas_ascend(mas); } } while (mas->offset >= mas_data_end(mas)); mas->offset++; return true; } /* * mas_awalk() - Allocation walk. Search from low address to high, for a gap of * @size * @mas: The maple state * @size: The size of the gap required * * Search between @mas->index and @mas->last for a gap of @size. */ static inline void mas_awalk(struct ma_state *mas, unsigned long size) { struct maple_enode *last = NULL; /* * There are 4 options: * go to child (descend) * go back to parent (ascend) * no gap found. (return, error == -EBUSY) * found the gap. (return) */ while (!mas_is_err(mas) && !mas_anode_descend(mas, size)) { if (last == mas->node) mas_skip_node(mas); else last = mas->node; } } /* * mas_sparse_area() - Internal function. Return upper or lower limit when * searching for a gap in an empty tree. * @mas: The maple state * @min: the minimum range * @max: The maximum range * @size: The size of the gap * @fwd: Searching forward or back */ static inline int mas_sparse_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size, bool fwd) { if (!unlikely(mas_is_none(mas)) && min == 0) { min++; /* * At this time, min is increased, we need to recheck whether * the size is satisfied. */ if (min > max || max - min + 1 < size) return -EBUSY; } /* mas_is_ptr */ if (fwd) { mas->index = min; mas->last = min + size - 1; } else { mas->last = max; mas->index = max - size + 1; } return 0; } /* * mas_empty_area() - Get the lowest address within the range that is * sufficient for the size requested. * @mas: The maple state * @min: The lowest value of the range * @max: The highest value of the range * @size: The size needed */ int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size) { unsigned char offset; unsigned long *pivots; enum maple_type mt; struct maple_node *node; if (min > max) return -EINVAL; if (size == 0 || max - min < size - 1) return -EINVAL; if (mas_is_start(mas)) mas_start(mas); else if (mas->offset >= 2) mas->offset -= 2; else if (!mas_skip_node(mas)) return -EBUSY; /* Empty set */ if (mas_is_none(mas) || mas_is_ptr(mas)) return mas_sparse_area(mas, min, max, size, true); /* The start of the window can only be within these values */ mas->index = min; mas->last = max; mas_awalk(mas, size); if (unlikely(mas_is_err(mas))) return xa_err(mas->node); offset = mas->offset; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); min = mas_safe_min(mas, pivots, offset); if (mas->index < min) mas->index = min; mas->last = mas->index + size - 1; mas->end = ma_data_end(node, mt, pivots, mas->max); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area); /* * mas_empty_area_rev() - Get the highest address within the range that is * sufficient for the size requested. * @mas: The maple state * @min: The lowest value of the range * @max: The highest value of the range * @size: The size needed */ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size) { struct maple_enode *last = mas->node; if (min > max) return -EINVAL; if (size == 0 || max - min < size - 1) return -EINVAL; if (mas_is_start(mas)) mas_start(mas); else if ((mas->offset < 2) && (!mas_rewind_node(mas))) return -EBUSY; if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) return mas_sparse_area(mas, min, max, size, false); else if (mas->offset >= 2) mas->offset -= 2; else mas->offset = mas_data_end(mas); /* The start of the window can only be within these values. */ mas->index = min; mas->last = max; while (!mas_rev_awalk(mas, size, &min, &max)) { if (last == mas->node) { if (!mas_rewind_node(mas)) return -EBUSY; } else { last = mas->node; } } if (mas_is_err(mas)) return xa_err(mas->node); if (unlikely(mas->offset == MAPLE_NODE_SLOTS)) return -EBUSY; /* Trim the upper limit to the max. */ if (max < mas->last) mas->last = max; mas->index = mas->last - size + 1; mas->end = mas_data_end(mas); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area_rev); /* * mte_dead_leaves() - Mark all leaves of a node as dead. * @enode: the encoded node * @mt: the maple tree * @slots: Pointer to the slot array * * Must hold the write lock. * * Return: The number of leaves marked as dead. */ static inline unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt, void __rcu **slots) { struct maple_node *node; enum maple_type type; void *entry; int offset; for (offset = 0; offset < mt_slot_count(enode); offset++) { entry = mt_slot(mt, slots, offset); type = mte_node_type(entry); node = mte_to_node(entry); /* Use both node and type to catch LE & BE metadata */ if (!node || !type) break; mte_set_node_dead(entry); node->type = type; rcu_assign_pointer(slots[offset], node); } return offset; } /** * mte_dead_walk() - Walk down a dead tree to just before the leaves * @enode: The maple encoded node * @offset: The starting offset * * Note: This can only be used from the RCU callback context. */ static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset) { struct maple_node *node, *next; void __rcu **slots = NULL; next = mte_to_node(*enode); do { *enode = ma_enode_ptr(next); node = mte_to_node(*enode); slots = ma_slots(node, node->type); next = rcu_dereference_protected(slots[offset], lock_is_held(&rcu_callback_map)); offset = 0; } while (!ma_is_leaf(next->type)); return slots; } /** * mt_free_walk() - Walk & free a tree in the RCU callback context * @head: The RCU head that's within the node. * * Note: This can only be used from the RCU callback context. */ static void mt_free_walk(struct rcu_head *head) { void __rcu **slots; struct maple_node *node, *start; struct maple_enode *enode; unsigned char offset; enum maple_type type; node = container_of(head, struct maple_node, rcu); if (ma_is_leaf(node->type)) goto free_leaf; start = node; enode = mt_mk_node(node, node->type); slots = mte_dead_walk(&enode, 0); node = mte_to_node(enode); do { mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; enode = node->piv_parent; if (mte_to_node(enode) == node) goto free_leaf; type = mte_node_type(enode); slots = ma_slots(mte_to_node(enode), type); if ((offset < mt_slots[type]) && rcu_dereference_protected(slots[offset], lock_is_held(&rcu_callback_map))) slots = mte_dead_walk(&enode, offset); node = mte_to_node(enode); } while ((node != start) || (node->slot_len < offset)); slots = ma_slots(node, node->type); mt_free_bulk(node->slot_len, slots); free_leaf: kfree(node); } static inline void __rcu **mte_destroy_descend(struct maple_enode **enode, struct maple_tree *mt, struct maple_enode *prev, unsigned char offset) { struct maple_node *node; struct maple_enode *next = *enode; void __rcu **slots = NULL; enum maple_type type; unsigned char next_offset = 0; do { *enode = next; node = mte_to_node(*enode); type = mte_node_type(*enode); slots = ma_slots(node, type); next = mt_slot_locked(mt, slots, next_offset); if ((mte_dead_node(next))) next = mt_slot_locked(mt, slots, ++next_offset); mte_set_node_dead(*enode); node->type = type; node->piv_parent = prev; node->parent_slot = offset; offset = next_offset; next_offset = 0; prev = *enode; } while (!mte_is_leaf(next)); return slots; } static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free) { void __rcu **slots; struct maple_node *node = mte_to_node(enode); struct maple_enode *start; if (mte_is_leaf(enode)) { mte_set_node_dead(enode); node->type = mte_node_type(enode); goto free_leaf; } start = enode; slots = mte_destroy_descend(&enode, mt, start, 0); node = mte_to_node(enode); // Updated in the above call. do { enum maple_type type; unsigned char offset; struct maple_enode *parent, *tmp; node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; enode = node->piv_parent; if (mte_to_node(enode) == node) goto free_leaf; type = mte_node_type(enode); slots = ma_slots(mte_to_node(enode), type); if (offset >= mt_slots[type]) goto next; tmp = mt_slot_locked(mt, slots, offset); if (mte_node_type(tmp) && mte_to_node(tmp)) { parent = enode; enode = tmp; slots = mte_destroy_descend(&enode, mt, parent, offset); } next: node = mte_to_node(enode); } while (start != enode); node = mte_to_node(enode); node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); free_leaf: if (free) kfree(node); else mt_clear_meta(mt, node, node->type); } /* * mte_destroy_walk() - Free a tree or sub-tree. * @enode: the encoded maple node (maple_enode) to start * @mt: the tree to free - needed for node types. * * Must hold the write lock. */ static inline void mte_destroy_walk(struct maple_enode *enode, struct maple_tree *mt) { struct maple_node *node = mte_to_node(enode); if (mt_in_rcu(mt)) { mt_destroy_walk(enode, mt, false); call_rcu(&node->rcu, mt_free_walk); } else { mt_destroy_walk(enode, mt, true); } } /* Interface */ /** * mas_store() - Store an @entry. * @mas: The maple state. * @entry: The entry to store. * * The @mas->index and @mas->last is used to set the range for the @entry. * * Return: the first entry between mas->index and mas->last or %NULL. */ void *mas_store(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); trace_ma_write(TP_FCT, mas, 0, entry); #ifdef CONFIG_DEBUG_MAPLE_TREE if (MAS_WARN_ON(mas, mas->index > mas->last)) pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last, entry); if (mas->index > mas->last) { mas_set_err(mas, -EINVAL); return NULL; } #endif /* * Storing is the same operation as insert with the added caveat that it * can overwrite entries. Although this seems simple enough, one may * want to examine what happens if a single store operation was to * overwrite multiple entries within a self-balancing B-Tree. */ mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); if (mas->mas_flags & MA_STATE_PREALLOC) { mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); return wr_mas.content; } mas_prealloc_calc(&wr_mas, entry); if (!mas->node_request) goto store; mas_alloc_nodes(mas, GFP_NOWAIT); if (mas_is_err(mas)) return NULL; store: mas_wr_store_entry(&wr_mas); mas_destroy(mas); return wr_mas.content; } EXPORT_SYMBOL_GPL(mas_store); /** * mas_store_gfp() - Store a value into the tree. * @mas: The maple state * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations if necessary. * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp) { unsigned long index = mas->index; unsigned long last = mas->last; MA_WR_STATE(wr_mas, mas, entry); int ret = 0; retry: mas_wr_preallocate(&wr_mas, entry); if (unlikely(mas_nomem(mas, gfp))) { if (!entry) __mas_set_range(mas, index, last); goto retry; } if (mas_is_err(mas)) { ret = xa_err(mas->node); goto out; } mas_wr_store_entry(&wr_mas); out: mas_destroy(mas); return ret; } EXPORT_SYMBOL_GPL(mas_store_gfp); /** * mas_store_prealloc() - Store a value into the tree using memory * preallocated in the maple state. * @mas: The maple state * @entry: The entry to store. */ void mas_store_prealloc(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); if (mas->store_type == wr_store_root) { mas_wr_prealloc_setup(&wr_mas); goto store; } mas_wr_walk_descend(&wr_mas); if (mas->store_type != wr_spanning_store) { /* set wr_mas->content to current slot */ wr_mas.content = mas_slot_locked(mas, wr_mas.slots, mas->offset); mas_wr_end_piv(&wr_mas); } store: trace_ma_write(TP_FCT, mas, 0, entry); mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); mas_destroy(mas); } EXPORT_SYMBOL_GPL(mas_store_prealloc); /** * mas_preallocate() - Preallocate enough nodes for a store operation * @mas: The maple state * @entry: The entry that will be stored * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { MA_WR_STATE(wr_mas, mas, entry); mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); mas_prealloc_calc(&wr_mas, entry); if (!mas->node_request) goto set_flag; mas->mas_flags &= ~MA_STATE_PREALLOC; mas_alloc_nodes(mas, gfp); if (mas_is_err(mas)) { int ret = xa_err(mas->node); mas->node_request = 0; mas_destroy(mas); mas_reset(mas); return ret; } set_flag: mas->mas_flags |= MA_STATE_PREALLOC; return 0; } EXPORT_SYMBOL_GPL(mas_preallocate); /* * mas_destroy() - destroy a maple state. * @mas: The maple state * * Upon completion, check the left-most node and rebalance against the node to * the right if necessary. Frees any allocated nodes associated with this maple * state. */ void mas_destroy(struct ma_state *mas) { mas->mas_flags &= ~MA_STATE_PREALLOC; mas_empty_nodes(mas); } EXPORT_SYMBOL_GPL(mas_destroy); static void mas_may_activate(struct ma_state *mas) { if (!mas->node) { mas->status = ma_start; } else if (mas->index > mas->max || mas->index < mas->min) { mas->status = ma_start; } else { mas->status = ma_active; } } static bool mas_next_setup(struct ma_state *mas, unsigned long max, void **entry) { bool was_none = mas_is_none(mas); if (unlikely(mas->last >= max)) { mas->status = ma_overflow; return true; } switch (mas->status) { case ma_active: return false; case ma_none: fallthrough; case ma_pause: mas->status = ma_start; fallthrough; case ma_start: mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ break; case ma_overflow: /* Overflowed before, but the max changed */ mas_may_activate(mas); break; case ma_underflow: /* The user expects the mas to be one before where it is */ mas_may_activate(mas); *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (likely(mas_is_active(mas))) /* Fast path */ return false; if (mas_is_ptr(mas)) { *entry = NULL; if (was_none && mas->index == 0) { mas->index = mas->last = 0; return true; } mas->index = 1; mas->last = ULONG_MAX; mas->status = ma_none; return true; } if (mas_is_none(mas)) return true; return false; } /** * mas_next() - Get the next entry. * @mas: The maple state * @max: The maximum index to check. * * Returns the next entry after @mas->index. * Must hold rcu_read_lock or the write lock. * Can return the zero entry. * * Return: The next entry or %NULL */ void *mas_next(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_next_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, false); } EXPORT_SYMBOL_GPL(mas_next); /** * mas_next_range() - Advance the maple state to the next range * @mas: The maple state * @max: The maximum index to check. * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. * Can return the zero entry. * * Return: The next entry or %NULL */ void *mas_next_range(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_next_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_next_range); /** * mt_next() - get the next value in the maple tree * @mt: The maple tree * @index: The start index * @max: The maximum index to check * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * Return: The entry higher than @index or %NULL if nothing is found. */ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max) { void *entry = NULL; MA_STATE(mas, mt, index, index); rcu_read_lock(); entry = mas_next(&mas, max); rcu_read_unlock(); return entry; } EXPORT_SYMBOL_GPL(mt_next); static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry) { if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } switch (mas->status) { case ma_active: return false; case ma_start: break; case ma_none: fallthrough; case ma_pause: mas->status = ma_start; break; case ma_underflow: /* underflowed before but the min changed */ mas_may_activate(mas); break; case ma_overflow: /* User expects mas to be one after where it is */ mas_may_activate(mas); *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) mas_walk(mas); if (unlikely(mas_is_ptr(mas))) { if (!mas->index) { mas->status = ma_none; return true; } mas->index = mas->last = 0; *entry = mas_root(mas); return true; } if (mas_is_none(mas)) { if (mas->index) { /* Walked to out-of-range pointer? */ mas->index = mas->last = 0; mas->status = ma_root; *entry = mas_root(mas); return true; } return true; } return false; } /** * mas_prev() - Get the previous entry * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * Will reset mas to ma_start if the status is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. */ void *mas_prev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_prev_setup(mas, min, &entry)) return entry; return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_prev); /** * mas_prev_range() - Advance to the previous range * @mas: The maple state * @min: The minimum value to check. * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. * Will reset mas to ma_start if the node is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. */ void *mas_prev_range(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_prev_setup(mas, min, &entry)) return entry; return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_prev_range); /** * mt_prev() - get the previous value in the maple tree * @mt: The maple tree * @index: The start index * @min: The minimum index to check * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * Return: The entry before @index or %NULL if nothing is found. */ void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min) { void *entry = NULL; MA_STATE(mas, mt, index, index); rcu_read_lock(); entry = mas_prev(&mas, min); rcu_read_unlock(); return entry; } EXPORT_SYMBOL_GPL(mt_prev); /** * mas_pause() - Pause a mas_find/mas_for_each to drop the lock. * @mas: The maple state to pause * * Some users need to pause a walk and drop the lock they're holding in * order to yield to a higher priority thread or carry out an operation * on an entry. Those users should call this function before they drop * the lock. It resets the @mas to be suitable for the next iteration * of the loop after the user has reacquired the lock. If most entries * found during a walk require you to call mas_pause(), the mt_for_each() * iterator may be more appropriate. * */ void mas_pause(struct ma_state *mas) { mas->status = ma_pause; mas->node = NULL; } EXPORT_SYMBOL_GPL(mas_pause); /** * mas_find_setup() - Internal function to set up mas_find*(). * @mas: The maple state * @max: The maximum index * @entry: Pointer to the entry * * Returns: True if entry is the answer, false otherwise. */ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long max, void **entry) { switch (mas->status) { case ma_active: if (mas->last < max) return false; return true; case ma_start: break; case ma_pause: if (unlikely(mas->last >= max)) return true; mas->index = ++mas->last; mas->status = ma_start; break; case ma_none: if (unlikely(mas->last >= max)) return true; mas->index = mas->last; mas->status = ma_start; break; case ma_underflow: /* mas is pointing at entry before unable to go lower */ if (unlikely(mas->index >= max)) { mas->status = ma_overflow; return true; } mas_may_activate(mas); *entry = mas_walk(mas); if (*entry) return true; break; case ma_overflow: if (unlikely(mas->last >= max)) return true; mas_may_activate(mas); *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) { /* First run or continue */ if (mas->index > max) return true; *entry = mas_walk(mas); if (*entry) return true; } if (unlikely(mas_is_ptr(mas))) goto ptr_out_of_range; if (unlikely(mas_is_none(mas))) return true; if (mas->index == max) return true; return false; ptr_out_of_range: mas->status = ma_none; mas->index = 1; mas->last = ULONG_MAX; return true; } /** * mas_find() - On the first call, find the entry at or after mas->index up to * %max. Otherwise, find the entry after mas->index. * @mas: The maple state * @max: The maximum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ void *mas_find(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_find_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ entry = mas_next_slot(mas, max, false); /* Ignore overflow */ mas->status = ma_active; return entry; } EXPORT_SYMBOL_GPL(mas_find); /** * mas_find_range() - On the first call, find the entry at or after * mas->index up to %max. Otherwise, advance to the next slot mas->index. * @mas: The maple state * @max: The maximum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ void *mas_find_range(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_find_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_find_range); /** * mas_find_rev_setup() - Internal function to set up mas_find_*_rev() * @mas: The maple state * @min: The minimum index * @entry: Pointer to the entry * * Returns: True if entry is the answer, false otherwise. */ static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, void **entry) { switch (mas->status) { case ma_active: goto active; case ma_start: break; case ma_pause: if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } mas->last = --mas->index; mas->status = ma_start; break; case ma_none: if (mas->index <= min) goto none; mas->last = mas->index; mas->status = ma_start; break; case ma_overflow: /* user expects the mas to be one after where it is */ if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } mas->status = ma_active; break; case ma_underflow: /* user expects the mas to be one before where it is */ if (unlikely(mas->index <= min)) return true; mas->status = ma_active; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) { /* First run or continue */ if (mas->index < min) return true; *entry = mas_walk(mas); if (*entry) return true; } if (unlikely(mas_is_ptr(mas))) goto none; if (unlikely(mas_is_none(mas))) { /* * Walked to the location, and there was nothing so the previous * location is 0. */ mas->last = mas->index = 0; mas->status = ma_root; *entry = mas_root(mas); return true; } active: if (mas->index < min) return true; return false; none: mas->status = ma_none; return true; } /** * mas_find_rev: On the first call, find the first non-null entry at or below * mas->index down to %min. Otherwise find the first non-null entry below * mas->index down to %min. * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ void *mas_find_rev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_find_rev_setup(mas, min, &entry)) return entry; /* Retries on dead nodes handled by mas_prev_slot */ return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_find_rev); /** * mas_find_range_rev: On the first call, find the first non-null entry at or * below mas->index down to %min. Otherwise advance to the previous slot after * mas->index down to %min. * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ void *mas_find_range_rev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_find_rev_setup(mas, min, &entry)) return entry; /* Retries on dead nodes handled by mas_prev_slot */ return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_find_range_rev); /** * mas_erase() - Find the range in which index resides and erase the entire * range. * @mas: The maple state * * Must hold the write lock. * Searches for @mas->index, sets @mas->index and @mas->last to the range and * erases that range. * * Return: the entry that was erased or %NULL, @mas->index and @mas->last are updated. */ void *mas_erase(struct ma_state *mas) { void *entry; unsigned long index = mas->index; MA_WR_STATE(wr_mas, mas, NULL); if (!mas_is_active(mas) || !mas_is_start(mas)) mas->status = ma_start; write_retry: entry = mas_state_walk(mas); if (!entry) return NULL; /* Must reset to ensure spanning writes of last slot are detected */ mas_reset(mas); mas_wr_preallocate(&wr_mas, NULL); if (mas_nomem(mas, GFP_KERNEL)) { /* in case the range of entry changed when unlocked */ mas->index = mas->last = index; goto write_retry; } if (mas_is_err(mas)) goto out; mas_wr_store_entry(&wr_mas); out: mas_destroy(mas); return entry; } EXPORT_SYMBOL_GPL(mas_erase); /** * mas_nomem() - Check if there was an error allocating and do the allocation * if necessary If there are allocations, then free them. * @mas: The maple state * @gfp: The GFP_FLAGS to use for allocations * Return: true on allocation, false otherwise. */ bool mas_nomem(struct ma_state *mas, gfp_t gfp) __must_hold(mas->tree->ma_lock) { if (likely(mas->node != MA_ERROR(-ENOMEM))) return false; if (gfpflags_allow_blocking(gfp) && !mt_external_lock(mas->tree)) { mtree_unlock(mas->tree); mas_alloc_nodes(mas, gfp); mtree_lock(mas->tree); } else { mas_alloc_nodes(mas, gfp); } if (!mas->sheaf && !mas->alloc) return false; mas->status = ma_start; return true; } void __init maple_tree_init(void) { struct kmem_cache_args args = { .align = sizeof(struct maple_node), .sheaf_capacity = 32, }; maple_node_cache = kmem_cache_create("maple_node", sizeof(struct maple_node), &args, SLAB_PANIC); } /** * mtree_load() - Load a value stored in a maple tree * @mt: The maple tree * @index: The index to load * * Return: the entry or %NULL */ void *mtree_load(struct maple_tree *mt, unsigned long index) { MA_STATE(mas, mt, index, index); void *entry; trace_ma_read(TP_FCT, &mas); rcu_read_lock(); retry: entry = mas_start(&mas); if (unlikely(mas_is_none(&mas))) goto unlock; if (unlikely(mas_is_ptr(&mas))) { if (index) entry = NULL; goto unlock; } entry = mtree_lookup_walk(&mas); if (!entry && unlikely(mas_is_start(&mas))) goto retry; unlock: rcu_read_unlock(); if (xa_is_zero(entry)) return NULL; return entry; } EXPORT_SYMBOL(mtree_load); /** * mtree_store_range() - Store an entry at a given range. * @mt: The maple tree * @index: The start of the range * @last: The end of the range * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mtree_store_range(struct maple_tree *mt, unsigned long index, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(mas, mt, index, last); int ret = 0; trace_ma_write(TP_FCT, &mas, 0, entry); if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (index > last) return -EINVAL; mtree_lock(mt); ret = mas_store_gfp(&mas, entry, gfp); mtree_unlock(mt); return ret; } EXPORT_SYMBOL(mtree_store_range); /** * mtree_store() - Store an entry at a given index. * @mt: The maple tree * @index: The index to store the value * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp) { return mtree_store_range(mt, index, index, entry, gfp); } EXPORT_SYMBOL(mtree_store); /** * mtree_insert_range() - Insert an entry at a given range if there is no value. * @mt: The maple tree * @first: The start of the range * @last: The end of the range * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. */ int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(ms, mt, first, last); int ret = 0; if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (first > last) return -EINVAL; mtree_lock(mt); retry: mas_insert(&ms, entry); if (mas_nomem(&ms, gfp)) goto retry; mtree_unlock(mt); if (mas_is_err(&ms)) ret = xa_err(ms.node); mas_destroy(&ms); return ret; } EXPORT_SYMBOL(mtree_insert_range); /** * mtree_insert() - Insert an entry at a given index if there is no value. * @mt: The maple tree * @index : The index to store the value * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. */ int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp) { return mtree_insert_range(mt, index, index, entry, gfp); } EXPORT_SYMBOL(mtree_insert); int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); retry: ret = mas_empty_area(&mas, min, max, size); if (ret) goto unlock; mas_insert(&mas, entry); /* * mas_nomem() may release the lock, causing the allocated area * to be unavailable, so try to allocate a free area again. */ if (mas_nomem(&mas, gfp)) goto retry; if (mas_is_err(&mas)) ret = xa_err(mas.node); else *startp = mas.index; unlock: mtree_unlock(mt); mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_range); /** * mtree_alloc_cyclic() - Find somewhere to store this entry in the tree. * @mt: The maple tree. * @startp: Pointer to ID. * @range_lo: Lower bound of range to search. * @range_hi: Upper bound of range to search. * @entry: The entry to store. * @next: Pointer to next ID to allocate. * @gfp: The GFP_FLAGS to use for allocations. * * Finds an empty entry in @mt after @next, stores the new index into * the @id pointer, stores the entry at that index, then updates @next. * * @mt must be initialized with the MT_FLAGS_ALLOC_RANGE flag. * * Context: Any context. Takes and releases the mt.lock. May sleep if * the @gfp flags permit. * * Return: 0 if the allocation succeeded without wrapping, 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated, -EINVAL if @mt cannot be used, or -EBUSY if there are no * free entries. */ int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp) { int ret; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); ret = mas_alloc_cyclic(&mas, startp, entry, range_lo, range_hi, next, gfp); mtree_unlock(mt); return ret; } EXPORT_SYMBOL(mtree_alloc_cyclic); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); retry: ret = mas_empty_area_rev(&mas, min, max, size); if (ret) goto unlock; mas_insert(&mas, entry); /* * mas_nomem() may release the lock, causing the allocated area * to be unavailable, so try to allocate a free area again. */ if (mas_nomem(&mas, gfp)) goto retry; if (mas_is_err(&mas)) ret = xa_err(mas.node); else *startp = mas.index; unlock: mtree_unlock(mt); mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_rrange); /** * mtree_erase() - Find an index and erase the entire range. * @mt: The maple tree * @index: The index to erase * * Erasing is the same as a walk to an entry then a store of a NULL to that * ENTIRE range. In fact, it is implemented as such using the advanced API. * * Return: The entry stored at the @index or %NULL */ void *mtree_erase(struct maple_tree *mt, unsigned long index) { void *entry = NULL; MA_STATE(mas, mt, index, index); trace_ma_op(TP_FCT, &mas); mtree_lock(mt); entry = mas_erase(&mas); mtree_unlock(mt); return entry; } EXPORT_SYMBOL(mtree_erase); /* * mas_dup_free() - Free an incomplete duplication of a tree. * @mas: The maple state of a incomplete tree. * * The parameter @mas->node passed in indicates that the allocation failed on * this node. This function frees all nodes starting from @mas->node in the * reverse order of mas_dup_build(). There is no need to hold the source tree * lock at this time. */ static void mas_dup_free(struct ma_state *mas) { struct maple_node *node; enum maple_type type; void __rcu **slots; unsigned char count, i; /* Maybe the first node allocation failed. */ if (mas_is_none(mas)) return; while (!mte_is_root(mas->node)) { mas_ascend(mas); if (mas->offset) { mas->offset--; do { mas_descend(mas); mas->offset = mas_data_end(mas); } while (!mte_is_leaf(mas->node)); mas_ascend(mas); } node = mte_to_node(mas->node); type = mte_node_type(mas->node); slots = ma_slots(node, type); count = mas_data_end(mas) + 1; for (i = 0; i < count; i++) ((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK; mt_free_bulk(count, slots); } node = mte_to_node(mas->node); kfree(node); } /* * mas_copy_node() - Copy a maple node and replace the parent. * @mas: The maple state of source tree. * @new_mas: The maple state of new tree. * @parent: The parent of the new node. * * Copy @mas->node to @new_mas->node, set @parent to be the parent of * @new_mas->node. If memory allocation fails, @mas is set to -ENOMEM. */ static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas, struct maple_pnode *parent) { struct maple_node *node = mte_to_node(mas->node); struct maple_node *new_node = mte_to_node(new_mas->node); unsigned long val; /* Copy the node completely. */ memcpy(new_node, node, sizeof(struct maple_node)); /* Update the parent node pointer. */ val = (unsigned long)node->parent & MAPLE_NODE_MASK; new_node->parent = ma_parent_ptr(val | (unsigned long)parent); } /* * mas_dup_alloc() - Allocate child nodes for a maple node. * @mas: The maple state of source tree. * @new_mas: The maple state of new tree. * @gfp: The GFP_FLAGS to use for allocations. * * This function allocates child nodes for @new_mas->node during the duplication * process. If memory allocation fails, @mas is set to -ENOMEM. */ static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas, gfp_t gfp) { struct maple_node *node = mte_to_node(mas->node); struct maple_node *new_node = mte_to_node(new_mas->node); enum maple_type type; unsigned char count, i; void __rcu **slots; void __rcu **new_slots; unsigned long val; /* Allocate memory for child nodes. */ type = mte_node_type(mas->node); new_slots = ma_slots(new_node, type); count = mas->node_request = mas_data_end(mas) + 1; mas_alloc_nodes(mas, gfp); if (unlikely(mas_is_err(mas))) return; slots = ma_slots(node, type); for (i = 0; i < count; i++) { val = (unsigned long)mt_slot_locked(mas->tree, slots, i); val &= MAPLE_NODE_MASK; new_slots[i] = ma_mnode_ptr((unsigned long)mas_pop_node(mas) | val); } } /* * mas_dup_build() - Build a new maple tree from a source tree * @mas: The maple state of source tree, need to be in MAS_START state. * @new_mas: The maple state of new tree, need to be in MAS_START state. * @gfp: The GFP_FLAGS to use for allocations. * * This function builds a new tree in DFS preorder. If the memory allocation * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the * last node. mas_dup_free() will free the incomplete duplication of a tree. * * Note that the attributes of the two trees need to be exactly the same, and the * new tree needs to be empty, otherwise -EINVAL will be set in @mas. */ static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas, gfp_t gfp) { struct maple_node *node; struct maple_pnode *parent = NULL; struct maple_enode *root; enum maple_type type; if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) || unlikely(!mtree_empty(new_mas->tree))) { mas_set_err(mas, -EINVAL); return; } root = mas_start(mas); if (mas_is_ptr(mas) || mas_is_none(mas)) goto set_new_tree; node = mt_alloc_one(gfp); if (!node) { new_mas->status = ma_none; mas_set_err(mas, -ENOMEM); return; } type = mte_node_type(mas->node); root = mt_mk_node(node, type); new_mas->node = root; new_mas->min = 0; new_mas->max = ULONG_MAX; root = mte_mk_root(root); while (1) { mas_copy_node(mas, new_mas, parent); if (!mte_is_leaf(mas->node)) { /* Only allocate child nodes for non-leaf nodes. */ mas_dup_alloc(mas, new_mas, gfp); if (unlikely(mas_is_err(mas))) goto empty_mas; } else { /* * This is the last leaf node and duplication is * completed. */ if (mas->max == ULONG_MAX) goto done; /* This is not the last leaf node and needs to go up. */ do { mas_ascend(mas); mas_ascend(new_mas); } while (mas->offset == mas_data_end(mas)); /* Move to the next subtree. */ mas->offset++; new_mas->offset++; } mas_descend(mas); parent = ma_parent_ptr(mte_to_node(new_mas->node)); mas_descend(new_mas); mas->offset = 0; new_mas->offset = 0; } done: /* Specially handle the parent of the root node. */ mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas)); set_new_tree: /* Make them the same height */ new_mas->tree->ma_flags = mas->tree->ma_flags; rcu_assign_pointer(new_mas->tree->ma_root, root); empty_mas: mas_empty_nodes(mas); } /** * __mt_dup(): Duplicate an entire maple tree * @mt: The source maple tree * @new: The new maple tree * @gfp: The GFP_FLAGS to use for allocations * * This function duplicates a maple tree in Depth-First Search (DFS) pre-order * traversal. It uses memcpy() to copy nodes in the source tree and allocate * new child nodes in non-leaf nodes. The new node is exactly the same as the * source node except for all the addresses stored in it. It will be faster than * traversing all elements in the source tree and inserting them one by one into * the new tree. * The user needs to ensure that the attributes of the source tree and the new * tree are the same, and the new tree needs to be an empty tree, otherwise * -EINVAL will be returned. * Note that the user needs to manually lock the source tree and the new tree. * * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If * the attributes of the two trees are different or the new tree is not an empty * tree. */ int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); MA_STATE(new_mas, new, 0, 0); mas_dup_build(&mas, &new_mas, gfp); if (unlikely(mas_is_err(&mas))) { ret = xa_err(mas.node); if (ret == -ENOMEM) mas_dup_free(&new_mas); } return ret; } EXPORT_SYMBOL(__mt_dup); /** * mtree_dup(): Duplicate an entire maple tree * @mt: The source maple tree * @new: The new maple tree * @gfp: The GFP_FLAGS to use for allocations * * This function duplicates a maple tree in Depth-First Search (DFS) pre-order * traversal. It uses memcpy() to copy nodes in the source tree and allocate * new child nodes in non-leaf nodes. The new node is exactly the same as the * source node except for all the addresses stored in it. It will be faster than * traversing all elements in the source tree and inserting them one by one into * the new tree. * The user needs to ensure that the attributes of the source tree and the new * tree are the same, and the new tree needs to be an empty tree, otherwise * -EINVAL will be returned. * * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If * the attributes of the two trees are different or the new tree is not an empty * tree. */ int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); MA_STATE(new_mas, new, 0, 0); mas_lock(&new_mas); mas_lock_nested(&mas, SINGLE_DEPTH_NESTING); mas_dup_build(&mas, &new_mas, gfp); mas_unlock(&mas); if (unlikely(mas_is_err(&mas))) { ret = xa_err(mas.node); if (ret == -ENOMEM) mas_dup_free(&new_mas); } mas_unlock(&new_mas); return ret; } EXPORT_SYMBOL(mtree_dup); /** * __mt_destroy() - Walk and free all nodes of a locked maple tree. * @mt: The maple tree * * Note: Does not handle locking. */ void __mt_destroy(struct maple_tree *mt) { void *root = mt_root_locked(mt); rcu_assign_pointer(mt->ma_root, NULL); if (xa_is_node(root)) mte_destroy_walk(root, mt); mt->ma_flags = mt_attr(mt); } EXPORT_SYMBOL_GPL(__mt_destroy); /** * mtree_destroy() - Destroy a maple tree * @mt: The maple tree * * Frees all resources used by the tree. Handles locking. */ void mtree_destroy(struct maple_tree *mt) { mtree_lock(mt); __mt_destroy(mt); mtree_unlock(mt); } EXPORT_SYMBOL(mtree_destroy); /** * mt_find() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search * @max: The maximum value of the search range * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * In case that an entry is found @index is updated to point to the next * possible entry independent whether the found entry is occupying a * single index or a range if indices. * * Return: The entry at or after the @index or %NULL */ void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max) { MA_STATE(mas, mt, *index, *index); void *entry; #ifdef CONFIG_DEBUG_MAPLE_TREE unsigned long copy = *index; #endif trace_ma_read(TP_FCT, &mas); if ((*index) > max) return NULL; rcu_read_lock(); retry: entry = mas_state_walk(&mas); if (mas_is_start(&mas)) goto retry; if (unlikely(xa_is_zero(entry))) entry = NULL; if (entry) goto unlock; while (mas_is_active(&mas) && (mas.last < max)) { entry = mas_next_slot(&mas, max, false); if (likely(entry && !xa_is_zero(entry))) break; } if (unlikely(xa_is_zero(entry))) entry = NULL; unlock: rcu_read_unlock(); if (likely(entry)) { *index = mas.last + 1; #ifdef CONFIG_DEBUG_MAPLE_TREE if (MT_WARN_ON(mt, (*index) && ((*index) <= copy))) pr_err("index not increased! %lx <= %lx\n", *index, copy); #endif } return entry; } EXPORT_SYMBOL(mt_find); /** * mt_find_after() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search * @max: The maximum value to check * * Same as mt_find() except that it checks @index for 0 before * searching. If @index == 0, the search is aborted. This covers a wrap * around of @index to 0 in an iterator loop. * * Return: The entry at or after the @index or %NULL */ void *mt_find_after(struct maple_tree *mt, unsigned long *index, unsigned long max) { if (!(*index)) return NULL; return mt_find(mt, index, max); } EXPORT_SYMBOL(mt_find_after); #ifdef CONFIG_DEBUG_MAPLE_TREE atomic_t maple_tree_tests_run; EXPORT_SYMBOL_GPL(maple_tree_tests_run); atomic_t maple_tree_tests_passed; EXPORT_SYMBOL_GPL(maple_tree_tests_passed); #ifndef __KERNEL__ extern void kmem_cache_set_non_kernel(struct kmem_cache *, unsigned int); void mt_set_non_kernel(unsigned int val) { kmem_cache_set_non_kernel(maple_node_cache, val); } extern void kmem_cache_set_callback(struct kmem_cache *cachep, void (*callback)(void *)); void mt_set_callback(void (*callback)(void *)) { kmem_cache_set_callback(maple_node_cache, callback); } extern void kmem_cache_set_private(struct kmem_cache *cachep, void *private); void mt_set_private(void *private) { kmem_cache_set_private(maple_node_cache, private); } extern unsigned long kmem_cache_get_alloc(struct kmem_cache *); unsigned long mt_get_alloc_size(void) { return kmem_cache_get_alloc(maple_node_cache); } extern void kmem_cache_zero_nr_tallocated(struct kmem_cache *); void mt_zero_nr_tallocated(void) { kmem_cache_zero_nr_tallocated(maple_node_cache); } extern unsigned int kmem_cache_nr_tallocated(struct kmem_cache *); unsigned int mt_nr_tallocated(void) { return kmem_cache_nr_tallocated(maple_node_cache); } extern unsigned int kmem_cache_nr_allocated(struct kmem_cache *); unsigned int mt_nr_allocated(void) { return kmem_cache_nr_allocated(maple_node_cache); } void mt_cache_shrink(void) { } #else /* * mt_cache_shrink() - For testing, don't use this. * * Certain testcases can trigger an OOM when combined with other memory * debugging configuration options. This function is used to reduce the * possibility of an out of memory even due to kmem_cache objects remaining * around for longer than usual. */ void mt_cache_shrink(void) { kmem_cache_shrink(maple_node_cache); } EXPORT_SYMBOL_GPL(mt_cache_shrink); #endif /* not defined __KERNEL__ */ /* * mas_get_slot() - Get the entry in the maple state node stored at @offset. * @mas: The maple state * @offset: The offset into the slot array to fetch. * * Return: The entry stored at @offset. */ static inline struct maple_enode *mas_get_slot(struct ma_state *mas, unsigned char offset) { return mas_slot(mas, ma_slots(mas_mn(mas), mte_node_type(mas->node)), offset); } /* Depth first search, post-order */ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) { struct maple_enode *p, *mn = mas->node; unsigned long p_min, p_max; mas_next_node(mas, mas_mn(mas), max); if (!mas_is_overflow(mas)) return; if (mte_is_root(mn)) return; mas->node = mn; mas_ascend(mas); do { p = mas->node; p_min = mas->min; p_max = mas->max; mas_prev_node(mas, 0); } while (!mas_is_underflow(mas)); mas->node = p; mas->max = p_max; mas->min = p_min; } /* Tree validations */ static void mt_dump_node(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format); static void mt_dump_range(unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { static const char spaces[] = " "; switch(format) { case mt_dump_hex: if (min == max) pr_info("%.*s%lx: ", depth * 2, spaces, min); else pr_info("%.*s%lx-%lx: ", depth * 2, spaces, min, max); break; case mt_dump_dec: if (min == max) pr_info("%.*s%lu: ", depth * 2, spaces, min); else pr_info("%.*s%lu-%lu: ", depth * 2, spaces, min, max); } } static void mt_dump_entry(void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { mt_dump_range(min, max, depth, format); if (xa_is_value(entry)) pr_cont("value %ld (0x%lx) [" PTR_FMT "]\n", xa_to_value(entry), xa_to_value(entry), entry); else if (xa_is_zero(entry)) pr_cont("zero (%ld)\n", xa_to_internal(entry)); else if (mt_is_reserved(entry)) pr_cont("UNKNOWN ENTRY (" PTR_FMT ")\n", entry); else pr_cont(PTR_FMT "\n", entry); } static void mt_dump_range64(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_range_64 *node = &mte_to_node(entry)->mr64; bool leaf = mte_is_leaf(entry); unsigned long first = min; int i; pr_cont(" contents: "); for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) { switch(format) { case mt_dump_hex: pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]); } } pr_cont(PTR_FMT "\n", node->slot[i]); for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) { unsigned long last = max; if (i < (MAPLE_RANGE64_SLOTS - 1)) last = node->pivot[i]; else if (!node->slot[i] && max != mt_node_max(entry)) break; if (last == 0 && i > 0) break; if (leaf) mt_dump_entry(mt_slot(mt, node->slot, i), first, last, depth + 1, format); else if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), first, last, depth + 1, format); if (last == max) break; if (last > max) { switch(format) { case mt_dump_hex: pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } first = last + 1; } } static void mt_dump_arange64(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_arange_64 *node = &mte_to_node(entry)->ma64; unsigned long first = min; int i; pr_cont(" contents: "); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { switch (format) { case mt_dump_hex: pr_cont("%lx ", node->gap[i]); break; case mt_dump_dec: pr_cont("%lu ", node->gap[i]); } } pr_cont("| %02X %02X| ", node->meta.end, node->meta.gap); for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) { switch (format) { case mt_dump_hex: pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]); } } pr_cont(PTR_FMT "\n", node->slot[i]); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { unsigned long last = max; if (i < (MAPLE_ARANGE64_SLOTS - 1)) last = node->pivot[i]; else if (!node->slot[i]) break; if (last == 0 && i > 0) break; if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), first, last, depth + 1, format); if (last == max) break; if (last > max) { switch(format) { case mt_dump_hex: pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } first = last + 1; } } static void mt_dump_node(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_node *node = mte_to_node(entry); unsigned int type = mte_node_type(entry); unsigned int i; mt_dump_range(min, max, depth, format); pr_cont("node " PTR_FMT " depth %d type %d parent " PTR_FMT, node, depth, type, node ? node->parent : NULL); switch (type) { case maple_dense: pr_cont("\n"); for (i = 0; i < MAPLE_NODE_SLOTS; i++) { if (min + i > max) pr_cont("OUT OF RANGE: "); mt_dump_entry(mt_slot(mt, node->slot, i), min + i, min + i, depth, format); } break; case maple_leaf_64: case maple_range_64: mt_dump_range64(mt, entry, min, max, depth, format); break; case maple_arange_64: mt_dump_arange64(mt, entry, min, max, depth, format); break; default: pr_cont(" UNKNOWN TYPE\n"); } } void mt_dump(const struct maple_tree *mt, enum mt_dump_format format) { void *entry = rcu_dereference_check(mt->ma_root, mt_locked(mt)); pr_info("maple_tree(" PTR_FMT ") flags %X, height %u root " PTR_FMT "\n", mt, mt->ma_flags, mt_height(mt), entry); if (xa_is_node(entry)) mt_dump_node(mt, entry, 0, mt_node_max(entry), 0, format); else if (entry) mt_dump_entry(entry, 0, 0, 0, format); else pr_info("(empty)\n"); } EXPORT_SYMBOL_GPL(mt_dump); /* * Calculate the maximum gap in a node and check if that's what is reported in * the parent (unless root). */ static void mas_validate_gaps(struct ma_state *mas) { struct maple_enode *mte = mas->node; struct maple_node *p_mn, *node = mte_to_node(mte); enum maple_type mt = mte_node_type(mas->node); unsigned long gap = 0, max_gap = 0; unsigned long p_end, p_start = mas->min; unsigned char p_slot, offset; unsigned long *gaps = NULL; unsigned long *pivots = ma_pivots(node, mt); unsigned int i; if (ma_is_dense(mt)) { for (i = 0; i < mt_slot_count(mte); i++) { if (mas_get_slot(mas, i)) { if (gap > max_gap) max_gap = gap; gap = 0; continue; } gap++; } goto counted; } gaps = ma_gaps(node, mt); for (i = 0; i < mt_slot_count(mte); i++) { p_end = mas_safe_pivot(mas, pivots, i, mt); if (!gaps) { if (!mas_get_slot(mas, i)) gap = p_end - p_start + 1; } else { void *entry = mas_get_slot(mas, i); gap = gaps[i]; MT_BUG_ON(mas->tree, !entry); if (gap > p_end - p_start + 1) { pr_err(PTR_FMT "[%u] %lu >= %lu - %lu + 1 (%lu)\n", mas_mn(mas), i, gap, p_end, p_start, p_end - p_start + 1); MT_BUG_ON(mas->tree, gap > p_end - p_start + 1); } } if (gap > max_gap) max_gap = gap; p_start = p_end + 1; if (p_end >= mas->max) break; } counted: if (mt == maple_arange_64) { MT_BUG_ON(mas->tree, !gaps); offset = ma_meta_gap(node); if (offset > i) { pr_err("gap offset " PTR_FMT "[%u] is invalid\n", node, offset); MT_BUG_ON(mas->tree, 1); } if (gaps[offset] != max_gap) { pr_err("gap " PTR_FMT "[%u] is not the largest gap %lu\n", node, offset, max_gap); MT_BUG_ON(mas->tree, 1); } for (i++ ; i < mt_slot_count(mte); i++) { if (gaps[i] != 0) { pr_err("gap " PTR_FMT "[%u] beyond node limit != 0\n", node, i); MT_BUG_ON(mas->tree, 1); } } } if (mte_is_root(mte)) return; p_slot = mte_parent_slot(mas->node); p_mn = mte_parent(mte); MT_BUG_ON(mas->tree, max_gap > mas->max); if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) { pr_err("gap " PTR_FMT "[%u] != %lu\n", p_mn, p_slot, max_gap); mt_dump(mas->tree, mt_dump_hex); MT_BUG_ON(mas->tree, 1); } } static void mas_validate_parent_slot(struct ma_state *mas) { struct maple_node *parent; struct maple_enode *node; enum maple_type p_type; unsigned char p_slot; void __rcu **slots; int i; if (mte_is_root(mas->node)) return; p_slot = mte_parent_slot(mas->node); p_type = mas_parent_type(mas, mas->node); parent = mte_parent(mas->node); slots = ma_slots(parent, p_type); MT_BUG_ON(mas->tree, mas_mn(mas) == parent); /* Check prev/next parent slot for duplicate node entry */ for (i = 0; i < mt_slots[p_type]; i++) { node = mas_slot(mas, slots, i); if (i == p_slot) { if (node != mas->node) pr_err("parent " PTR_FMT "[%u] does not have " PTR_FMT "\n", parent, i, mas_mn(mas)); MT_BUG_ON(mas->tree, node != mas->node); } else if (node == mas->node) { pr_err("Invalid child " PTR_FMT " at parent " PTR_FMT "[%u] p_slot %u\n", mas_mn(mas), parent, i, p_slot); MT_BUG_ON(mas->tree, node == mas->node); } } } static void mas_validate_child_slot(struct ma_state *mas) { enum maple_type type = mte_node_type(mas->node); void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mte_to_node(mas->node), type); struct maple_enode *child; unsigned char i; if (mte_is_leaf(mas->node)) return; for (i = 0; i < mt_slots[type]; i++) { child = mas_slot(mas, slots, i); if (!child) { pr_err("Non-leaf node lacks child at " PTR_FMT "[%u]\n", mas_mn(mas), i); MT_BUG_ON(mas->tree, 1); } if (mte_parent_slot(child) != i) { pr_err("Slot error at " PTR_FMT "[%u]: child " PTR_FMT " has pslot %u\n", mas_mn(mas), i, mte_to_node(child), mte_parent_slot(child)); MT_BUG_ON(mas->tree, 1); } if (mte_parent(child) != mte_to_node(mas->node)) { pr_err("child " PTR_FMT " has parent " PTR_FMT " not " PTR_FMT "\n", mte_to_node(child), mte_parent(child), mte_to_node(mas->node)); MT_BUG_ON(mas->tree, 1); } if (i < mt_pivots[type] && pivots[i] == mas->max) break; } } /* * Validate all pivots are within mas->min and mas->max, check metadata ends * where the maximum ends and ensure there is no slots or pivots set outside of * the end of the data. */ static void mas_validate_limits(struct ma_state *mas) { int i; unsigned long prev_piv = 0; enum maple_type type = mte_node_type(mas->node); void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mas_mn(mas), type); for (i = 0; i < mt_slots[type]; i++) { unsigned long piv; piv = mas_safe_pivot(mas, pivots, i, type); if (!piv && (i != 0)) { pr_err("Missing node limit pivot at " PTR_FMT "[%u]", mas_mn(mas), i); MAS_WARN_ON(mas, 1); } if (prev_piv > piv) { pr_err(PTR_FMT "[%u] piv %lu < prev_piv %lu\n", mas_mn(mas), i, piv, prev_piv); MAS_WARN_ON(mas, piv < prev_piv); } if (piv < mas->min) { pr_err(PTR_FMT "[%u] %lu < %lu\n", mas_mn(mas), i, piv, mas->min); MAS_WARN_ON(mas, piv < mas->min); } if (piv > mas->max) { pr_err(PTR_FMT "[%u] %lu > %lu\n", mas_mn(mas), i, piv, mas->max); MAS_WARN_ON(mas, piv > mas->max); } prev_piv = piv; if (piv == mas->max) break; } if (mas_data_end(mas) != i) { pr_err("node" PTR_FMT ": data_end %u != the last slot offset %u\n", mas_mn(mas), mas_data_end(mas), i); MT_BUG_ON(mas->tree, 1); } for (i += 1; i < mt_slots[type]; i++) { void *entry = mas_slot(mas, slots, i); if (entry && (i != mt_slots[type] - 1)) { pr_err(PTR_FMT "[%u] should not have entry " PTR_FMT "\n", mas_mn(mas), i, entry); MT_BUG_ON(mas->tree, entry != NULL); } if (i < mt_pivots[type]) { unsigned long piv = pivots[i]; if (!piv) continue; pr_err(PTR_FMT "[%u] should not have piv %lu\n", mas_mn(mas), i, piv); MAS_WARN_ON(mas, i < mt_pivots[type] - 1); } } } static void mt_validate_nulls(struct maple_tree *mt) { void *entry, *last = (void *)1; unsigned char offset = 0; void __rcu **slots; MA_STATE(mas, mt, 0, 0); mas_start(&mas); if (mas_is_none(&mas) || (mas_is_ptr(&mas))) return; while (!mte_is_leaf(mas.node)) mas_descend(&mas); slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node)); do { entry = mas_slot(&mas, slots, offset); if (!last && !entry) { pr_err("Sequential nulls end at " PTR_FMT "[%u]\n", mas_mn(&mas), offset); } MT_BUG_ON(mt, !last && !entry); last = entry; if (offset == mas_data_end(&mas)) { mas_next_node(&mas, mas_mn(&mas), ULONG_MAX); if (mas_is_overflow(&mas)) return; offset = 0; slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node)); } else { offset++; } } while (!mas_is_overflow(&mas)); } /* * validate a maple tree by checking: * 1. The limits (pivots are within mas->min to mas->max) * 2. The gap is correctly set in the parents */ void mt_validate(struct maple_tree *mt) __must_hold(mas->tree->ma_lock) { unsigned char end; MA_STATE(mas, mt, 0, 0); mas_start(&mas); if (!mas_is_active(&mas)) return; while (!mte_is_leaf(mas.node)) mas_descend(&mas); while (!mas_is_overflow(&mas)) { MAS_WARN_ON(&mas, mte_dead_node(mas.node)); end = mas_data_end(&mas); if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) && (!mte_is_root(mas.node)))) { pr_err("Invalid size %u of " PTR_FMT "\n", end, mas_mn(&mas)); } mas_validate_parent_slot(&mas); mas_validate_limits(&mas); mas_validate_child_slot(&mas); if (mt_is_alloc(mt)) mas_validate_gaps(&mas); mas_dfs_postorder(&mas, ULONG_MAX); } mt_validate_nulls(mt); } EXPORT_SYMBOL_GPL(mt_validate); void mas_dump(const struct ma_state *mas) { pr_err("MAS: tree=" PTR_FMT " enode=" PTR_FMT " ", mas->tree, mas->node); switch (mas->status) { case ma_active: pr_err("(ma_active)"); break; case ma_none: pr_err("(ma_none)"); break; case ma_root: pr_err("(ma_root)"); break; case ma_start: pr_err("(ma_start) "); break; case ma_pause: pr_err("(ma_pause) "); break; case ma_overflow: pr_err("(ma_overflow) "); break; case ma_underflow: pr_err("(ma_underflow) "); break; case ma_error: pr_err("(ma_error) "); break; } pr_err("Store Type: "); switch (mas->store_type) { case wr_invalid: pr_err("invalid store type\n"); break; case wr_new_root: pr_err("new_root\n"); break; case wr_store_root: pr_err("store_root\n"); break; case wr_exact_fit: pr_err("exact_fit\n"); break; case wr_split_store: pr_err("split_store\n"); break; case wr_slot_store: pr_err("slot_store\n"); break; case wr_append: pr_err("append\n"); break; case wr_node_store: pr_err("node_store\n"); break; case wr_spanning_store: pr_err("spanning_store\n"); break; case wr_rebalance: pr_err("rebalance\n"); break; } pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, mas->index, mas->last); pr_err(" min=%lx max=%lx sheaf=" PTR_FMT ", request %lu depth=%u, flags=%x\n", mas->min, mas->max, mas->sheaf, mas->node_request, mas->depth, mas->mas_flags); if (mas->index > mas->last) pr_err("Check index & last\n"); } EXPORT_SYMBOL_GPL(mas_dump); void mas_wr_dump(const struct ma_wr_state *wr_mas) { pr_err("WR_MAS: node=" PTR_FMT " r_min=%lx r_max=%lx\n", wr_mas->node, wr_mas->r_min, wr_mas->r_max); pr_err(" type=%u off_end=%u, node_end=%u, end_piv=%lx\n", wr_mas->type, wr_mas->offset_end, wr_mas->mas->end, wr_mas->end_piv); } EXPORT_SYMBOL_GPL(mas_wr_dump); #endif /* CONFIG_DEBUG_MAPLE_TREE */
18 18 18 18 33 33 18 18 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_proto.c: transport protocol load balancing support for IPVS * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Julian Anastasov <ja@ssi.bg> * * Changes: */ #define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/gfp.h> #include <linux/in.h> #include <linux/ip.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/udp.h> #include <linux/stat.h> #include <linux/proc_fs.h> #include <net/ip_vs.h> /* * IPVS protocols can only be registered/unregistered when the ipvs * module is loaded/unloaded, so no lock is needed in accessing the * ipvs protocol table. */ #define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ #define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; /* States for conn templates: NONE or words separated with ",", max 15 chars */ static const char *ip_vs_ctpl_state_name_table[IP_VS_CTPL_S_LAST] = { [IP_VS_CTPL_S_NONE] = "NONE", [IP_VS_CTPL_S_ASSURED] = "ASSURED", }; /* * register an ipvs protocol */ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) { unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); pp->next = ip_vs_proto_table[hash]; ip_vs_proto_table[hash] = pp; if (pp->init != NULL) pp->init(pp); return 0; } /* * register an ipvs protocols netns related data */ static int register_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_protocol *pp) { unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); struct ip_vs_proto_data *pd = kzalloc_obj(struct ip_vs_proto_data); if (!pd) return -ENOMEM; pd->pp = pp; /* For speed issues */ pd->next = ipvs->proto_data_table[hash]; ipvs->proto_data_table[hash] = pd; atomic_set(&pd->appcnt, 0); /* Init app counter */ if (pp->init_netns != NULL) { int ret = pp->init_netns(ipvs, pd); if (ret) { /* unlink an free proto data */ ipvs->proto_data_table[hash] = pd->next; kfree(pd); return ret; } } return 0; } /* * unregister an ipvs protocol */ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) { struct ip_vs_protocol **pp_p; unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); pp_p = &ip_vs_proto_table[hash]; for (; *pp_p; pp_p = &(*pp_p)->next) { if (*pp_p == pp) { *pp_p = pp->next; if (pp->exit != NULL) pp->exit(pp); return 0; } } return -ESRCH; } /* * unregister an ipvs protocols netns data */ static int unregister_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { struct ip_vs_proto_data **pd_p; unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol); pd_p = &ipvs->proto_data_table[hash]; for (; *pd_p; pd_p = &(*pd_p)->next) { if (*pd_p == pd) { *pd_p = pd->next; if (pd->pp->exit_netns != NULL) pd->pp->exit_netns(ipvs, pd); kfree(pd); return 0; } } return -ESRCH; } /* * get ip_vs_protocol object by its proto. */ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) { struct ip_vs_protocol *pp; unsigned int hash = IP_VS_PROTO_HASH(proto); for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { if (pp->protocol == proto) return pp; } return NULL; } EXPORT_SYMBOL(ip_vs_proto_get); /* * get ip_vs_protocol object data by netns and proto */ struct ip_vs_proto_data * ip_vs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) { struct ip_vs_proto_data *pd; unsigned int hash = IP_VS_PROTO_HASH(proto); for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) { if (pd->pp->protocol == proto) return pd; } return NULL; } EXPORT_SYMBOL(ip_vs_proto_data_get); /* * Propagate event for state change to all protocols */ void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags) { struct ip_vs_proto_data *pd; int i; for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) { if (pd->pp->timeout_change) pd->pp->timeout_change(pd, flags); } } } int * ip_vs_create_timeout_table(int *table, int size) { return kmemdup(table, size, GFP_KERNEL); } const char *ip_vs_state_name(const struct ip_vs_conn *cp) { unsigned int state = cp->state; struct ip_vs_protocol *pp; if (cp->flags & IP_VS_CONN_F_TEMPLATE) { if (state >= IP_VS_CTPL_S_LAST) return "ERR!"; return ip_vs_ctpl_state_name_table[state] ? : "?"; } pp = ip_vs_proto_get(cp->protocol); if (pp == NULL || pp->state_name == NULL) return (cp->protocol == IPPROTO_IP) ? "NONE" : "ERR!"; return pp->state_name(state); } static void ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { char buf[128]; struct iphdr _iph, *ih; ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); if (ih == NULL) sprintf(buf, "TRUNCATED"); else if (ih->frag_off & htons(IP_OFFSET)) sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr); else { __be16 _ports[2], *pptr; pptr = skb_header_pointer(skb, offset + ih->ihl*4, sizeof(_ports), _ports); if (pptr == NULL) sprintf(buf, "TRUNCATED %pI4->%pI4", &ih->saddr, &ih->daddr); else sprintf(buf, "%pI4:%u->%pI4:%u", &ih->saddr, ntohs(pptr[0]), &ih->daddr, ntohs(pptr[1])); } pr_debug("%s: %s %s\n", msg, pp->name, buf); } #ifdef CONFIG_IP_VS_IPV6 static void ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { char buf[192]; struct ipv6hdr _iph, *ih; ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); if (ih == NULL) sprintf(buf, "TRUNCATED"); else if (ih->nexthdr == IPPROTO_FRAGMENT) sprintf(buf, "%pI6c->%pI6c frag", &ih->saddr, &ih->daddr); else { __be16 _ports[2], *pptr; pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), sizeof(_ports), _ports); if (pptr == NULL) sprintf(buf, "TRUNCATED %pI6c->%pI6c", &ih->saddr, &ih->daddr); else sprintf(buf, "%pI6c:%u->%pI6c:%u", &ih->saddr, ntohs(pptr[0]), &ih->daddr, ntohs(pptr[1])); } pr_debug("%s: %s %s\n", msg, pp->name, buf); } #endif void ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); else #endif ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); } /* * per network name-space init */ int __net_init ip_vs_protocol_net_init(struct netns_ipvs *ipvs) { int i, ret; static struct ip_vs_protocol *protos[] = { #ifdef CONFIG_IP_VS_PROTO_TCP &ip_vs_protocol_tcp, #endif #ifdef CONFIG_IP_VS_PROTO_UDP &ip_vs_protocol_udp, #endif #ifdef CONFIG_IP_VS_PROTO_SCTP &ip_vs_protocol_sctp, #endif #ifdef CONFIG_IP_VS_PROTO_AH &ip_vs_protocol_ah, #endif #ifdef CONFIG_IP_VS_PROTO_ESP &ip_vs_protocol_esp, #endif }; for (i = 0; i < ARRAY_SIZE(protos); i++) { ret = register_ip_vs_proto_netns(ipvs, protos[i]); if (ret < 0) goto cleanup; } return 0; cleanup: ip_vs_protocol_net_cleanup(ipvs); return ret; } void __net_exit ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs) { struct ip_vs_proto_data *pd; int i; /* unregister all the ipvs proto data for this netns */ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { while ((pd = ipvs->proto_data_table[i]) != NULL) unregister_ip_vs_proto_netns(ipvs, pd); } } int __init ip_vs_protocol_init(void) { char protocols[64] = { 0 }; #define REGISTER_PROTOCOL(p) \ do { \ register_ip_vs_protocol(p); \ strcat(protocols, ", "); \ strcat(protocols, (p)->name); \ } while (0) #ifdef CONFIG_IP_VS_PROTO_TCP REGISTER_PROTOCOL(&ip_vs_protocol_tcp); #endif #ifdef CONFIG_IP_VS_PROTO_UDP REGISTER_PROTOCOL(&ip_vs_protocol_udp); #endif #ifdef CONFIG_IP_VS_PROTO_SCTP REGISTER_PROTOCOL(&ip_vs_protocol_sctp); #endif #ifdef CONFIG_IP_VS_PROTO_AH REGISTER_PROTOCOL(&ip_vs_protocol_ah); #endif #ifdef CONFIG_IP_VS_PROTO_ESP REGISTER_PROTOCOL(&ip_vs_protocol_esp); #endif pr_info("Registered protocols (%s)\n", &protocols[2]); return 0; } void ip_vs_protocol_cleanup(void) { struct ip_vs_protocol *pp; int i; /* unregister all the ipvs protocols */ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { while ((pp = ip_vs_proto_table[i]) != NULL) unregister_ip_vs_protocol(pp); } }
7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 1 7 7 7 7 7 7 6 7 7 7 7 5 5 5 5 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 49 49 49 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 // SPDX-License-Identifier: GPL-2.0 /* * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler, * for the blk-mq scheduling framework * * Copyright (C) 2016 Jens Axboe <axboe@kernel.dk> */ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/compiler.h> #include <linux/rbtree.h> #include <linux/sbitmap.h> #include <trace/events/block.h> #include "elevator.h" #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" /* * See Documentation/block/deadline-iosched.rst */ static const int read_expire = HZ / 2; /* max time before a read is submitted. */ static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ /* * Time after which to dispatch lower priority requests even if higher * priority requests are pending. */ static const int prio_aging_expire = 10 * HZ; static const int writes_starved = 2; /* max times reads can starve a write */ static const int fifo_batch = 16; /* # of sequential requests treated as one by the above parameters. For throughput. */ enum dd_data_dir { DD_READ = READ, DD_WRITE = WRITE, }; enum { DD_DIR_COUNT = 2 }; enum dd_prio { DD_RT_PRIO = 0, DD_BE_PRIO = 1, DD_IDLE_PRIO = 2, DD_PRIO_MAX = 2, }; enum { DD_PRIO_COUNT = 3 }; /* * I/O statistics per I/O priority. It is fine if these counters overflow. * What matters is that these counters are at least as wide as * log2(max_outstanding_requests). */ struct io_stats_per_prio { uint32_t inserted; uint32_t merged; uint32_t dispatched; atomic_t completed; }; /* * Deadline scheduler data per I/O priority (enum dd_prio). Requests are * present on both sort_list[] and fifo_list[]. */ struct dd_per_prio { struct rb_root sort_list[DD_DIR_COUNT]; struct list_head fifo_list[DD_DIR_COUNT]; /* Position of the most recently dispatched request. */ sector_t latest_pos[DD_DIR_COUNT]; struct io_stats_per_prio stats; }; struct deadline_data { /* * run time data */ struct list_head dispatch; struct dd_per_prio per_prio[DD_PRIO_COUNT]; /* Data direction of latest dispatched request. */ enum dd_data_dir last_dir; unsigned int batching; /* number of sequential requests made */ unsigned int starved; /* times reads have starved writes */ /* * settings that change how the i/o scheduler behaves */ int fifo_expire[DD_DIR_COUNT]; int fifo_batch; int writes_starved; int front_merges; int prio_aging_expire; spinlock_t lock; }; /* Maps an I/O priority class to a deadline scheduler priority. */ static const enum dd_prio ioprio_class_to_prio[] = { [IOPRIO_CLASS_NONE] = DD_BE_PRIO, [IOPRIO_CLASS_RT] = DD_RT_PRIO, [IOPRIO_CLASS_BE] = DD_BE_PRIO, [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO, }; static inline struct rb_root * deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq) { return &per_prio->sort_list[rq_data_dir(rq)]; } /* * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a * request. */ static u8 dd_rq_ioclass(struct request *rq) { return IOPRIO_PRIO_CLASS(req_get_ioprio(rq)); } /* * Return the first request for which blk_rq_pos() >= @pos. */ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, enum dd_data_dir data_dir, sector_t pos) { struct rb_node *node = per_prio->sort_list[data_dir].rb_node; struct request *rq, *res = NULL; while (node) { rq = rb_entry_rq(node); if (blk_rq_pos(rq) >= pos) { res = rq; node = node->rb_left; } else { node = node->rb_right; } } return res; } static void deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { struct rb_root *root = deadline_rb_root(per_prio, rq); elv_rb_add(root, rq); } static inline void deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { elv_rb_del(deadline_rb_root(per_prio, rq), rq); } /* * remove rq from rbtree and fifo. */ static void deadline_remove_request(struct request_queue *q, struct dd_per_prio *per_prio, struct request *rq) { list_del_init(&rq->queuelist); /* * We might not be on the rbtree, if we are doing an insert merge */ if (!RB_EMPTY_NODE(&rq->rb_node)) deadline_del_rq_rb(per_prio, rq); elv_rqhash_del(q, rq); if (q->last_merge == rq) q->last_merge = NULL; } static void dd_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = dd_rq_ioclass(req); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; struct dd_per_prio *per_prio = &dd->per_prio[prio]; /* * if the merge was a front merge, we need to reposition request */ if (type == ELEVATOR_FRONT_MERGE) { elv_rb_del(deadline_rb_root(per_prio, req), req); deadline_add_rq_rb(per_prio, req); } } /* * Callback function that is invoked after @next has been merged into @req. */ static void dd_merged_requests(struct request_queue *q, struct request *req, struct request *next) { struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = dd_rq_ioclass(next); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; lockdep_assert_held(&dd->lock); dd->per_prio[prio].stats.merged++; /* * if next expires before rq, assign its expire time to rq * and move into next position (next will be deleted) in fifo */ if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { if (time_before((unsigned long)next->fifo_time, (unsigned long)req->fifo_time)) { list_move(&req->queuelist, &next->queuelist); req->fifo_time = next->fifo_time; } } /* * kill knowledge of next, this one is a goner */ deadline_remove_request(q, &dd->per_prio[prio], next); } /* * move an entry to dispatch queue */ static void deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, struct request *rq) { /* * take it off the sort and fifo list */ deadline_remove_request(rq->q, per_prio, rq); } /* Number of requests queued for a given priority level. */ static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) { const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats; lockdep_assert_held(&dd->lock); return stats->inserted - atomic_read(&stats->completed); } /* * deadline_check_fifo returns true if and only if there are expired requests * in the FIFO list. Requires !list_empty(&dd->fifo_list[data_dir]). */ static inline bool deadline_check_fifo(struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); return time_is_before_eq_jiffies((unsigned long)rq->fifo_time); } /* * For the specified data direction, return the next request to * dispatch using arrival ordered lists. */ static struct request * deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { if (list_empty(&per_prio->fifo_list[data_dir])) return NULL; return rq_entry_fifo(per_prio->fifo_list[data_dir].next); } /* * For the specified data direction, return the next request to * dispatch using sector position sorted lists. */ static struct request * deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { return deadline_from_pos(per_prio, data_dir, per_prio->latest_pos[data_dir]); } /* * Returns true if and only if @rq started after @latest_start where * @latest_start is in jiffies. */ static bool started_after(struct deadline_data *dd, struct request *rq, unsigned long latest_start) { unsigned long start_time = (unsigned long)rq->fifo_time; start_time -= dd->fifo_expire[rq_data_dir(rq)]; return time_after(start_time, latest_start); } static struct request *dd_start_request(struct deadline_data *dd, enum dd_data_dir data_dir, struct request *rq) { u8 ioprio_class = dd_rq_ioclass(rq); enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq); dd->per_prio[prio].stats.dispatched++; rq->rq_flags |= RQF_STARTED; return rq; } /* * deadline_dispatch_requests selects the best request according to * read/write expire, fifo_batch, etc and with a start time <= @latest_start. */ static struct request *__dd_dispatch_request(struct deadline_data *dd, struct dd_per_prio *per_prio, unsigned long latest_start) { struct request *rq, *next_rq; enum dd_data_dir data_dir; lockdep_assert_held(&dd->lock); /* * batches are currently reads XOR writes */ rq = deadline_next_request(dd, per_prio, dd->last_dir); if (rq && dd->batching < dd->fifo_batch) { /* we have a next request and are still entitled to batch */ data_dir = rq_data_dir(rq); goto dispatch_request; } /* * at this point we are not running a batch. select the appropriate * data direction (read / write) */ if (!list_empty(&per_prio->fifo_list[DD_READ])) { BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ])); if (deadline_fifo_request(dd, per_prio, DD_WRITE) && (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; data_dir = DD_READ; goto dispatch_find_request; } /* * there are either no reads or writes have been starved */ if (!list_empty(&per_prio->fifo_list[DD_WRITE])) { dispatch_writes: BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE])); dd->starved = 0; data_dir = DD_WRITE; goto dispatch_find_request; } return NULL; dispatch_find_request: /* * we are not running a batch, find best request for selected data_dir */ next_rq = deadline_next_request(dd, per_prio, data_dir); if (deadline_check_fifo(per_prio, data_dir) || !next_rq) { /* * A deadline has expired, the last request was in the other * direction, or we have run out of higher-sectored requests. * Start again from the request with the earliest expiry time. */ rq = deadline_fifo_request(dd, per_prio, data_dir); } else { /* * The last req was the same dir and we have a next request in * sort order. No expired requests so continue on from here. */ rq = next_rq; } if (!rq) return NULL; dd->last_dir = data_dir; dd->batching = 0; dispatch_request: if (started_after(dd, rq, latest_start)) return NULL; /* * rq is the selected appropriate request. */ dd->batching++; deadline_move_request(dd, per_prio, rq); return dd_start_request(dd, data_dir, rq); } /* * Check whether there are any requests with priority other than DD_RT_PRIO * that were inserted more than prio_aging_expire jiffies ago. */ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, unsigned long now) { struct request *rq; enum dd_prio prio; int prio_cnt; lockdep_assert_held(&dd->lock); prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) + !!dd_queued(dd, DD_IDLE_PRIO); if (prio_cnt < 2) return NULL; for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) { rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now - dd->prio_aging_expire); if (rq) return rq; } return NULL; } /* * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). * * One confusing aspect here is that we get called for a specific * hardware queue, but we may return a request that is for a * different hardware queue. This is because mq-deadline has shared * state for all hardware queues, in terms of sorting, FIFOs, etc. */ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; const unsigned long now = jiffies; struct request *rq; enum dd_prio prio; spin_lock(&dd->lock); if (!list_empty(&dd->dispatch)) { rq = list_first_entry(&dd->dispatch, struct request, queuelist); list_del_init(&rq->queuelist); dd_start_request(dd, rq_data_dir(rq), rq); goto unlock; } rq = dd_dispatch_prio_aged_requests(dd, now); if (rq) goto unlock; /* * Next, dispatch requests in priority order. Ignore lower priority * requests if any higher priority requests are pending. */ for (prio = 0; prio <= DD_PRIO_MAX; prio++) { rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now); if (rq || dd_queued(dd, prio)) break; } unlock: spin_unlock(&dd->lock); return rq; } static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { if (!blk_mq_is_sync_read(opf)) data->shallow_depth = data->q->async_depth; } /* Called by blk_mq_init_sched() and blk_mq_update_nr_requests(). */ static void dd_depth_updated(struct request_queue *q) { blk_mq_set_min_shallow_depth(q, q->async_depth); } static void dd_exit_sched(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; enum dd_prio prio; for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; const struct io_stats_per_prio *stats = &per_prio->stats; uint32_t queued; WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ])); WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); spin_lock(&dd->lock); queued = dd_queued(dd, prio); spin_unlock(&dd->lock); WARN_ONCE(queued != 0, "statistics for priority %d: i %u m %u d %u c %u\n", prio, stats->inserted, stats->merged, stats->dispatched, atomic_read(&stats->completed)); } kfree(dd); } /* * initialize elevator private data (deadline_data). */ static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq) { struct deadline_data *dd; enum dd_prio prio; dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); if (!dd) return -ENOMEM; eq->elevator_data = dd; INIT_LIST_HEAD(&dd->dispatch); for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]); INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]); per_prio->sort_list[DD_READ] = RB_ROOT; per_prio->sort_list[DD_WRITE] = RB_ROOT; } dd->fifo_expire[DD_READ] = read_expire; dd->fifo_expire[DD_WRITE] = write_expire; dd->writes_starved = writes_starved; dd->front_merges = 1; dd->last_dir = DD_WRITE; dd->fifo_batch = fifo_batch; dd->prio_aging_expire = prio_aging_expire; spin_lock_init(&dd->lock); /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); q->elevator = eq; q->async_depth = q->nr_requests; dd_depth_updated(q); return 0; } /* * Try to merge @bio into an existing request. If @bio has been merged into * an existing request, store the pointer to that request into *@rq. */ static int dd_request_merge(struct request_queue *q, struct request **rq, struct bio *bio) { struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; struct dd_per_prio *per_prio = &dd->per_prio[prio]; sector_t sector = bio_end_sector(bio); struct request *__rq; if (!dd->front_merges) return ELEVATOR_NO_MERGE; __rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector); if (__rq) { BUG_ON(sector != blk_rq_pos(__rq)); if (elv_bio_merge_ok(__rq, bio)) { *rq = __rq; if (blk_discard_mergable(__rq)) return ELEVATOR_DISCARD_MERGE; return ELEVATOR_FRONT_MERGE; } } return ELEVATOR_NO_MERGE; } /* * Attempt to merge a bio into an existing request. This function is called * before @bio is associated with a request. */ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct deadline_data *dd = q->elevator->elevator_data; struct request *free = NULL; bool ret; spin_lock(&dd->lock); ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); spin_unlock(&dd->lock); if (free) blk_mq_free_request(free); return ret; } /* * add rq to rbtree and fifo */ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_insert_t flags, struct list_head *free) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; const enum dd_data_dir data_dir = rq_data_dir(rq); u16 ioprio = req_get_ioprio(rq); u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio); struct dd_per_prio *per_prio; enum dd_prio prio; lockdep_assert_held(&dd->lock); prio = ioprio_class_to_prio[ioprio_class]; per_prio = &dd->per_prio[prio]; if (!rq->elv.priv[0]) per_prio->stats.inserted++; rq->elv.priv[0] = per_prio; if (blk_mq_sched_try_insert_merge(q, rq, free)) return; trace_block_rq_insert(rq); if (flags & BLK_MQ_INSERT_AT_HEAD) { list_add(&rq->queuelist, &dd->dispatch); rq->fifo_time = jiffies; } else { deadline_add_rq_rb(per_prio, rq); if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); if (!q->last_merge) q->last_merge = rq; } /* * set expire time and add to fifo list */ rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]); } } /* * Called from blk_mq_insert_request() or blk_mq_dispatch_list(). */ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list, blk_insert_t flags) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; LIST_HEAD(free); spin_lock(&dd->lock); while (!list_empty(list)) { struct request *rq; rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); dd_insert_request(hctx, rq, flags, &free); } spin_unlock(&dd->lock); blk_mq_free_requests(&free); } /* Callback from inside blk_mq_rq_ctx_init(). */ static void dd_prepare_request(struct request *rq) { rq->elv.priv[0] = NULL; } /* * Callback from inside blk_mq_free_request(). */ static void dd_finish_request(struct request *rq) { struct dd_per_prio *per_prio = rq->elv.priv[0]; /* * The block layer core may call dd_finish_request() without having * called dd_insert_requests(). Skip requests that bypassed I/O * scheduling. See also blk_mq_request_bypass_insert(). */ if (per_prio) atomic_inc(&per_prio->stats.completed); } static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) { return !list_empty_careful(&per_prio->fifo_list[DD_READ]) || !list_empty_careful(&per_prio->fifo_list[DD_WRITE]); } static bool dd_has_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; enum dd_prio prio; if (!list_empty_careful(&dd->dispatch)) return true; for (prio = 0; prio <= DD_PRIO_MAX; prio++) if (dd_has_work_for_prio(&dd->per_prio[prio])) return true; return false; } /* * sysfs parts below */ #define SHOW_INT(__FUNC, __VAR) \ static ssize_t __FUNC(struct elevator_queue *e, char *page) \ { \ struct deadline_data *dd = e->elevator_data; \ \ return sysfs_emit(page, "%d\n", __VAR); \ } #define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire); SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_front_merges_show, dd->front_merges); SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); #undef SHOW_INT #undef SHOW_JIFFIES #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct deadline_data *dd = e->elevator_data; \ int __data, __ret; \ \ __ret = kstrtoint(page, 0, &__data); \ if (__ret < 0) \ return __ret; \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ __data = (MAX); \ *(__PTR) = __CONV(__data); \ return count; \ } #define STORE_INT(__FUNC, __PTR, MIN, MAX) \ STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, ) #define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX) \ STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #undef STORE_FUNCTION #undef STORE_INT #undef STORE_JIFFIES #define DD_ATTR(name) \ __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) static const struct elv_fs_entry deadline_attrs[] = { DD_ATTR(read_expire), DD_ATTR(write_expire), DD_ATTR(writes_starved), DD_ATTR(front_merges), DD_ATTR(fifo_batch), DD_ATTR(prio_aging_expire), __ATTR_NULL }; #ifdef CONFIG_BLK_DEBUG_FS #define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \ static void *deadline_##name##_fifo_start(struct seq_file *m, \ loff_t *pos) \ __acquires(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ spin_lock(&dd->lock); \ return seq_list_start(&per_prio->fifo_list[data_dir], *pos); \ } \ \ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ loff_t *pos) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ return seq_list_next(v, &per_prio->fifo_list[data_dir], pos); \ } \ \ static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \ __releases(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ \ spin_unlock(&dd->lock); \ } \ \ static const struct seq_operations deadline_##name##_fifo_seq_ops = { \ .start = deadline_##name##_fifo_start, \ .next = deadline_##name##_fifo_next, \ .stop = deadline_##name##_fifo_stop, \ .show = blk_mq_debugfs_rq_show, \ }; \ \ static int deadline_##name##_next_rq_show(void *data, \ struct seq_file *m) \ { \ struct request_queue *q = data; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ struct request *rq; \ \ rq = deadline_from_pos(per_prio, data_dir, \ per_prio->latest_pos[data_dir]); \ if (rq) \ __blk_mq_debugfs_rq_show(m, rq); \ return 0; \ } DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2); #undef DEADLINE_DEBUGFS_DDIR_ATTRS static int deadline_batching_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; seq_printf(m, "%u\n", dd->batching); return 0; } static int deadline_starved_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; seq_printf(m, "%u\n", dd->starved); return 0; } static int dd_queued_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; u32 rt, be, idle; spin_lock(&dd->lock); rt = dd_queued(dd, DD_RT_PRIO); be = dd_queued(dd, DD_BE_PRIO); idle = dd_queued(dd, DD_IDLE_PRIO); spin_unlock(&dd->lock); seq_printf(m, "%u %u %u\n", rt, be, idle); return 0; } /* Number of requests owned by the block driver for a given priority. */ static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio) { const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats; lockdep_assert_held(&dd->lock); return stats->dispatched + stats->merged - atomic_read(&stats->completed); } static int dd_owned_by_driver_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; u32 rt, be, idle; spin_lock(&dd->lock); rt = dd_owned_by_driver(dd, DD_RT_PRIO); be = dd_owned_by_driver(dd, DD_BE_PRIO); idle = dd_owned_by_driver(dd, DD_IDLE_PRIO); spin_unlock(&dd->lock); seq_printf(m, "%u %u %u\n", rt, be, idle); return 0; } static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos) __acquires(&dd->lock) { struct request_queue *q = m->private; struct deadline_data *dd = q->elevator->elevator_data; spin_lock(&dd->lock); return seq_list_start(&dd->dispatch, *pos); } static void *deadline_dispatch_next(struct seq_file *m, void *v, loff_t *pos) { struct request_queue *q = m->private; struct deadline_data *dd = q->elevator->elevator_data; return seq_list_next(v, &dd->dispatch, pos); } static void deadline_dispatch_stop(struct seq_file *m, void *v) __releases(&dd->lock) { struct request_queue *q = m->private; struct deadline_data *dd = q->elevator->elevator_data; spin_unlock(&dd->lock); } static const struct seq_operations deadline_dispatch_seq_ops = { .start = deadline_dispatch_start, .next = deadline_dispatch_next, .stop = deadline_dispatch_stop, .show = blk_mq_debugfs_rq_show, }; #define DEADLINE_QUEUE_DDIR_ATTRS(name) \ {#name "_fifo_list", 0400, \ .seq_ops = &deadline_##name##_fifo_seq_ops} #define DEADLINE_NEXT_RQ_ATTR(name) \ {#name "_next_rq", 0400, deadline_##name##_next_rq_show} static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { DEADLINE_QUEUE_DDIR_ATTRS(read0), DEADLINE_QUEUE_DDIR_ATTRS(write0), DEADLINE_QUEUE_DDIR_ATTRS(read1), DEADLINE_QUEUE_DDIR_ATTRS(write1), DEADLINE_QUEUE_DDIR_ATTRS(read2), DEADLINE_QUEUE_DDIR_ATTRS(write2), DEADLINE_NEXT_RQ_ATTR(read0), DEADLINE_NEXT_RQ_ATTR(write0), DEADLINE_NEXT_RQ_ATTR(read1), DEADLINE_NEXT_RQ_ATTR(write1), DEADLINE_NEXT_RQ_ATTR(read2), DEADLINE_NEXT_RQ_ATTR(write2), {"batching", 0400, deadline_batching_show}, {"starved", 0400, deadline_starved_show}, {"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops}, {"owned_by_driver", 0400, dd_owned_by_driver_show}, {"queued", 0400, dd_queued_show}, {}, }; #undef DEADLINE_QUEUE_DDIR_ATTRS #endif static struct elevator_type mq_deadline = { .ops = { .depth_updated = dd_depth_updated, .limit_depth = dd_limit_depth, .insert_requests = dd_insert_requests, .dispatch_request = dd_dispatch_request, .prepare_request = dd_prepare_request, .finish_request = dd_finish_request, .next_request = elv_rb_latter_request, .former_request = elv_rb_former_request, .bio_merge = dd_bio_merge, .request_merge = dd_request_merge, .requests_merged = dd_merged_requests, .request_merged = dd_request_merged, .has_work = dd_has_work, .init_sched = dd_init_sched, .exit_sched = dd_exit_sched, }, #ifdef CONFIG_BLK_DEBUG_FS .queue_debugfs_attrs = deadline_queue_debugfs_attrs, #endif .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", .elevator_alias = "deadline", .elevator_owner = THIS_MODULE, }; MODULE_ALIAS("mq-deadline-iosched"); static int __init deadline_init(void) { return elv_register(&mq_deadline); } static void __exit deadline_exit(void) { elv_unregister(&mq_deadline); } module_init(deadline_init); module_exit(deadline_exit); MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MQ deadline IO scheduler");
7 31 7 7 7 7 22 7 25 2 1 22 7 7 7 407 6 405 554 557 556 411 9 304 383 7 130 556 556 301 31 1 3 26 18 31 265 4 4 160 122 4 10 10 7 7 359 2 5 11 4 1 301 275 26 82 49 40 69 21 32 62 27 49 87 15 100 67 69 19 9 8 120 51 41 35 30 27 4 138 213 97 3 95 95 6 80 17 63 36 95 17 81 7 48 3 53 62 38 76 22 89 7 89 8 5 95 1 93 95 95 95 64 81 16 64 36 85 9 82 6 86 12 1 2 81 28 2 81 81 23 72 27 50 46 65 30 82 13 84 11 88 7 88 7 88 29 5 39 23 38 23 28 4 13 2 6 38 6 38 57 27 77 5 5 77 70 9 3 10 2 32 51 45 36 2 34 13 20 17 12 9 20 29 23 6 13 16 4 12 12 5 1 2 2 10 42 103 61 2 8 62 62 109 111 45 51 50 1 2 50 51 13 2 2 9 8 13 1 2 10 9 2 2 2 2 2 2 15 1 1 13 2 25 25 16 18 18 6 16 3 4 11 11 11 20 20 27 26 2 11 6 9 6 2 1 2 1 2 3 7 7 38 2 37 35 17 3 14 15 19 45 25 7 38 183 183 183 183 183 182 183 29 161 176 82 21 77 82 70 21 21 158 24 1 149 51 2 1 1 1 45 35 10 45 26 20 38 4 42 44 2 46 5 16 14 2 1 1 10 7 3 10 6 4 7 6 4 4 10 20 3 8 3 2 3 9 23 2 1 1 12 6 15 6 9 15 9 6 1 14 3 1 1 1 9 4 8 4 11 11 11 11 10 8 8 5 383 344 343 5 6 5 2 9 12 9 9 9 9 12 12 7 7 7 12 9 9 9 6 9 12 12 7 7 7 4 12 5 1 216 216 137 69 6 70 16 190 190 175 174 3 10 99 98 76 31 99 41 26 9 12 41 41 41 41 26 9 12 41 35 17 18 1 1 1 1 13 2 2 2 14 14 4 12 15 11 4 15 5 14 15 14 14 7 14 12 12 7 407 1 1 407 18 384 371 8 2 1 1 4 5 1 4 4 4 1 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/key/af_key.c An implementation of PF_KEYv2 sockets. * * Authors: Maxim Giryaev <gem@asplinux.ru> * David S. Miller <davem@redhat.com> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * Kazunori MIYAZAWA / USAGI Project <miyazawa@linux-ipv6.org> * Derek Atkins <derek@ihtfp.com> */ #include <linux/capability.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/socket.h> #include <linux/pfkeyv2.h> #include <linux/ipsec.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/xfrm.h> #include <net/sock.h> #define _X2KEY(x) ((x) == XFRM_INF ? 0 : (x)) #define _KEY2X(x) ((x) == 0 ? XFRM_INF : (x)) static unsigned int pfkey_net_id __read_mostly; struct netns_pfkey { /* List of all pfkey sockets. */ struct hlist_head table; atomic_t socks_nr; }; static DEFINE_MUTEX(pfkey_mutex); #define DUMMY_MARK 0 static const struct xfrm_mark dummy_mark = {0, 0}; struct pfkey_sock { /* struct sock must be the first member of struct pfkey_sock */ struct sock sk; int registered; int promisc; struct { uint8_t msg_version; uint32_t msg_portid; int (*dump)(struct pfkey_sock *sk); void (*done)(struct pfkey_sock *sk); union { struct xfrm_policy_walk policy; struct xfrm_state_walk state; } u; struct sk_buff *skb; } dump; struct mutex dump_lock; }; static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, xfrm_address_t *saddr, xfrm_address_t *daddr, u16 *family); static inline struct pfkey_sock *pfkey_sk(struct sock *sk) { return (struct pfkey_sock *)sk; } static int pfkey_can_dump(const struct sock *sk) { if (3 * atomic_read(&sk->sk_rmem_alloc) <= 2 * sk->sk_rcvbuf) return 1; return 0; } static void pfkey_terminate_dump(struct pfkey_sock *pfk) { if (pfk->dump.dump) { if (pfk->dump.skb) { kfree_skb(pfk->dump.skb); pfk->dump.skb = NULL; } pfk->dump.done(pfk); pfk->dump.dump = NULL; pfk->dump.done = NULL; } } static void pfkey_sock_destruct(struct sock *sk) { struct net *net = sock_net(sk); struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); pfkey_terminate_dump(pfkey_sk(sk)); skb_queue_purge(&sk->sk_receive_queue); if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Attempt to release alive pfkey socket: %p\n", sk); return; } WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); atomic_dec(&net_pfkey->socks_nr); } static const struct proto_ops pfkey_ops; static void pfkey_insert(struct sock *sk) { struct net *net = sock_net(sk); struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); mutex_lock(&pfkey_mutex); sk_add_node_rcu(sk, &net_pfkey->table); mutex_unlock(&pfkey_mutex); } static void pfkey_remove(struct sock *sk) { mutex_lock(&pfkey_mutex); sk_del_node_init_rcu(sk); mutex_unlock(&pfkey_mutex); } static struct proto key_proto = { .name = "KEY", .owner = THIS_MODULE, .obj_size = sizeof(struct pfkey_sock), }; static int pfkey_create(struct net *net, struct socket *sock, int protocol, int kern) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); struct sock *sk; struct pfkey_sock *pfk; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (protocol != PF_KEY_V2) return -EPROTONOSUPPORT; sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto, kern); if (sk == NULL) return -ENOMEM; pfk = pfkey_sk(sk); mutex_init(&pfk->dump_lock); sock->ops = &pfkey_ops; sock_init_data(sock, sk); sk->sk_family = PF_KEY; sk->sk_destruct = pfkey_sock_destruct; atomic_inc(&net_pfkey->socks_nr); pfkey_insert(sk); return 0; } static int pfkey_release(struct socket *sock) { struct sock *sk = sock->sk; if (!sk) return 0; pfkey_remove(sk); sock_orphan(sk); sock->sk = NULL; skb_queue_purge(&sk->sk_write_queue); synchronize_rcu(); sock_put(sk); return 0; } static int pfkey_broadcast_one(struct sk_buff *skb, gfp_t allocation, struct sock *sk) { int err = -ENOBUFS; if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) return err; skb = skb_clone(skb, allocation); if (skb) { skb_set_owner_r(skb, sk); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); err = 0; } return err; } /* Send SKB to all pfkey sockets matching selected criteria. */ #define BROADCAST_ALL 0 #define BROADCAST_ONE 1 #define BROADCAST_REGISTERED 2 #define BROADCAST_PROMISC_ONLY 4 static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, int broadcast_flags, struct sock *one_sk, struct net *net) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); struct sock *sk; int err = -ESRCH; /* XXX Do we need something like netlink_overrun? I think * XXX PF_KEY socket apps will not mind current behavior. */ if (!skb) return -ENOMEM; rcu_read_lock(); sk_for_each_rcu(sk, &net_pfkey->table) { struct pfkey_sock *pfk = pfkey_sk(sk); int err2; /* Yes, it means that if you are meant to receive this * pfkey message you receive it twice as promiscuous * socket. */ if (pfk->promisc) pfkey_broadcast_one(skb, GFP_ATOMIC, sk); /* the exact target will be processed later */ if (sk == one_sk) continue; if (broadcast_flags != BROADCAST_ALL) { if (broadcast_flags & BROADCAST_PROMISC_ONLY) continue; if ((broadcast_flags & BROADCAST_REGISTERED) && !pfk->registered) continue; if (broadcast_flags & BROADCAST_ONE) continue; } err2 = pfkey_broadcast_one(skb, GFP_ATOMIC, sk); /* Error is cleared after successful sending to at least one * registered KM */ if ((broadcast_flags & BROADCAST_REGISTERED) && err) err = err2; } rcu_read_unlock(); if (one_sk != NULL) err = pfkey_broadcast_one(skb, allocation, one_sk); kfree_skb(skb); return err; } static int pfkey_do_dump(struct pfkey_sock *pfk) { struct sadb_msg *hdr; int rc; mutex_lock(&pfk->dump_lock); if (!pfk->dump.dump) { rc = 0; goto out; } rc = pfk->dump.dump(pfk); if (rc == -ENOBUFS) { rc = 0; goto out; } if (pfk->dump.skb) { if (!pfkey_can_dump(&pfk->sk)) { rc = 0; goto out; } hdr = (struct sadb_msg *) pfk->dump.skb->data; hdr->sadb_msg_seq = 0; hdr->sadb_msg_errno = rc; pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = NULL; } pfkey_terminate_dump(pfk); out: mutex_unlock(&pfk->dump_lock); return rc; } static inline void pfkey_hdr_dup(struct sadb_msg *new, const struct sadb_msg *orig) { *new = *orig; } static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk) { struct sk_buff *skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL); struct sadb_msg *hdr; if (!skb) return -ENOBUFS; /* Woe be to the platform trying to support PFKEY yet * having normal errnos outside the 1-255 range, inclusive. */ err = -err; if (err == ERESTARTSYS || err == ERESTARTNOHAND || err == ERESTARTNOINTR) err = EINTR; if (err >= 512) err = EINVAL; BUG_ON(err <= 0 || err >= 256); hdr = skb_put(skb, sizeof(struct sadb_msg)); pfkey_hdr_dup(hdr, orig); hdr->sadb_msg_errno = (uint8_t) err; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk, sock_net(sk)); return 0; } static const u8 sadb_ext_min_len[] = { [SADB_EXT_RESERVED] = (u8) 0, [SADB_EXT_SA] = (u8) sizeof(struct sadb_sa), [SADB_EXT_LIFETIME_CURRENT] = (u8) sizeof(struct sadb_lifetime), [SADB_EXT_LIFETIME_HARD] = (u8) sizeof(struct sadb_lifetime), [SADB_EXT_LIFETIME_SOFT] = (u8) sizeof(struct sadb_lifetime), [SADB_EXT_ADDRESS_SRC] = (u8) sizeof(struct sadb_address), [SADB_EXT_ADDRESS_DST] = (u8) sizeof(struct sadb_address), [SADB_EXT_ADDRESS_PROXY] = (u8) sizeof(struct sadb_address), [SADB_EXT_KEY_AUTH] = (u8) sizeof(struct sadb_key), [SADB_EXT_KEY_ENCRYPT] = (u8) sizeof(struct sadb_key), [SADB_EXT_IDENTITY_SRC] = (u8) sizeof(struct sadb_ident), [SADB_EXT_IDENTITY_DST] = (u8) sizeof(struct sadb_ident), [SADB_EXT_SENSITIVITY] = (u8) sizeof(struct sadb_sens), [SADB_EXT_PROPOSAL] = (u8) sizeof(struct sadb_prop), [SADB_EXT_SUPPORTED_AUTH] = (u8) sizeof(struct sadb_supported), [SADB_EXT_SUPPORTED_ENCRYPT] = (u8) sizeof(struct sadb_supported), [SADB_EXT_SPIRANGE] = (u8) sizeof(struct sadb_spirange), [SADB_X_EXT_KMPRIVATE] = (u8) sizeof(struct sadb_x_kmprivate), [SADB_X_EXT_POLICY] = (u8) sizeof(struct sadb_x_policy), [SADB_X_EXT_SA2] = (u8) sizeof(struct sadb_x_sa2), [SADB_X_EXT_NAT_T_TYPE] = (u8) sizeof(struct sadb_x_nat_t_type), [SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port), [SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port), [SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address), [SADB_X_EXT_SEC_CTX] = (u8) sizeof(struct sadb_x_sec_ctx), [SADB_X_EXT_KMADDRESS] = (u8) sizeof(struct sadb_x_kmaddress), [SADB_X_EXT_FILTER] = (u8) sizeof(struct sadb_x_filter), }; /* Verify sadb_address_{len,prefixlen} against sa_family. */ static int verify_address_len(const void *p) { const struct sadb_address *sp = p; const struct sockaddr *addr = (const struct sockaddr *)(sp + 1); const struct sockaddr_in *sin; #if IS_ENABLED(CONFIG_IPV6) const struct sockaddr_in6 *sin6; #endif int len; if (sp->sadb_address_len < DIV_ROUND_UP(sizeof(*sp) + offsetofend(typeof(*addr), sa_family), sizeof(uint64_t))) return -EINVAL; switch (addr->sa_family) { case AF_INET: len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin), sizeof(uint64_t)); if (sp->sadb_address_len != len || sp->sadb_address_prefixlen > 32) return -EINVAL; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin6), sizeof(uint64_t)); if (sp->sadb_address_len != len || sp->sadb_address_prefixlen > 128) return -EINVAL; break; #endif default: /* It is user using kernel to keep track of security * associations for another protocol, such as * OSPF/RSVP/RIPV2/MIP. It is user's job to verify * lengths. * * XXX Actually, association/policy database is not yet * XXX able to cope with arbitrary sockaddr families. * XXX When it can, remove this -EINVAL. -DaveM */ return -EINVAL; } return 0; } static inline int sadb_key_len(const struct sadb_key *key) { int key_bytes = DIV_ROUND_UP(key->sadb_key_bits, 8); return DIV_ROUND_UP(sizeof(struct sadb_key) + key_bytes, sizeof(uint64_t)); } static int verify_key_len(const void *p) { const struct sadb_key *key = p; if (sadb_key_len(key) > key->sadb_key_len) return -EINVAL; return 0; } static inline int pfkey_sec_ctx_len(const struct sadb_x_sec_ctx *sec_ctx) { return DIV_ROUND_UP(sizeof(struct sadb_x_sec_ctx) + sec_ctx->sadb_x_ctx_len, sizeof(uint64_t)); } static inline int verify_sec_ctx_len(const void *p) { const struct sadb_x_sec_ctx *sec_ctx = p; int len = sec_ctx->sadb_x_ctx_len; if (len > PAGE_SIZE) return -EINVAL; len = pfkey_sec_ctx_len(sec_ctx); if (sec_ctx->sadb_x_sec_len != len) return -EINVAL; return 0; } static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(const struct sadb_x_sec_ctx *sec_ctx, gfp_t gfp) { struct xfrm_user_sec_ctx *uctx = NULL; int ctx_size = sec_ctx->sadb_x_ctx_len; uctx = kmalloc((sizeof(*uctx)+ctx_size), gfp); if (!uctx) return NULL; uctx->len = pfkey_sec_ctx_len(sec_ctx); uctx->exttype = sec_ctx->sadb_x_sec_exttype; uctx->ctx_doi = sec_ctx->sadb_x_ctx_doi; uctx->ctx_alg = sec_ctx->sadb_x_ctx_alg; uctx->ctx_len = sec_ctx->sadb_x_ctx_len; memcpy(uctx + 1, sec_ctx + 1, uctx->ctx_len); return uctx; } static int present_and_same_family(const struct sadb_address *src, const struct sadb_address *dst) { const struct sockaddr *s_addr, *d_addr; if (!src || !dst) return 0; s_addr = (const struct sockaddr *)(src + 1); d_addr = (const struct sockaddr *)(dst + 1); if (s_addr->sa_family != d_addr->sa_family) return 0; if (s_addr->sa_family != AF_INET #if IS_ENABLED(CONFIG_IPV6) && s_addr->sa_family != AF_INET6 #endif ) return 0; return 1; } static int parse_exthdrs(struct sk_buff *skb, const struct sadb_msg *hdr, void **ext_hdrs) { const char *p = (char *) hdr; int len = skb->len; len -= sizeof(*hdr); p += sizeof(*hdr); while (len > 0) { const struct sadb_ext *ehdr = (const struct sadb_ext *) p; uint16_t ext_type; int ext_len; if (len < sizeof(*ehdr)) return -EINVAL; ext_len = ehdr->sadb_ext_len; ext_len *= sizeof(uint64_t); ext_type = ehdr->sadb_ext_type; if (ext_len < sizeof(uint64_t) || ext_len > len || ext_type == SADB_EXT_RESERVED) return -EINVAL; if (ext_type <= SADB_EXT_MAX) { int min = (int) sadb_ext_min_len[ext_type]; if (ext_len < min) return -EINVAL; if (ext_hdrs[ext_type-1] != NULL) return -EINVAL; switch (ext_type) { case SADB_EXT_ADDRESS_SRC: case SADB_EXT_ADDRESS_DST: case SADB_EXT_ADDRESS_PROXY: case SADB_X_EXT_NAT_T_OA: if (verify_address_len(p)) return -EINVAL; break; case SADB_X_EXT_SEC_CTX: if (verify_sec_ctx_len(p)) return -EINVAL; break; case SADB_EXT_KEY_AUTH: case SADB_EXT_KEY_ENCRYPT: if (verify_key_len(p)) return -EINVAL; break; default: break; } ext_hdrs[ext_type-1] = (void *) p; } p += ext_len; len -= ext_len; } return 0; } static uint16_t pfkey_satype2proto(uint8_t satype) { switch (satype) { case SADB_SATYPE_UNSPEC: return IPSEC_PROTO_ANY; case SADB_SATYPE_AH: return IPPROTO_AH; case SADB_SATYPE_ESP: return IPPROTO_ESP; case SADB_X_SATYPE_IPCOMP: return IPPROTO_COMP; default: return 0; } /* NOTREACHED */ } static uint8_t pfkey_proto2satype(uint16_t proto) { switch (proto) { case IPPROTO_AH: return SADB_SATYPE_AH; case IPPROTO_ESP: return SADB_SATYPE_ESP; case IPPROTO_COMP: return SADB_X_SATYPE_IPCOMP; default: return 0; } /* NOTREACHED */ } /* BTW, this scheme means that there is no way with PFKEY2 sockets to * say specifically 'just raw sockets' as we encode them as 255. */ static uint8_t pfkey_proto_to_xfrm(uint8_t proto) { return proto == IPSEC_PROTO_ANY ? 0 : proto; } static uint8_t pfkey_proto_from_xfrm(uint8_t proto) { return proto ? proto : IPSEC_PROTO_ANY; } static inline int pfkey_sockaddr_len(sa_family_t family) { switch (family) { case AF_INET: return sizeof(struct sockaddr_in); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return sizeof(struct sockaddr_in6); #endif } return 0; } static int pfkey_sockaddr_extract(const struct sockaddr *sa, xfrm_address_t *xaddr) { switch (sa->sa_family) { case AF_INET: xaddr->a4 = ((struct sockaddr_in *)sa)->sin_addr.s_addr; return AF_INET; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: memcpy(xaddr->a6, &((struct sockaddr_in6 *)sa)->sin6_addr, sizeof(struct in6_addr)); return AF_INET6; #endif } return 0; } static int pfkey_sadb_addr2xfrm_addr(const struct sadb_address *addr, xfrm_address_t *xaddr) { return pfkey_sockaddr_extract((struct sockaddr *)(addr + 1), xaddr); } static struct xfrm_state *pfkey_xfrm_state_lookup(struct net *net, const struct sadb_msg *hdr, void * const *ext_hdrs) { const struct sadb_sa *sa; const struct sadb_address *addr; uint16_t proto; unsigned short family; xfrm_address_t *xaddr; sa = ext_hdrs[SADB_EXT_SA - 1]; if (sa == NULL) return NULL; proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return NULL; /* sadb_address_len should be checked by caller */ addr = ext_hdrs[SADB_EXT_ADDRESS_DST - 1]; if (addr == NULL) return NULL; family = ((const struct sockaddr *)(addr + 1))->sa_family; switch (family) { case AF_INET: xaddr = (xfrm_address_t *)&((const struct sockaddr_in *)(addr + 1))->sin_addr; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: xaddr = (xfrm_address_t *)&((const struct sockaddr_in6 *)(addr + 1))->sin6_addr; break; #endif default: xaddr = NULL; } if (!xaddr) return NULL; return xfrm_state_lookup(net, DUMMY_MARK, xaddr, sa->sadb_sa_spi, proto, family); } #define PFKEY_ALIGN8(a) (1 + (((a) - 1) | (8 - 1))) static int pfkey_sockaddr_size(sa_family_t family) { return PFKEY_ALIGN8(pfkey_sockaddr_len(family)); } static inline int pfkey_mode_from_xfrm(int mode) { switch(mode) { case XFRM_MODE_TRANSPORT: return IPSEC_MODE_TRANSPORT; case XFRM_MODE_TUNNEL: return IPSEC_MODE_TUNNEL; case XFRM_MODE_BEET: return IPSEC_MODE_BEET; default: return -1; } } static inline int pfkey_mode_to_xfrm(int mode) { switch(mode) { case IPSEC_MODE_ANY: /*XXX*/ case IPSEC_MODE_TRANSPORT: return XFRM_MODE_TRANSPORT; case IPSEC_MODE_TUNNEL: return XFRM_MODE_TUNNEL; case IPSEC_MODE_BEET: return XFRM_MODE_BEET; default: return -1; } } static unsigned int pfkey_sockaddr_fill(const xfrm_address_t *xaddr, __be16 port, struct sockaddr *sa, unsigned short family) { switch (family) { case AF_INET: { struct sockaddr_in *sin = (struct sockaddr_in *)sa; sin->sin_family = AF_INET; sin->sin_port = port; sin->sin_addr.s_addr = xaddr->a4; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return 32; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa; sin6->sin6_family = AF_INET6; sin6->sin6_port = port; sin6->sin6_flowinfo = 0; sin6->sin6_addr = xaddr->in6; sin6->sin6_scope_id = 0; return 128; } #endif } return 0; } static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x, int add_keys, int hsc) { struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_sa *sa; struct sadb_lifetime *lifetime; struct sadb_address *addr; struct sadb_key *key; struct sadb_x_sa2 *sa2; struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *xfrm_ctx; int ctx_size = 0; int size; int auth_key_size = 0; int encrypt_key_size = 0; int sockaddr_size; struct xfrm_encap_tmpl *natt = NULL; int mode; /* address family check */ sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) return ERR_PTR(-EINVAL); /* base, SA, (lifetime (HSC),) address(SD), (address(P),) key(AE), (identity(SD),) (sensitivity)> */ size = sizeof(struct sadb_msg) +sizeof(struct sadb_sa) + sizeof(struct sadb_lifetime) + ((hsc & 1) ? sizeof(struct sadb_lifetime) : 0) + ((hsc & 2) ? sizeof(struct sadb_lifetime) : 0) + sizeof(struct sadb_address)*2 + sockaddr_size*2 + sizeof(struct sadb_x_sa2); if ((xfrm_ctx = x->security)) { ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); size += sizeof(struct sadb_x_sec_ctx) + ctx_size; } /* identity & sensitivity */ if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr, x->props.family)) size += sizeof(struct sadb_address) + sockaddr_size; if (add_keys) { if (x->aalg && x->aalg->alg_key_len) { auth_key_size = PFKEY_ALIGN8((x->aalg->alg_key_len + 7) / 8); size += sizeof(struct sadb_key) + auth_key_size; } if (x->ealg && x->ealg->alg_key_len) { encrypt_key_size = PFKEY_ALIGN8((x->ealg->alg_key_len+7) / 8); size += sizeof(struct sadb_key) + encrypt_key_size; } } if (x->encap) natt = x->encap; if (natt && natt->encap_type) { size += sizeof(struct sadb_x_nat_t_type); size += sizeof(struct sadb_x_nat_t_port); size += sizeof(struct sadb_x_nat_t_port); } skb = alloc_skb(size + 16, GFP_ATOMIC); if (skb == NULL) return ERR_PTR(-ENOBUFS); /* call should fill header later */ hdr = skb_put(skb, sizeof(struct sadb_msg)); memset(hdr, 0, size); /* XXX do we need this ? */ hdr->sadb_msg_len = size / sizeof(uint64_t); /* sa */ sa = skb_put(skb, sizeof(struct sadb_sa)); sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t); sa->sadb_sa_exttype = SADB_EXT_SA; sa->sadb_sa_spi = x->id.spi; sa->sadb_sa_replay = x->props.replay_window; switch (x->km.state) { case XFRM_STATE_VALID: sa->sadb_sa_state = x->km.dying ? SADB_SASTATE_DYING : SADB_SASTATE_MATURE; break; case XFRM_STATE_ACQ: sa->sadb_sa_state = SADB_SASTATE_LARVAL; break; default: sa->sadb_sa_state = SADB_SASTATE_DEAD; break; } sa->sadb_sa_auth = 0; if (x->aalg) { struct xfrm_algo_desc *a = xfrm_aalg_get_byname(x->aalg->alg_name, 0); sa->sadb_sa_auth = (a && a->pfkey_supported) ? a->desc.sadb_alg_id : 0; } sa->sadb_sa_encrypt = 0; BUG_ON(x->ealg && x->calg); if (x->ealg) { struct xfrm_algo_desc *a = xfrm_ealg_get_byname(x->ealg->alg_name, 0); sa->sadb_sa_encrypt = (a && a->pfkey_supported) ? a->desc.sadb_alg_id : 0; } /* KAME compatible: sadb_sa_encrypt is overloaded with calg id */ if (x->calg) { struct xfrm_algo_desc *a = xfrm_calg_get_byname(x->calg->alg_name, 0); sa->sadb_sa_encrypt = (a && a->pfkey_supported) ? a->desc.sadb_alg_id : 0; } sa->sadb_sa_flags = 0; if (x->props.flags & XFRM_STATE_NOECN) sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN; if (x->props.flags & XFRM_STATE_DECAP_DSCP) sa->sadb_sa_flags |= SADB_SAFLAGS_DECAP_DSCP; if (x->props.flags & XFRM_STATE_NOPMTUDISC) sa->sadb_sa_flags |= SADB_SAFLAGS_NOPMTUDISC; /* hard time */ if (hsc & 2) { lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD; lifetime->sadb_lifetime_allocations = _X2KEY(x->lft.hard_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.hard_byte_limit); lifetime->sadb_lifetime_addtime = x->lft.hard_add_expires_seconds; lifetime->sadb_lifetime_usetime = x->lft.hard_use_expires_seconds; } /* soft time */ if (hsc & 1) { lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT; lifetime->sadb_lifetime_allocations = _X2KEY(x->lft.soft_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.soft_byte_limit); lifetime->sadb_lifetime_addtime = x->lft.soft_add_expires_seconds; lifetime->sadb_lifetime_usetime = x->lft.soft_use_expires_seconds; } /* current time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; lifetime->sadb_lifetime_allocations = x->curlft.packets; lifetime->sadb_lifetime_bytes = x->curlft.bytes; lifetime->sadb_lifetime_addtime = x->curlft.add_time; lifetime->sadb_lifetime_usetime = x->curlft.use_time; /* src address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; /* "if the ports are non-zero, then the sadb_address_proto field, normally zero, MUST be filled in with the transport protocol's number." - RFC2367 */ addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->props.saddr, 0, (struct sockaddr *) (addr + 1), x->props.family); BUG_ON(!addr->sadb_address_prefixlen); /* dst address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->id.daddr, 0, (struct sockaddr *) (addr + 1), x->props.family); BUG_ON(!addr->sadb_address_prefixlen); if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr, x->props.family)) { addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_PROXY; addr->sadb_address_proto = pfkey_proto_from_xfrm(x->sel.proto); addr->sadb_address_prefixlen = x->sel.prefixlen_s; addr->sadb_address_reserved = 0; pfkey_sockaddr_fill(&x->sel.saddr, x->sel.sport, (struct sockaddr *) (addr + 1), x->props.family); } /* auth key */ if (add_keys && auth_key_size) { key = skb_put(skb, sizeof(struct sadb_key) + auth_key_size); key->sadb_key_len = (sizeof(struct sadb_key) + auth_key_size) / sizeof(uint64_t); key->sadb_key_exttype = SADB_EXT_KEY_AUTH; key->sadb_key_bits = x->aalg->alg_key_len; key->sadb_key_reserved = 0; memcpy(key + 1, x->aalg->alg_key, (x->aalg->alg_key_len+7)/8); } /* encrypt key */ if (add_keys && encrypt_key_size) { key = skb_put(skb, sizeof(struct sadb_key) + encrypt_key_size); key->sadb_key_len = (sizeof(struct sadb_key) + encrypt_key_size) / sizeof(uint64_t); key->sadb_key_exttype = SADB_EXT_KEY_ENCRYPT; key->sadb_key_bits = x->ealg->alg_key_len; key->sadb_key_reserved = 0; memcpy(key + 1, x->ealg->alg_key, (x->ealg->alg_key_len+7)/8); } /* sa */ sa2 = skb_put(skb, sizeof(struct sadb_x_sa2)); sa2->sadb_x_sa2_len = sizeof(struct sadb_x_sa2)/sizeof(uint64_t); sa2->sadb_x_sa2_exttype = SADB_X_EXT_SA2; if ((mode = pfkey_mode_from_xfrm(x->props.mode)) < 0) { kfree_skb(skb); return ERR_PTR(-EINVAL); } sa2->sadb_x_sa2_mode = mode; sa2->sadb_x_sa2_reserved1 = 0; sa2->sadb_x_sa2_reserved2 = 0; sa2->sadb_x_sa2_sequence = 0; sa2->sadb_x_sa2_reqid = x->props.reqid; if (natt && natt->encap_type) { struct sadb_x_nat_t_type *n_type; struct sadb_x_nat_t_port *n_port; /* type */ n_type = skb_put(skb, sizeof(*n_type)); n_type->sadb_x_nat_t_type_len = sizeof(*n_type)/sizeof(uint64_t); n_type->sadb_x_nat_t_type_exttype = SADB_X_EXT_NAT_T_TYPE; n_type->sadb_x_nat_t_type_type = natt->encap_type; n_type->sadb_x_nat_t_type_reserved[0] = 0; n_type->sadb_x_nat_t_type_reserved[1] = 0; n_type->sadb_x_nat_t_type_reserved[2] = 0; /* source port */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT; n_port->sadb_x_nat_t_port_port = natt->encap_sport; n_port->sadb_x_nat_t_port_reserved = 0; /* dest port */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT; n_port->sadb_x_nat_t_port_port = natt->encap_dport; n_port->sadb_x_nat_t_port_reserved = 0; } /* security context */ if (xfrm_ctx) { sec_ctx = skb_put(skb, sizeof(struct sadb_x_sec_ctx) + ctx_size); sec_ctx->sadb_x_sec_len = (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t); sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, xfrm_ctx->ctx_len); } return skb; } static inline struct sk_buff *pfkey_xfrm_state2msg(const struct xfrm_state *x) { struct sk_buff *skb; skb = __pfkey_xfrm_state2msg(x, 1, 3); return skb; } static inline struct sk_buff *pfkey_xfrm_state2msg_expire(const struct xfrm_state *x, int hsc) { return __pfkey_xfrm_state2msg(x, 0, hsc); } static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct xfrm_state *x; const struct sadb_lifetime *lifetime; const struct sadb_sa *sa; const struct sadb_key *key; const struct sadb_x_sec_ctx *sec_ctx; uint16_t proto; int err; sa = ext_hdrs[SADB_EXT_SA - 1]; if (!sa || !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return ERR_PTR(-EINVAL); if (hdr->sadb_msg_satype == SADB_SATYPE_ESP && !ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]) return ERR_PTR(-EINVAL); if (hdr->sadb_msg_satype == SADB_SATYPE_AH && !ext_hdrs[SADB_EXT_KEY_AUTH-1]) return ERR_PTR(-EINVAL); if (!!ext_hdrs[SADB_EXT_LIFETIME_HARD-1] != !!ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) return ERR_PTR(-EINVAL); proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return ERR_PTR(-EINVAL); /* default error is no buffer space */ err = -ENOBUFS; /* RFC2367: Only SADB_SASTATE_MATURE SAs may be submitted in an SADB_ADD message. SADB_SASTATE_LARVAL SAs are created by SADB_GETSPI and it is not sensible to add a new SA in the DYING or SADB_SASTATE_DEAD state. Therefore, the sadb_sa_state field of all submitted SAs MUST be SADB_SASTATE_MATURE and the kernel MUST return an error if this is not true. However, KAME setkey always uses SADB_SASTATE_LARVAL. Hence, we have to _ignore_ sadb_sa_state, which is also reasonable. */ if (sa->sadb_sa_auth > SADB_AALG_MAX || (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP && sa->sadb_sa_encrypt > SADB_X_CALG_MAX) || sa->sadb_sa_encrypt > SADB_EALG_MAX) return ERR_PTR(-EINVAL); key = ext_hdrs[SADB_EXT_KEY_AUTH - 1]; if (key != NULL && sa->sadb_sa_auth != SADB_X_AALG_NULL && key->sadb_key_bits == 0) return ERR_PTR(-EINVAL); key = ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]; if (key != NULL && sa->sadb_sa_encrypt != SADB_EALG_NULL && key->sadb_key_bits == 0) return ERR_PTR(-EINVAL); x = xfrm_state_alloc(net); if (x == NULL) return ERR_PTR(-ENOBUFS); x->id.proto = proto; x->id.spi = sa->sadb_sa_spi; x->props.replay_window = min_t(unsigned int, sa->sadb_sa_replay, (sizeof(x->replay.bitmap) * 8)); if (sa->sadb_sa_flags & SADB_SAFLAGS_NOECN) x->props.flags |= XFRM_STATE_NOECN; if (sa->sadb_sa_flags & SADB_SAFLAGS_DECAP_DSCP) x->props.flags |= XFRM_STATE_DECAP_DSCP; if (sa->sadb_sa_flags & SADB_SAFLAGS_NOPMTUDISC) x->props.flags |= XFRM_STATE_NOPMTUDISC; lifetime = ext_hdrs[SADB_EXT_LIFETIME_HARD - 1]; if (lifetime != NULL) { x->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); x->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); x->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime; x->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime; } lifetime = ext_hdrs[SADB_EXT_LIFETIME_SOFT - 1]; if (lifetime != NULL) { x->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); x->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; } sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1]; if (sec_ctx != NULL) { struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL); if (!uctx) goto out; err = security_xfrm_state_alloc(x, uctx); kfree(uctx); if (err) goto out; } err = -ENOBUFS; key = ext_hdrs[SADB_EXT_KEY_AUTH - 1]; if (sa->sadb_sa_auth) { int keysize = 0; struct xfrm_algo_desc *a = xfrm_aalg_get_byid(sa->sadb_sa_auth); if (!a || !a->pfkey_supported) { err = -ENOSYS; goto out; } if (key) keysize = (key->sadb_key_bits + 7) / 8; x->aalg = kmalloc(sizeof(*x->aalg) + keysize, GFP_KERNEL); if (!x->aalg) { err = -ENOMEM; goto out; } strcpy(x->aalg->alg_name, a->name); x->aalg->alg_key_len = 0; if (key) { x->aalg->alg_key_len = key->sadb_key_bits; memcpy(x->aalg->alg_key, key+1, keysize); } x->aalg->alg_trunc_len = a->uinfo.auth.icv_truncbits; x->props.aalgo = sa->sadb_sa_auth; /* x->algo.flags = sa->sadb_sa_flags; */ } if (sa->sadb_sa_encrypt) { if (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP) { struct xfrm_algo_desc *a = xfrm_calg_get_byid(sa->sadb_sa_encrypt); if (!a || !a->pfkey_supported) { err = -ENOSYS; goto out; } x->calg = kmalloc_obj(*x->calg); if (!x->calg) { err = -ENOMEM; goto out; } strcpy(x->calg->alg_name, a->name); x->props.calgo = sa->sadb_sa_encrypt; } else { int keysize = 0; struct xfrm_algo_desc *a = xfrm_ealg_get_byid(sa->sadb_sa_encrypt); if (!a || !a->pfkey_supported) { err = -ENOSYS; goto out; } key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]; if (key) keysize = (key->sadb_key_bits + 7) / 8; x->ealg = kmalloc(sizeof(*x->ealg) + keysize, GFP_KERNEL); if (!x->ealg) { err = -ENOMEM; goto out; } strcpy(x->ealg->alg_name, a->name); x->ealg->alg_key_len = 0; if (key) { x->ealg->alg_key_len = key->sadb_key_bits; memcpy(x->ealg->alg_key, key+1, keysize); } x->props.ealgo = sa->sadb_sa_encrypt; x->geniv = a->uinfo.encr.geniv; } } /* x->algo.flags = sa->sadb_sa_flags; */ x->props.family = pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_SRC-1], &x->props.saddr); pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1], &x->id.daddr); if (ext_hdrs[SADB_X_EXT_SA2-1]) { const struct sadb_x_sa2 *sa2 = ext_hdrs[SADB_X_EXT_SA2-1]; int mode = pfkey_mode_to_xfrm(sa2->sadb_x_sa2_mode); if (mode < 0) { err = -EINVAL; goto out; } x->props.mode = mode; x->props.reqid = sa2->sadb_x_sa2_reqid; } if (ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]) { const struct sadb_address *addr = ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]; /* Nobody uses this, but we try. */ x->sel.family = pfkey_sadb_addr2xfrm_addr(addr, &x->sel.saddr); x->sel.prefixlen_s = addr->sadb_address_prefixlen; } if (!x->sel.family) x->sel.family = x->props.family; if (ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]) { const struct sadb_x_nat_t_type* n_type; struct xfrm_encap_tmpl *natt; x->encap = kzalloc_obj(*x->encap); if (!x->encap) { err = -ENOMEM; goto out; } natt = x->encap; n_type = ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]; natt->encap_type = n_type->sadb_x_nat_t_type_type; if (ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]) { const struct sadb_x_nat_t_port *n_port = ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]; natt->encap_sport = n_port->sadb_x_nat_t_port_port; } if (ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]) { const struct sadb_x_nat_t_port *n_port = ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]; natt->encap_dport = n_port->sadb_x_nat_t_port_port; } } err = xfrm_init_state(x); if (err) goto out; x->km.seq = hdr->sadb_msg_seq; return x; out: x->km.state = XFRM_STATE_DEAD; xfrm_state_put(x); return ERR_PTR(err); } static int pfkey_reserved(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { return -EOPNOTSUPP; } static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct sk_buff *resp_skb; struct sadb_x_sa2 *sa2; struct sadb_address *saddr, *daddr; struct sadb_msg *out_hdr; struct sadb_spirange *range; struct xfrm_state *x = NULL; int mode; int err; u32 min_spi, max_spi; u32 reqid; u8 proto; unsigned short family; xfrm_address_t *xsaddr = NULL, *xdaddr = NULL; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return -EINVAL; proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return -EINVAL; if ((sa2 = ext_hdrs[SADB_X_EXT_SA2-1]) != NULL) { mode = pfkey_mode_to_xfrm(sa2->sadb_x_sa2_mode); if (mode < 0) return -EINVAL; reqid = sa2->sadb_x_sa2_reqid; } else { mode = 0; reqid = 0; } saddr = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; daddr = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; family = ((struct sockaddr *)(saddr + 1))->sa_family; switch (family) { case AF_INET: xdaddr = (xfrm_address_t *)&((struct sockaddr_in *)(daddr + 1))->sin_addr.s_addr; xsaddr = (xfrm_address_t *)&((struct sockaddr_in *)(saddr + 1))->sin_addr.s_addr; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: xdaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(daddr + 1))->sin6_addr; xsaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(saddr + 1))->sin6_addr; break; #endif } if (hdr->sadb_msg_seq) { x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq, UINT_MAX); if (x && !xfrm_addr_equal(&x->id.daddr, xdaddr, family)) { xfrm_state_put(x); x = NULL; } } if (!x) x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, UINT_MAX, proto, xdaddr, xsaddr, 1, family); if (x == NULL) return -ENOENT; min_spi = 0x100; max_spi = 0x0fffffff; range = ext_hdrs[SADB_EXT_SPIRANGE-1]; if (range) { min_spi = range->sadb_spirange_min; max_spi = range->sadb_spirange_max; } err = verify_spi_info(x->id.proto, min_spi, max_spi, NULL); if (err) { xfrm_state_put(x); return err; } err = xfrm_alloc_spi(x, min_spi, max_spi, NULL); resp_skb = err ? ERR_PTR(err) : pfkey_xfrm_state2msg(x); if (IS_ERR(resp_skb)) { xfrm_state_put(x); return PTR_ERR(resp_skb); } out_hdr = (struct sadb_msg *) resp_skb->data; out_hdr->sadb_msg_version = hdr->sadb_msg_version; out_hdr->sadb_msg_type = SADB_GETSPI; out_hdr->sadb_msg_satype = pfkey_proto2satype(proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; xfrm_state_put(x); pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk, net); return 0; } static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct xfrm_state *x; if (hdr->sadb_msg_len != sizeof(struct sadb_msg)/8) return -EOPNOTSUPP; if (hdr->sadb_msg_seq == 0 || hdr->sadb_msg_errno == 0) return 0; x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq, UINT_MAX); if (x == NULL) return 0; spin_lock_bh(&x->lock); if (x->km.state == XFRM_STATE_ACQ) x->km.state = XFRM_STATE_ERROR; spin_unlock_bh(&x->lock); xfrm_state_put(x); return 0; } static inline int event2poltype(int event) { switch (event) { case XFRM_MSG_DELPOLICY: return SADB_X_SPDDELETE; case XFRM_MSG_NEWPOLICY: return SADB_X_SPDADD; case XFRM_MSG_UPDPOLICY: return SADB_X_SPDUPDATE; case XFRM_MSG_POLEXPIRE: // return SADB_X_SPDEXPIRE; default: pr_err("pfkey: Unknown policy event %d\n", event); break; } return 0; } static inline int event2keytype(int event) { switch (event) { case XFRM_MSG_DELSA: return SADB_DELETE; case XFRM_MSG_NEWSA: return SADB_ADD; case XFRM_MSG_UPDSA: return SADB_UPDATE; case XFRM_MSG_EXPIRE: return SADB_EXPIRE; default: pr_err("pfkey: Unknown SA event %d\n", event); break; } return 0; } /* ADD/UPD/DEL */ static int key_notify_sa(struct xfrm_state *x, const struct km_event *c) { struct sk_buff *skb; struct sadb_msg *hdr; skb = pfkey_xfrm_state2msg(x); if (IS_ERR(skb)) return PTR_ERR(skb); hdr = (struct sadb_msg *) skb->data; hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = event2keytype(c->event); hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xs_net(x)); return 0; } static int pfkey_add(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct xfrm_state *x; int err; struct km_event c; x = pfkey_msg2xfrm_state(net, hdr, ext_hdrs); if (IS_ERR(x)) return PTR_ERR(x); xfrm_state_hold(x); if (hdr->sadb_msg_type == SADB_ADD) err = xfrm_state_add(x); else err = xfrm_state_update(x); xfrm_audit_state_add(x, err ? 0 : 1, true); if (err < 0) { x->km.state = XFRM_STATE_DEAD; __xfrm_state_put(x); goto out; } if (hdr->sadb_msg_type == SADB_ADD) c.event = XFRM_MSG_NEWSA; else c.event = XFRM_MSG_UPDSA; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; km_state_notify(x, &c); out: xfrm_state_put(x); return err; } static int pfkey_delete(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct xfrm_state *x; struct km_event c; int err; if (!ext_hdrs[SADB_EXT_SA-1] || !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return -EINVAL; x = pfkey_xfrm_state_lookup(net, hdr, ext_hdrs); if (x == NULL) return -ESRCH; if ((err = security_xfrm_state_delete(x))) goto out; if (xfrm_state_kern(x)) { err = -EPERM; goto out; } err = xfrm_state_delete(x); if (err < 0) goto out; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.event = XFRM_MSG_DELSA; km_state_notify(x, &c); out: xfrm_audit_state_delete(x, err ? 0 : 1, true); xfrm_state_put(x); return err; } static int pfkey_get(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); __u8 proto; struct sk_buff *out_skb; struct sadb_msg *out_hdr; struct xfrm_state *x; if (!ext_hdrs[SADB_EXT_SA-1] || !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return -EINVAL; x = pfkey_xfrm_state_lookup(net, hdr, ext_hdrs); if (x == NULL) return -ESRCH; out_skb = pfkey_xfrm_state2msg(x); proto = x->id.proto; xfrm_state_put(x); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = hdr->sadb_msg_version; out_hdr->sadb_msg_type = SADB_GET; out_hdr->sadb_msg_satype = pfkey_proto2satype(proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); return 0; } static struct sk_buff *compose_sadb_supported(const struct sadb_msg *orig, gfp_t allocation) { struct sk_buff *skb; struct sadb_msg *hdr; int len, auth_len, enc_len, i; auth_len = xfrm_count_pfkey_auth_supported(); if (auth_len) { auth_len *= sizeof(struct sadb_alg); auth_len += sizeof(struct sadb_supported); } enc_len = xfrm_count_pfkey_enc_supported(); if (enc_len) { enc_len *= sizeof(struct sadb_alg); enc_len += sizeof(struct sadb_supported); } len = enc_len + auth_len + sizeof(struct sadb_msg); skb = alloc_skb(len + 16, allocation); if (!skb) goto out_put_algs; hdr = skb_put(skb, sizeof(*hdr)); pfkey_hdr_dup(hdr, orig); hdr->sadb_msg_errno = 0; hdr->sadb_msg_len = len / sizeof(uint64_t); if (auth_len) { struct sadb_supported *sp; struct sadb_alg *ap; sp = skb_put(skb, auth_len); ap = (struct sadb_alg *) (sp + 1); sp->sadb_supported_len = auth_len / sizeof(uint64_t); sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH; for (i = 0; ; i++) { struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg->available) *ap++ = aalg->desc; } } if (enc_len) { struct sadb_supported *sp; struct sadb_alg *ap; sp = skb_put(skb, enc_len); ap = (struct sadb_alg *) (sp + 1); sp->sadb_supported_len = enc_len / sizeof(uint64_t); sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT; for (i = 0; ; i++) { struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); if (!ealg) break; if (!ealg->pfkey_supported) continue; if (ealg->available) *ap++ = ealg->desc; } } out_put_algs: return skb; } static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct pfkey_sock *pfk = pfkey_sk(sk); struct sk_buff *supp_skb; if (hdr->sadb_msg_satype > SADB_SATYPE_MAX) return -EINVAL; if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) { if (pfk->registered&(1<<hdr->sadb_msg_satype)) return -EEXIST; pfk->registered |= (1<<hdr->sadb_msg_satype); } mutex_lock(&pfkey_mutex); xfrm_probe_algs(); supp_skb = compose_sadb_supported(hdr, GFP_KERNEL | __GFP_ZERO); mutex_unlock(&pfkey_mutex); if (!supp_skb) { if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) pfk->registered &= ~(1<<hdr->sadb_msg_satype); return -ENOBUFS; } pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk, sock_net(sk)); return 0; } static int unicast_flush_resp(struct sock *sk, const struct sadb_msg *ihdr) { struct sk_buff *skb; struct sadb_msg *hdr; skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC); if (!skb) return -ENOBUFS; hdr = skb_put_data(skb, ihdr, sizeof(struct sadb_msg)); hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); } static int key_notify_sa_flush(const struct km_event *c) { struct sk_buff *skb; struct sadb_msg *hdr; skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC); if (!skb) return -ENOBUFS; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_satype = pfkey_proto2satype(c->data.proto); hdr->sadb_msg_type = SADB_FLUSH; hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; } static int pfkey_flush(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); unsigned int proto; struct km_event c; int err, err2; proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return -EINVAL; err = xfrm_state_flush(net, proto, true); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - go quietly */ err = 0; return err ? err : err2; } c.data.proto = proto; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.event = XFRM_MSG_FLUSHSA; c.net = net; km_state_notify(NULL, &c); return 0; } static int dump_sa(struct xfrm_state *x, int count, void *ptr) { struct pfkey_sock *pfk = ptr; struct sk_buff *out_skb; struct sadb_msg *out_hdr; if (!pfkey_can_dump(&pfk->sk)) return -ENOBUFS; out_skb = pfkey_xfrm_state2msg(x); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = pfk->dump.msg_version; out_hdr->sadb_msg_type = SADB_DUMP; out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = count + 1; out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; return 0; } static int pfkey_dump_sa(struct pfkey_sock *pfk) { struct net *net = sock_net(&pfk->sk); return xfrm_state_walk(net, &pfk->dump.u.state, dump_sa, (void *) pfk); } static void pfkey_dump_sa_done(struct pfkey_sock *pfk) { struct net *net = sock_net(&pfk->sk); xfrm_state_walk_done(&pfk->dump.u.state, net); } static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { u8 proto; struct xfrm_address_filter *filter = NULL; struct pfkey_sock *pfk = pfkey_sk(sk); mutex_lock(&pfk->dump_lock); if (pfk->dump.dump != NULL) { mutex_unlock(&pfk->dump_lock); return -EBUSY; } proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) { mutex_unlock(&pfk->dump_lock); return -EINVAL; } if (ext_hdrs[SADB_X_EXT_FILTER - 1]) { struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1]; if ((xfilter->sadb_x_filter_splen > (sizeof(xfrm_address_t) << 3)) || (xfilter->sadb_x_filter_dplen > (sizeof(xfrm_address_t) << 3))) { mutex_unlock(&pfk->dump_lock); return -EINVAL; } filter = kmalloc_obj(*filter); if (filter == NULL) { mutex_unlock(&pfk->dump_lock); return -ENOMEM; } memcpy(&filter->saddr, &xfilter->sadb_x_filter_saddr, sizeof(xfrm_address_t)); memcpy(&filter->daddr, &xfilter->sadb_x_filter_daddr, sizeof(xfrm_address_t)); filter->family = xfilter->sadb_x_filter_family; filter->splen = xfilter->sadb_x_filter_splen; filter->dplen = xfilter->sadb_x_filter_dplen; } pfk->dump.msg_version = hdr->sadb_msg_version; pfk->dump.msg_portid = hdr->sadb_msg_pid; pfk->dump.dump = pfkey_dump_sa; pfk->dump.done = pfkey_dump_sa_done; xfrm_state_walk_init(&pfk->dump.u.state, proto, filter); mutex_unlock(&pfk->dump_lock); return pfkey_do_dump(pfk); } static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct pfkey_sock *pfk = pfkey_sk(sk); int satype = hdr->sadb_msg_satype; bool reset_errno = false; if (hdr->sadb_msg_len == (sizeof(*hdr) / sizeof(uint64_t))) { reset_errno = true; if (satype != 0 && satype != 1) return -EINVAL; pfk->promisc = satype; } if (reset_errno && skb_cloned(skb)) skb = skb_copy(skb, GFP_KERNEL); else skb = skb_clone(skb, GFP_KERNEL); if (reset_errno && skb) { struct sadb_msg *new_hdr = (struct sadb_msg *) skb->data; new_hdr->sadb_msg_errno = 0; } pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk)); return 0; } static int check_reqid(struct xfrm_policy *xp, int dir, int count, void *ptr) { int i; u32 reqid = *(u32*)ptr; for (i=0; i<xp->xfrm_nr; i++) { if (xp->xfrm_vec[i].reqid == reqid) return -EEXIST; } return 0; } static u32 gen_reqid(struct net *net) { struct xfrm_policy_walk walk; u32 start; int rc; static u32 reqid = IPSEC_MANUAL_REQID_MAX; start = reqid; do { ++reqid; if (reqid == 0) reqid = IPSEC_MANUAL_REQID_MAX+1; xfrm_policy_walk_init(&walk, XFRM_POLICY_TYPE_MAIN); rc = xfrm_policy_walk(net, &walk, check_reqid, (void*)&reqid); xfrm_policy_walk_done(&walk, net); if (rc != -EEXIST) return reqid; } while (reqid != start); return 0; } static int parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_policy *pol, struct sadb_x_ipsecrequest *rq) { struct net *net = xp_net(xp); struct xfrm_tmpl *t = xp->xfrm_vec + xp->xfrm_nr; int mode; if (xp->xfrm_nr >= XFRM_MAX_DEPTH) return -ELOOP; if (rq->sadb_x_ipsecrequest_mode == 0) return -EINVAL; if (!xfrm_id_proto_valid(rq->sadb_x_ipsecrequest_proto)) return -EINVAL; t->id.proto = rq->sadb_x_ipsecrequest_proto; if ((mode = pfkey_mode_to_xfrm(rq->sadb_x_ipsecrequest_mode)) < 0) return -EINVAL; t->mode = mode; if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_USE) { if ((mode == XFRM_MODE_TUNNEL || mode == XFRM_MODE_BEET) && pol->sadb_x_policy_dir == IPSEC_DIR_OUTBOUND) return -EINVAL; t->optional = 1; } else if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_UNIQUE) { t->reqid = rq->sadb_x_ipsecrequest_reqid; if (t->reqid > IPSEC_MANUAL_REQID_MAX) t->reqid = 0; if (!t->reqid && !(t->reqid = gen_reqid(net))) return -ENOBUFS; } /* addresses present only in tunnel mode */ if (t->mode == XFRM_MODE_TUNNEL) { int err; err = parse_sockaddr_pair( (struct sockaddr *)(rq + 1), rq->sadb_x_ipsecrequest_len - sizeof(*rq), &t->saddr, &t->id.daddr, &t->encap_family); if (err) return err; } else t->encap_family = xp->family; /* No way to set this via kame pfkey */ t->allalgs = 1; xp->xfrm_nr++; return 0; } static int parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) { int err; int len = pol->sadb_x_policy_len*8 - sizeof(struct sadb_x_policy); struct sadb_x_ipsecrequest *rq = (void*)(pol+1); if (pol->sadb_x_policy_len * 8 < sizeof(struct sadb_x_policy)) return -EINVAL; while (len >= sizeof(*rq)) { if (len < rq->sadb_x_ipsecrequest_len || rq->sadb_x_ipsecrequest_len < sizeof(*rq)) return -EINVAL; if ((err = parse_ipsecrequest(xp, pol, rq)) < 0) return err; len -= rq->sadb_x_ipsecrequest_len; rq = (void*)((u8*)rq + rq->sadb_x_ipsecrequest_len); } return 0; } static inline int pfkey_xfrm_policy2sec_ctx_size(const struct xfrm_policy *xp) { struct xfrm_sec_ctx *xfrm_ctx = xp->security; if (xfrm_ctx) { int len = sizeof(struct sadb_x_sec_ctx); len += xfrm_ctx->ctx_len; return PFKEY_ALIGN8(len); } return 0; } static int pfkey_xfrm_policy2msg_size(const struct xfrm_policy *xp) { const struct xfrm_tmpl *t; int sockaddr_size = pfkey_sockaddr_size(xp->family); int socklen = 0; int i; for (i=0; i<xp->xfrm_nr; i++) { t = xp->xfrm_vec + i; socklen += pfkey_sockaddr_len(t->encap_family); } return sizeof(struct sadb_msg) + (sizeof(struct sadb_lifetime) * 3) + (sizeof(struct sadb_address) * 2) + (sockaddr_size * 2) + sizeof(struct sadb_x_policy) + (xp->xfrm_nr * sizeof(struct sadb_x_ipsecrequest)) + (socklen * 2) + pfkey_xfrm_policy2sec_ctx_size(xp); } static struct sk_buff * pfkey_xfrm_policy2msg_prep(const struct xfrm_policy *xp) { struct sk_buff *skb; int size; size = pfkey_xfrm_policy2msg_size(xp); skb = alloc_skb(size + 16, GFP_ATOMIC); if (skb == NULL) return ERR_PTR(-ENOBUFS); return skb; } static int pfkey_xfrm_policy2msg(struct sk_buff *skb, const struct xfrm_policy *xp, int dir) { struct sadb_msg *hdr; struct sadb_address *addr; struct sadb_lifetime *lifetime; struct sadb_x_policy *pol; struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *xfrm_ctx; int i; int size; int sockaddr_size = pfkey_sockaddr_size(xp->family); int socklen = pfkey_sockaddr_len(xp->family); size = pfkey_xfrm_policy2msg_size(xp); /* call should fill header later */ hdr = skb_put(skb, sizeof(struct sadb_msg)); memset(hdr, 0, size); /* XXX do we need this ? */ /* src address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto); addr->sadb_address_prefixlen = xp->selector.prefixlen_s; addr->sadb_address_reserved = 0; if (!pfkey_sockaddr_fill(&xp->selector.saddr, xp->selector.sport, (struct sockaddr *) (addr + 1), xp->family)) BUG(); /* dst address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto); addr->sadb_address_prefixlen = xp->selector.prefixlen_d; addr->sadb_address_reserved = 0; pfkey_sockaddr_fill(&xp->selector.daddr, xp->selector.dport, (struct sockaddr *) (addr + 1), xp->family); /* hard time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD; lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.hard_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.hard_byte_limit); lifetime->sadb_lifetime_addtime = xp->lft.hard_add_expires_seconds; lifetime->sadb_lifetime_usetime = xp->lft.hard_use_expires_seconds; /* soft time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT; lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.soft_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.soft_byte_limit); lifetime->sadb_lifetime_addtime = xp->lft.soft_add_expires_seconds; lifetime->sadb_lifetime_usetime = xp->lft.soft_use_expires_seconds; /* current time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; lifetime->sadb_lifetime_allocations = xp->curlft.packets; lifetime->sadb_lifetime_bytes = xp->curlft.bytes; lifetime->sadb_lifetime_addtime = xp->curlft.add_time; lifetime->sadb_lifetime_usetime = xp->curlft.use_time; pol = skb_put(skb, sizeof(struct sadb_x_policy)); pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t); pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; pol->sadb_x_policy_type = IPSEC_POLICY_DISCARD; if (xp->action == XFRM_POLICY_ALLOW) { if (xp->xfrm_nr) pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; else pol->sadb_x_policy_type = IPSEC_POLICY_NONE; } pol->sadb_x_policy_dir = dir+1; pol->sadb_x_policy_reserved = 0; pol->sadb_x_policy_id = xp->index; pol->sadb_x_policy_priority = xp->priority; for (i=0; i<xp->xfrm_nr; i++) { const struct xfrm_tmpl *t = xp->xfrm_vec + i; struct sadb_x_ipsecrequest *rq; int req_size; int mode; req_size = sizeof(struct sadb_x_ipsecrequest); if (t->mode == XFRM_MODE_TUNNEL) { socklen = pfkey_sockaddr_len(t->encap_family); req_size += socklen * 2; } else { size -= 2*socklen; } rq = skb_put(skb, req_size); pol->sadb_x_policy_len += req_size/8; memset(rq, 0, sizeof(*rq)); rq->sadb_x_ipsecrequest_len = req_size; rq->sadb_x_ipsecrequest_proto = t->id.proto; if ((mode = pfkey_mode_from_xfrm(t->mode)) < 0) return -EINVAL; rq->sadb_x_ipsecrequest_mode = mode; rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_REQUIRE; if (t->reqid) rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_UNIQUE; if (t->optional) rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_USE; rq->sadb_x_ipsecrequest_reqid = t->reqid; if (t->mode == XFRM_MODE_TUNNEL) { u8 *sa = (void *)(rq + 1); pfkey_sockaddr_fill(&t->saddr, 0, (struct sockaddr *)sa, t->encap_family); pfkey_sockaddr_fill(&t->id.daddr, 0, (struct sockaddr *) (sa + socklen), t->encap_family); } } /* security context */ if ((xfrm_ctx = xp->security)) { int ctx_size = pfkey_xfrm_policy2sec_ctx_size(xp); sec_ctx = skb_put(skb, ctx_size); sec_ctx->sadb_x_sec_len = ctx_size / sizeof(uint64_t); sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, xfrm_ctx->ctx_len); } hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_reserved = refcount_read(&xp->refcnt); return 0; } static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_event *c) { struct sk_buff *out_skb; struct sadb_msg *out_hdr; int err; out_skb = pfkey_xfrm_policy2msg_prep(xp); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); err = pfkey_xfrm_policy2msg(out_skb, xp, dir); if (err < 0) { kfree_skb(out_skb); return err; } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = PF_KEY_V2; if (c->data.byid && c->event == XFRM_MSG_DELPOLICY) out_hdr->sadb_msg_type = SADB_X_SPDDELETE2; else out_hdr->sadb_msg_type = event2poltype(c->event); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = c->seq; out_hdr->sadb_msg_pid = c->portid; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xp_net(xp)); return 0; } static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); int err = 0; struct sadb_lifetime *lifetime; struct sadb_address *sa; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct km_event c; struct sadb_x_sec_ctx *sec_ctx; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || !ext_hdrs[SADB_X_EXT_POLICY-1]) return -EINVAL; pol = ext_hdrs[SADB_X_EXT_POLICY-1]; if (pol->sadb_x_policy_type > IPSEC_POLICY_IPSEC) return -EINVAL; if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) return -EINVAL; xp = xfrm_policy_alloc(net, GFP_KERNEL); if (xp == NULL) return -ENOBUFS; xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ? XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW); xp->priority = pol->sadb_x_policy_priority; sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; xp->family = pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.saddr); xp->selector.family = xp->family; xp->selector.prefixlen_s = sa->sadb_address_prefixlen; xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); xp->selector.sport = ((struct sockaddr_in *)(sa+1))->sin_port; if (xp->selector.sport) xp->selector.sport_mask = htons(0xffff); sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.daddr); xp->selector.prefixlen_d = sa->sadb_address_prefixlen; /* Amusing, we set this twice. KAME apps appear to set same value * in both addresses. */ xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); xp->selector.dport = ((struct sockaddr_in *)(sa+1))->sin_port; if (xp->selector.dport) xp->selector.dport_mask = htons(0xffff); sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1]; if (sec_ctx != NULL) { struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL); if (!uctx) { err = -ENOBUFS; goto out; } err = security_xfrm_policy_alloc(&xp->security, uctx, GFP_KERNEL); kfree(uctx); if (err) goto out; } xp->lft.soft_byte_limit = XFRM_INF; xp->lft.hard_byte_limit = XFRM_INF; xp->lft.soft_packet_limit = XFRM_INF; xp->lft.hard_packet_limit = XFRM_INF; if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_HARD-1]) != NULL) { xp->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); xp->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); xp->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime; xp->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime; } if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) != NULL) { xp->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); xp->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); xp->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; xp->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; } xp->xfrm_nr = 0; if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC && (err = parse_ipsecrequests(xp, pol)) < 0) goto out; err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp, hdr->sadb_msg_type != SADB_X_SPDUPDATE); xfrm_audit_policy_add(xp, err ? 0 : 1, true); if (err) goto out; if (hdr->sadb_msg_type == SADB_X_SPDUPDATE) c.event = XFRM_MSG_UPDPOLICY; else c.event = XFRM_MSG_NEWPOLICY; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c); xfrm_pol_put(xp); return 0; out: xp->walk.dead = 1; xfrm_policy_destroy(xp); return err; } static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); int err; struct sadb_address *sa; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct xfrm_selector sel; struct km_event c; struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *pol_ctx = NULL; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || !ext_hdrs[SADB_X_EXT_POLICY-1]) return -EINVAL; pol = ext_hdrs[SADB_X_EXT_POLICY-1]; if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) return -EINVAL; memset(&sel, 0, sizeof(sel)); sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr); sel.prefixlen_s = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.sport = ((struct sockaddr_in *)(sa+1))->sin_port; if (sel.sport) sel.sport_mask = htons(0xffff); sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr); sel.prefixlen_d = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.dport = ((struct sockaddr_in *)(sa+1))->sin_port; if (sel.dport) sel.dport_mask = htons(0xffff); sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1]; if (sec_ctx != NULL) { struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL); if (!uctx) return -ENOMEM; err = security_xfrm_policy_alloc(&pol_ctx, uctx, GFP_KERNEL); kfree(uctx); if (err) return err; } xp = xfrm_policy_bysel_ctx(net, &dummy_mark, 0, XFRM_POLICY_TYPE_MAIN, pol->sadb_x_policy_dir - 1, &sel, pol_ctx, 1, &err); security_xfrm_policy_free(pol_ctx); if (xp == NULL) return -ENOENT; xfrm_audit_policy_delete(xp, err ? 0 : 1, true); if (err) goto out; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.data.byid = 0; c.event = XFRM_MSG_DELPOLICY; km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c); out: xfrm_pol_put(xp); return err; } static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struct sadb_msg *hdr, int dir) { int err; struct sk_buff *out_skb; struct sadb_msg *out_hdr; err = 0; out_skb = pfkey_xfrm_policy2msg_prep(xp); if (IS_ERR(out_skb)) { err = PTR_ERR(out_skb); goto out; } err = pfkey_xfrm_policy2msg(out_skb, xp, dir); if (err < 0) { kfree_skb(out_skb); goto out; } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = hdr->sadb_msg_version; out_hdr->sadb_msg_type = hdr->sadb_msg_type; out_hdr->sadb_msg_satype = 0; out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, xp_net(xp)); err = 0; out: return err; } static int pfkey_sockaddr_pair_size(sa_family_t family) { return PFKEY_ALIGN8(pfkey_sockaddr_len(family) * 2); } static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, xfrm_address_t *saddr, xfrm_address_t *daddr, u16 *family) { int af, socklen; if (ext_len < 2 || ext_len < pfkey_sockaddr_pair_size(sa->sa_family)) return -EINVAL; af = pfkey_sockaddr_extract(sa, saddr); if (!af) return -EINVAL; socklen = pfkey_sockaddr_len(af); if (pfkey_sockaddr_extract((struct sockaddr *) (((u8 *)sa) + socklen), daddr) != af) return -EINVAL; *family = af; return 0; } #ifdef CONFIG_NET_KEY_MIGRATE static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len, struct xfrm_migrate *m) { int err; struct sadb_x_ipsecrequest *rq2; int mode; if (len < sizeof(*rq1) || len < rq1->sadb_x_ipsecrequest_len || rq1->sadb_x_ipsecrequest_len < sizeof(*rq1)) return -EINVAL; /* old endoints */ err = parse_sockaddr_pair((struct sockaddr *)(rq1 + 1), rq1->sadb_x_ipsecrequest_len - sizeof(*rq1), &m->old_saddr, &m->old_daddr, &m->old_family); if (err) return err; rq2 = (struct sadb_x_ipsecrequest *)((u8 *)rq1 + rq1->sadb_x_ipsecrequest_len); len -= rq1->sadb_x_ipsecrequest_len; if (len <= sizeof(*rq2) || len < rq2->sadb_x_ipsecrequest_len || rq2->sadb_x_ipsecrequest_len < sizeof(*rq2)) return -EINVAL; /* new endpoints */ err = parse_sockaddr_pair((struct sockaddr *)(rq2 + 1), rq2->sadb_x_ipsecrequest_len - sizeof(*rq2), &m->new_saddr, &m->new_daddr, &m->new_family); if (err) return err; if (rq1->sadb_x_ipsecrequest_proto != rq2->sadb_x_ipsecrequest_proto || rq1->sadb_x_ipsecrequest_mode != rq2->sadb_x_ipsecrequest_mode || rq1->sadb_x_ipsecrequest_reqid != rq2->sadb_x_ipsecrequest_reqid) return -EINVAL; m->proto = rq1->sadb_x_ipsecrequest_proto; if ((mode = pfkey_mode_to_xfrm(rq1->sadb_x_ipsecrequest_mode)) < 0) return -EINVAL; m->mode = mode; m->reqid = rq1->sadb_x_ipsecrequest_reqid; return ((int)(rq1->sadb_x_ipsecrequest_len + rq2->sadb_x_ipsecrequest_len)); } static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { int i, len, ret, err = -EINVAL; u8 dir; struct sadb_address *sa; struct sadb_x_kmaddress *kma; struct sadb_x_policy *pol; struct sadb_x_ipsecrequest *rq; struct xfrm_selector sel; struct xfrm_migrate m[XFRM_MAX_DEPTH]; struct xfrm_kmaddress k; struct net *net = sock_net(sk); if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC - 1], ext_hdrs[SADB_EXT_ADDRESS_DST - 1]) || !ext_hdrs[SADB_X_EXT_POLICY - 1]) { err = -EINVAL; goto out; } kma = ext_hdrs[SADB_X_EXT_KMADDRESS - 1]; pol = ext_hdrs[SADB_X_EXT_POLICY - 1]; if (pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) { err = -EINVAL; goto out; } if (kma) { /* convert sadb_x_kmaddress to xfrm_kmaddress */ k.reserved = kma->sadb_x_kmaddress_reserved; ret = parse_sockaddr_pair((struct sockaddr *)(kma + 1), 8*(kma->sadb_x_kmaddress_len) - sizeof(*kma), &k.local, &k.remote, &k.family); if (ret < 0) { err = ret; goto out; } } dir = pol->sadb_x_policy_dir - 1; memset(&sel, 0, sizeof(sel)); /* set source address info of selector */ sa = ext_hdrs[SADB_EXT_ADDRESS_SRC - 1]; sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr); sel.prefixlen_s = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.sport = ((struct sockaddr_in *)(sa + 1))->sin_port; if (sel.sport) sel.sport_mask = htons(0xffff); /* set destination address info of selector */ sa = ext_hdrs[SADB_EXT_ADDRESS_DST - 1]; pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr); sel.prefixlen_d = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.dport = ((struct sockaddr_in *)(sa + 1))->sin_port; if (sel.dport) sel.dport_mask = htons(0xffff); rq = (struct sadb_x_ipsecrequest *)(pol + 1); /* extract ipsecrequests */ i = 0; len = pol->sadb_x_policy_len * 8 - sizeof(struct sadb_x_policy); while (len > 0 && i < XFRM_MAX_DEPTH) { ret = ipsecrequests_to_migrate(rq, len, &m[i]); if (ret < 0) { err = ret; goto out; } else { rq = (struct sadb_x_ipsecrequest *)((u8 *)rq + ret); len -= ret; i++; } } if (!i || len > 0) { err = -EINVAL; goto out; } return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i, kma ? &k : NULL, net, NULL, 0, NULL, NULL); out: return err; } #else static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { return -ENOPROTOOPT; } #endif static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); unsigned int dir; int err = 0, delete; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct km_event c; if ((pol = ext_hdrs[SADB_X_EXT_POLICY-1]) == NULL) return -EINVAL; dir = xfrm_policy_id2dir(pol->sadb_x_policy_id); if (dir >= XFRM_POLICY_MAX) return -EINVAL; delete = (hdr->sadb_msg_type == SADB_X_SPDDELETE2); xp = xfrm_policy_byid(net, &dummy_mark, 0, XFRM_POLICY_TYPE_MAIN, dir, pol->sadb_x_policy_id, delete, &err); if (xp == NULL) return -ENOENT; if (delete) { xfrm_audit_policy_delete(xp, err ? 0 : 1, true); if (err) goto out; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.data.byid = 1; c.event = XFRM_MSG_DELPOLICY; km_policy_notify(xp, dir, &c); } else { err = key_pol_get_resp(sk, xp, hdr, dir); } out: xfrm_pol_put(xp); return err; } static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr) { struct pfkey_sock *pfk = ptr; struct sk_buff *out_skb; struct sadb_msg *out_hdr; int err; if (!pfkey_can_dump(&pfk->sk)) return -ENOBUFS; out_skb = pfkey_xfrm_policy2msg_prep(xp); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); err = pfkey_xfrm_policy2msg(out_skb, xp, dir); if (err < 0) { kfree_skb(out_skb); return err; } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = pfk->dump.msg_version; out_hdr->sadb_msg_type = SADB_X_SPDDUMP; out_hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = count + 1; out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; return 0; } static int pfkey_dump_sp(struct pfkey_sock *pfk) { struct net *net = sock_net(&pfk->sk); return xfrm_policy_walk(net, &pfk->dump.u.policy, dump_sp, (void *) pfk); } static void pfkey_dump_sp_done(struct pfkey_sock *pfk) { struct net *net = sock_net((struct sock *)pfk); xfrm_policy_walk_done(&pfk->dump.u.policy, net); } static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct pfkey_sock *pfk = pfkey_sk(sk); mutex_lock(&pfk->dump_lock); if (pfk->dump.dump != NULL) { mutex_unlock(&pfk->dump_lock); return -EBUSY; } pfk->dump.msg_version = hdr->sadb_msg_version; pfk->dump.msg_portid = hdr->sadb_msg_pid; pfk->dump.dump = pfkey_dump_sp; pfk->dump.done = pfkey_dump_sp_done; xfrm_policy_walk_init(&pfk->dump.u.policy, XFRM_POLICY_TYPE_MAIN); mutex_unlock(&pfk->dump_lock); return pfkey_do_dump(pfk); } static int key_notify_policy_flush(const struct km_event *c) { struct sk_buff *skb_out; struct sadb_msg *hdr; skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC); if (!skb_out) return -ENOBUFS; hdr = skb_put(skb_out, sizeof(struct sadb_msg)); hdr->sadb_msg_type = SADB_X_SPDFLUSH; hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; } static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct km_event c; int err, err2; err = xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, true); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - old silent behavior */ return 0; return err; } c.data.type = XFRM_POLICY_TYPE_MAIN; c.event = XFRM_MSG_FLUSHPOLICY; c.portid = hdr->sadb_msg_pid; c.seq = hdr->sadb_msg_seq; c.net = net; km_policy_notify(NULL, 0, &c); return 0; } typedef int (*pfkey_handler)(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs); static const pfkey_handler pfkey_funcs[SADB_MAX + 1] = { [SADB_RESERVED] = pfkey_reserved, [SADB_GETSPI] = pfkey_getspi, [SADB_UPDATE] = pfkey_add, [SADB_ADD] = pfkey_add, [SADB_DELETE] = pfkey_delete, [SADB_GET] = pfkey_get, [SADB_ACQUIRE] = pfkey_acquire, [SADB_REGISTER] = pfkey_register, [SADB_EXPIRE] = NULL, [SADB_FLUSH] = pfkey_flush, [SADB_DUMP] = pfkey_dump, [SADB_X_PROMISC] = pfkey_promisc, [SADB_X_PCHANGE] = NULL, [SADB_X_SPDUPDATE] = pfkey_spdadd, [SADB_X_SPDADD] = pfkey_spdadd, [SADB_X_SPDDELETE] = pfkey_spddelete, [SADB_X_SPDGET] = pfkey_spdget, [SADB_X_SPDACQUIRE] = NULL, [SADB_X_SPDDUMP] = pfkey_spddump, [SADB_X_SPDFLUSH] = pfkey_spdflush, [SADB_X_SPDSETIDX] = pfkey_spdadd, [SADB_X_SPDDELETE2] = pfkey_spdget, [SADB_X_MIGRATE] = pfkey_migrate, }; static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr) { void *ext_hdrs[SADB_EXT_MAX]; int err; /* Non-zero return value of pfkey_broadcast() does not always signal * an error and even on an actual error we may still want to process * the message so rather ignore the return value. */ pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, BROADCAST_PROMISC_ONLY, NULL, sock_net(sk)); memset(ext_hdrs, 0, sizeof(ext_hdrs)); err = parse_exthdrs(skb, hdr, ext_hdrs); if (!err) { err = -EOPNOTSUPP; if (pfkey_funcs[hdr->sadb_msg_type]) err = pfkey_funcs[hdr->sadb_msg_type](sk, skb, hdr, ext_hdrs); } return err; } static struct sadb_msg *pfkey_get_base_msg(struct sk_buff *skb, int *errp) { struct sadb_msg *hdr = NULL; if (skb->len < sizeof(*hdr)) { *errp = -EMSGSIZE; } else { hdr = (struct sadb_msg *) skb->data; if (hdr->sadb_msg_version != PF_KEY_V2 || hdr->sadb_msg_reserved != 0 || (hdr->sadb_msg_type <= SADB_RESERVED || hdr->sadb_msg_type > SADB_MAX)) { hdr = NULL; *errp = -EINVAL; } else if (hdr->sadb_msg_len != (skb->len / sizeof(uint64_t)) || hdr->sadb_msg_len < (sizeof(struct sadb_msg) / sizeof(uint64_t))) { hdr = NULL; *errp = -EMSGSIZE; } else { *errp = 0; } } return hdr; } static inline int aalg_tmpl_set(const struct xfrm_tmpl *t, const struct xfrm_algo_desc *d) { unsigned int id = d->desc.sadb_alg_id; if (id >= sizeof(t->aalgos) * 8) return 0; return (t->aalgos >> id) & 1; } static inline int ealg_tmpl_set(const struct xfrm_tmpl *t, const struct xfrm_algo_desc *d) { unsigned int id = d->desc.sadb_alg_id; if (id >= sizeof(t->ealgos) * 8) return 0; return (t->ealgos >> id) & 1; } static int count_ah_combs(const struct xfrm_tmpl *t) { int i, sz = 0; for (i = 0; ; i++) { const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } return sz + sizeof(struct sadb_prop); } static int count_esp_combs(const struct xfrm_tmpl *t) { int i, k, sz = 0; for (i = 0; ; i++) { const struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); if (!ealg) break; if (!ealg->pfkey_supported) continue; if (!(ealg_tmpl_set(t, ealg))) continue; for (k = 1; ; k++) { const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } } return sz + sizeof(struct sadb_prop); } static int dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) { struct sadb_prop *p; int sz = 0; int i; p = skb_put(skb, sizeof(struct sadb_prop)); p->sadb_prop_len = sizeof(struct sadb_prop)/8; p->sadb_prop_exttype = SADB_EXT_PROPOSAL; p->sadb_prop_replay = 32; memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved)); for (i = 0; ; i++) { const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg_tmpl_set(t, aalg) && aalg->available) { struct sadb_comb *c; c = skb_put_zero(skb, sizeof(struct sadb_comb)); p->sadb_prop_len += sizeof(struct sadb_comb)/8; c->sadb_comb_auth = aalg->desc.sadb_alg_id; c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits; c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits; c->sadb_comb_hard_addtime = 24*60*60; c->sadb_comb_soft_addtime = 20*60*60; c->sadb_comb_hard_usetime = 8*60*60; c->sadb_comb_soft_usetime = 7*60*60; sz += sizeof(*c); } } return sz + sizeof(*p); } static int dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) { struct sadb_prop *p; int sz = 0; int i, k; p = skb_put(skb, sizeof(struct sadb_prop)); p->sadb_prop_len = sizeof(struct sadb_prop)/8; p->sadb_prop_exttype = SADB_EXT_PROPOSAL; p->sadb_prop_replay = 32; memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved)); for (i=0; ; i++) { const struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); if (!ealg) break; if (!ealg->pfkey_supported) continue; if (!(ealg_tmpl_set(t, ealg) && ealg->available)) continue; for (k = 1; ; k++) { struct sadb_comb *c; const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (!(aalg_tmpl_set(t, aalg) && aalg->available)) continue; c = skb_put(skb, sizeof(struct sadb_comb)); memset(c, 0, sizeof(*c)); p->sadb_prop_len += sizeof(struct sadb_comb)/8; c->sadb_comb_auth = aalg->desc.sadb_alg_id; c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits; c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits; c->sadb_comb_encrypt = ealg->desc.sadb_alg_id; c->sadb_comb_encrypt_minbits = ealg->desc.sadb_alg_minbits; c->sadb_comb_encrypt_maxbits = ealg->desc.sadb_alg_maxbits; c->sadb_comb_hard_addtime = 24*60*60; c->sadb_comb_soft_addtime = 20*60*60; c->sadb_comb_hard_usetime = 8*60*60; c->sadb_comb_soft_usetime = 7*60*60; sz += sizeof(*c); } } return sz + sizeof(*p); } static int key_notify_policy_expire(struct xfrm_policy *xp, const struct km_event *c) { return 0; } static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c) { struct sk_buff *out_skb; struct sadb_msg *out_hdr; int hard; int hsc; hard = c->data.hard; if (hard) hsc = 2; else hsc = 1; out_skb = pfkey_xfrm_state2msg_expire(x, hsc); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = PF_KEY_V2; out_hdr->sadb_msg_type = SADB_EXPIRE; out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = 0; out_hdr->sadb_msg_pid = 0; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); return 0; } static int pfkey_send_notify(struct xfrm_state *x, const struct km_event *c) { struct net *net = x ? xs_net(x) : c->net; struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); if (atomic_read(&net_pfkey->socks_nr) == 0) return 0; switch (c->event) { case XFRM_MSG_EXPIRE: return key_notify_sa_expire(x, c); case XFRM_MSG_DELSA: case XFRM_MSG_NEWSA: case XFRM_MSG_UPDSA: return key_notify_sa(x, c); case XFRM_MSG_FLUSHSA: return key_notify_sa_flush(c); case XFRM_MSG_NEWAE: /* not yet supported */ break; default: pr_err("pfkey: Unknown SA event %d\n", c->event); break; } return 0; } static int pfkey_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) { if (xp && xp->type != XFRM_POLICY_TYPE_MAIN) return 0; switch (c->event) { case XFRM_MSG_POLEXPIRE: return key_notify_policy_expire(xp, c); case XFRM_MSG_DELPOLICY: case XFRM_MSG_NEWPOLICY: case XFRM_MSG_UPDPOLICY: return key_notify_policy(xp, dir, c); case XFRM_MSG_FLUSHPOLICY: if (c->data.type != XFRM_POLICY_TYPE_MAIN) break; return key_notify_policy_flush(c); default: pr_err("pfkey: Unknown policy event %d\n", c->event); break; } return 0; } static u32 get_acqseq(void) { u32 res; static atomic_t acqseq; do { res = atomic_inc_return(&acqseq); } while (!res); return res; } static bool pfkey_is_alive(const struct km_event *c) { struct netns_pfkey *net_pfkey = net_generic(c->net, pfkey_net_id); struct sock *sk; bool is_alive = false; rcu_read_lock(); sk_for_each_rcu(sk, &net_pfkey->table) { if (pfkey_sk(sk)->registered) { is_alive = true; break; } } rcu_read_unlock(); return is_alive; } static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *xp) { struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_address *addr; struct sadb_x_policy *pol; int sockaddr_size; int size; struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *xfrm_ctx; int ctx_size = 0; int alg_size = 0; sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) return -EINVAL; size = sizeof(struct sadb_msg) + (sizeof(struct sadb_address) * 2) + (sockaddr_size * 2) + sizeof(struct sadb_x_policy); if (x->id.proto == IPPROTO_AH) alg_size = count_ah_combs(t); else if (x->id.proto == IPPROTO_ESP) alg_size = count_esp_combs(t); if ((xfrm_ctx = x->security)) { ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); size += sizeof(struct sadb_x_sec_ctx) + ctx_size; } skb = alloc_skb(size + alg_size + 16, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = SADB_ACQUIRE; hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = x->km.seq = get_acqseq(); hdr->sadb_msg_pid = 0; /* src address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->props.saddr, 0, (struct sockaddr *) (addr + 1), x->props.family); if (!addr->sadb_address_prefixlen) BUG(); /* dst address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->id.daddr, 0, (struct sockaddr *) (addr + 1), x->props.family); if (!addr->sadb_address_prefixlen) BUG(); pol = skb_put(skb, sizeof(struct sadb_x_policy)); pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t); pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; pol->sadb_x_policy_dir = XFRM_POLICY_OUT + 1; pol->sadb_x_policy_reserved = 0; pol->sadb_x_policy_id = xp->index; pol->sadb_x_policy_priority = xp->priority; /* Set sadb_comb's. */ alg_size = 0; if (x->id.proto == IPPROTO_AH) alg_size = dump_ah_combs(skb, t); else if (x->id.proto == IPPROTO_ESP) alg_size = dump_esp_combs(skb, t); hdr->sadb_msg_len += alg_size / 8; /* security context */ if (xfrm_ctx) { sec_ctx = skb_put(skb, sizeof(struct sadb_x_sec_ctx) + ctx_size); sec_ctx->sadb_x_sec_len = (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t); sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, xfrm_ctx->ctx_len); } return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); } static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, u8 *data, int len, int *dir) { struct net *net = sock_net(sk); struct xfrm_policy *xp; struct sadb_x_policy *pol = (struct sadb_x_policy*)data; struct sadb_x_sec_ctx *sec_ctx; switch (sk->sk_family) { case AF_INET: if (opt != IP_IPSEC_POLICY) { *dir = -EOPNOTSUPP; return NULL; } break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: if (opt != IPV6_IPSEC_POLICY) { *dir = -EOPNOTSUPP; return NULL; } break; #endif default: *dir = -EINVAL; return NULL; } *dir = -EINVAL; if (len < sizeof(struct sadb_x_policy) || pol->sadb_x_policy_len*8 > len || pol->sadb_x_policy_type > IPSEC_POLICY_BYPASS || (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir > IPSEC_DIR_OUTBOUND)) return NULL; xp = xfrm_policy_alloc(net, GFP_ATOMIC); if (xp == NULL) { *dir = -ENOBUFS; return NULL; } xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ? XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW); xp->lft.soft_byte_limit = XFRM_INF; xp->lft.hard_byte_limit = XFRM_INF; xp->lft.soft_packet_limit = XFRM_INF; xp->lft.hard_packet_limit = XFRM_INF; xp->family = sk->sk_family; xp->xfrm_nr = 0; if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC && (*dir = parse_ipsecrequests(xp, pol)) < 0) goto out; /* security context too */ if (len >= (pol->sadb_x_policy_len*8 + sizeof(struct sadb_x_sec_ctx))) { char *p = (char *)pol; struct xfrm_user_sec_ctx *uctx; p += pol->sadb_x_policy_len*8; sec_ctx = (struct sadb_x_sec_ctx *)p; if (len < pol->sadb_x_policy_len*8 + sec_ctx->sadb_x_sec_len*8) { *dir = -EINVAL; goto out; } if ((*dir = verify_sec_ctx_len(p))) goto out; uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_ATOMIC); *dir = security_xfrm_policy_alloc(&xp->security, uctx, GFP_ATOMIC); kfree(uctx); if (*dir) goto out; } *dir = pol->sadb_x_policy_dir-1; return xp; out: xp->walk.dead = 1; xfrm_policy_destroy(xp); return NULL; } static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport) { struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_sa *sa; struct sadb_address *addr; struct sadb_x_nat_t_port *n_port; int sockaddr_size; int size; __u8 satype = (x->id.proto == IPPROTO_ESP ? SADB_SATYPE_ESP : 0); struct xfrm_encap_tmpl *natt = NULL; sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) return -EINVAL; if (!satype) return -EINVAL; if (!x->encap) return -EINVAL; natt = x->encap; /* Build an SADB_X_NAT_T_NEW_MAPPING message: * * HDR | SA | ADDRESS_SRC (old addr) | NAT_T_SPORT (old port) | * ADDRESS_DST (new addr) | NAT_T_DPORT (new port) */ size = sizeof(struct sadb_msg) + sizeof(struct sadb_sa) + (sizeof(struct sadb_address) * 2) + (sockaddr_size * 2) + (sizeof(struct sadb_x_nat_t_port) * 2); skb = alloc_skb(size + 16, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = SADB_X_NAT_T_NEW_MAPPING; hdr->sadb_msg_satype = satype; hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = x->km.seq; hdr->sadb_msg_pid = 0; /* SA */ sa = skb_put(skb, sizeof(struct sadb_sa)); sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t); sa->sadb_sa_exttype = SADB_EXT_SA; sa->sadb_sa_spi = x->id.spi; sa->sadb_sa_replay = 0; sa->sadb_sa_state = 0; sa->sadb_sa_auth = 0; sa->sadb_sa_encrypt = 0; sa->sadb_sa_flags = 0; /* ADDRESS_SRC (old addr) */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->props.saddr, 0, (struct sockaddr *) (addr + 1), x->props.family); if (!addr->sadb_address_prefixlen) BUG(); /* NAT_T_SPORT (old port) */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT; n_port->sadb_x_nat_t_port_port = natt->encap_sport; n_port->sadb_x_nat_t_port_reserved = 0; /* ADDRESS_DST (new addr) */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(ipaddr, 0, (struct sockaddr *) (addr + 1), x->props.family); if (!addr->sadb_address_prefixlen) BUG(); /* NAT_T_DPORT (new port) */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT; n_port->sadb_x_nat_t_port_port = sport; n_port->sadb_x_nat_t_port_reserved = 0; return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); } #ifdef CONFIG_NET_KEY_MIGRATE static int set_sadb_address(struct sk_buff *skb, int sasize, int type, const struct xfrm_selector *sel) { struct sadb_address *addr; addr = skb_put(skb, sizeof(struct sadb_address) + sasize); addr->sadb_address_len = (sizeof(struct sadb_address) + sasize)/8; addr->sadb_address_exttype = type; addr->sadb_address_proto = sel->proto; addr->sadb_address_reserved = 0; switch (type) { case SADB_EXT_ADDRESS_SRC: addr->sadb_address_prefixlen = sel->prefixlen_s; pfkey_sockaddr_fill(&sel->saddr, 0, (struct sockaddr *)(addr + 1), sel->family); break; case SADB_EXT_ADDRESS_DST: addr->sadb_address_prefixlen = sel->prefixlen_d; pfkey_sockaddr_fill(&sel->daddr, 0, (struct sockaddr *)(addr + 1), sel->family); break; default: return -EINVAL; } return 0; } static int set_sadb_kmaddress(struct sk_buff *skb, const struct xfrm_kmaddress *k) { struct sadb_x_kmaddress *kma; u8 *sa; int family = k->family; int socklen = pfkey_sockaddr_len(family); int size_req; size_req = (sizeof(struct sadb_x_kmaddress) + pfkey_sockaddr_pair_size(family)); kma = skb_put_zero(skb, size_req); kma->sadb_x_kmaddress_len = size_req / 8; kma->sadb_x_kmaddress_exttype = SADB_X_EXT_KMADDRESS; kma->sadb_x_kmaddress_reserved = k->reserved; sa = (u8 *)(kma + 1); if (!pfkey_sockaddr_fill(&k->local, 0, (struct sockaddr *)sa, family) || !pfkey_sockaddr_fill(&k->remote, 0, (struct sockaddr *)(sa+socklen), family)) return -EINVAL; return 0; } static int set_ipsecrequest(struct sk_buff *skb, uint8_t proto, uint8_t mode, int level, uint32_t reqid, uint8_t family, const xfrm_address_t *src, const xfrm_address_t *dst) { struct sadb_x_ipsecrequest *rq; u8 *sa; int socklen = pfkey_sockaddr_len(family); int size_req; size_req = sizeof(struct sadb_x_ipsecrequest) + pfkey_sockaddr_pair_size(family); rq = skb_put_zero(skb, size_req); rq->sadb_x_ipsecrequest_len = size_req; rq->sadb_x_ipsecrequest_proto = proto; rq->sadb_x_ipsecrequest_mode = mode; rq->sadb_x_ipsecrequest_level = level; rq->sadb_x_ipsecrequest_reqid = reqid; sa = (u8 *) (rq + 1); if (!pfkey_sockaddr_fill(src, 0, (struct sockaddr *)sa, family) || !pfkey_sockaddr_fill(dst, 0, (struct sockaddr *)(sa + socklen), family)) return -EINVAL; return 0; } #endif #ifdef CONFIG_NET_KEY_MIGRATE static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap) { int i; int sasize_sel; int size = 0; int size_pol = 0; struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_x_policy *pol; const struct xfrm_migrate *mp; if (type != XFRM_POLICY_TYPE_MAIN) return 0; if (num_bundles <= 0 || num_bundles > XFRM_MAX_DEPTH) return -EINVAL; if (k != NULL) { /* addresses for KM */ size += PFKEY_ALIGN8(sizeof(struct sadb_x_kmaddress) + pfkey_sockaddr_pair_size(k->family)); } /* selector */ sasize_sel = pfkey_sockaddr_size(sel->family); if (!sasize_sel) return -EINVAL; size += (sizeof(struct sadb_address) + sasize_sel) * 2; /* policy info */ size_pol += sizeof(struct sadb_x_policy); /* ipsecrequests */ for (i = 0, mp = m; i < num_bundles; i++, mp++) { /* old locator pair */ size_pol += sizeof(struct sadb_x_ipsecrequest) + pfkey_sockaddr_pair_size(mp->old_family); /* new locator pair */ size_pol += sizeof(struct sadb_x_ipsecrequest) + pfkey_sockaddr_pair_size(mp->new_family); } size += sizeof(struct sadb_msg) + size_pol; /* alloc buffer */ skb = alloc_skb(size, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = SADB_X_MIGRATE; hdr->sadb_msg_satype = pfkey_proto2satype(m->proto); hdr->sadb_msg_len = size / 8; hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = 0; hdr->sadb_msg_pid = 0; /* Addresses to be used by KM for negotiation, if ext is available */ if (k != NULL && (set_sadb_kmaddress(skb, k) < 0)) goto err; /* selector src */ set_sadb_address(skb, sasize_sel, SADB_EXT_ADDRESS_SRC, sel); /* selector dst */ set_sadb_address(skb, sasize_sel, SADB_EXT_ADDRESS_DST, sel); /* policy information */ pol = skb_put(skb, sizeof(struct sadb_x_policy)); pol->sadb_x_policy_len = size_pol / 8; pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; pol->sadb_x_policy_dir = dir + 1; pol->sadb_x_policy_reserved = 0; pol->sadb_x_policy_id = 0; pol->sadb_x_policy_priority = 0; for (i = 0, mp = m; i < num_bundles; i++, mp++) { /* old ipsecrequest */ int mode = pfkey_mode_from_xfrm(mp->mode); if (mode < 0) goto err; if (set_ipsecrequest(skb, mp->proto, mode, (mp->reqid ? IPSEC_LEVEL_UNIQUE : IPSEC_LEVEL_REQUIRE), mp->reqid, mp->old_family, &mp->old_saddr, &mp->old_daddr) < 0) goto err; /* new ipsecrequest */ if (set_ipsecrequest(skb, mp->proto, mode, (mp->reqid ? IPSEC_LEVEL_UNIQUE : IPSEC_LEVEL_REQUIRE), mp->reqid, mp->new_family, &mp->new_saddr, &mp->new_daddr) < 0) goto err; } /* broadcast migrate message to sockets */ pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net); return 0; err: kfree_skb(skb); return -EINVAL; } #else static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap) { return -ENOPROTOOPT; } #endif static int pfkey_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb = NULL; struct sadb_msg *hdr = NULL; int err; struct net *net = sock_net(sk); err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) goto out; err = -EMSGSIZE; if ((unsigned int)len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS; skb = alloc_skb(len, GFP_KERNEL); if (skb == NULL) goto out; err = -EFAULT; if (memcpy_from_msg(skb_put(skb,len), msg, len)) goto out; hdr = pfkey_get_base_msg(skb, &err); if (!hdr) goto out; mutex_lock(&net->xfrm.xfrm_cfg_mutex); err = pfkey_process(sk, skb, hdr); mutex_unlock(&net->xfrm.xfrm_cfg_mutex); out: if (err && hdr && pfkey_error(hdr, err, sk) == 0) err = 0; kfree_skb(skb); return err ? : len; } static int pfkey_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct pfkey_sock *pfk = pfkey_sk(sk); struct sk_buff *skb; int copied, err; err = -EINVAL; if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT)) goto out; skb = skb_recv_datagram(sk, flags, &err); if (skb == NULL) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } skb_reset_transport_header(skb); err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; sock_recv_cmsgs(msg, sk, skb); err = (flags & MSG_TRUNC) ? skb->len : copied; if (pfk->dump.dump != NULL && 3 * atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) pfkey_do_dump(pfk); out_free: skb_free_datagram(sk, skb); out: return err; } static const struct proto_ops pfkey_ops = { .family = PF_KEY, .owner = THIS_MODULE, /* Operations that make no sense on pfkey sockets. */ .bind = sock_no_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .mmap = sock_no_mmap, /* Now the operations that really occur. */ .release = pfkey_release, .poll = datagram_poll, .sendmsg = pfkey_sendmsg, .recvmsg = pfkey_recvmsg, }; static const struct net_proto_family pfkey_family_ops = { .family = PF_KEY, .create = pfkey_create, .owner = THIS_MODULE, }; #ifdef CONFIG_PROC_FS static int pfkey_seq_show(struct seq_file *f, void *v) { struct sock *s = sk_entry(v); if (v == SEQ_START_TOKEN) seq_printf(f ,"sk RefCnt Rmem Wmem User Inode\n"); else seq_printf(f, "%pK %-6d %-6u %-6u %-6u %-6lu\n", s, refcount_read(&s->sk_refcnt), sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), from_kuid_munged(seq_user_ns(f), sk_uid(s)), sock_i_ino(s) ); return 0; } static void *pfkey_seq_start(struct seq_file *f, loff_t *ppos) __acquires(rcu) { struct net *net = seq_file_net(f); struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); rcu_read_lock(); return seq_hlist_start_head_rcu(&net_pfkey->table, *ppos); } static void *pfkey_seq_next(struct seq_file *f, void *v, loff_t *ppos) { struct net *net = seq_file_net(f); struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); return seq_hlist_next_rcu(v, &net_pfkey->table, ppos); } static void pfkey_seq_stop(struct seq_file *f, void *v) __releases(rcu) { rcu_read_unlock(); } static const struct seq_operations pfkey_seq_ops = { .start = pfkey_seq_start, .next = pfkey_seq_next, .stop = pfkey_seq_stop, .show = pfkey_seq_show, }; static int __net_init pfkey_init_proc(struct net *net) { struct proc_dir_entry *e; e = proc_create_net("pfkey", 0, net->proc_net, &pfkey_seq_ops, sizeof(struct seq_net_private)); if (e == NULL) return -ENOMEM; return 0; } static void __net_exit pfkey_exit_proc(struct net *net) { remove_proc_entry("pfkey", net->proc_net); } #else static inline int pfkey_init_proc(struct net *net) { return 0; } static inline void pfkey_exit_proc(struct net *net) { } #endif static struct xfrm_mgr pfkeyv2_mgr = { .notify = pfkey_send_notify, .acquire = pfkey_send_acquire, .compile_policy = pfkey_compile_policy, .new_mapping = pfkey_send_new_mapping, .notify_policy = pfkey_send_policy_notify, .migrate = pfkey_send_migrate, .is_alive = pfkey_is_alive, }; static int __net_init pfkey_net_init(struct net *net) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); int rv; INIT_HLIST_HEAD(&net_pfkey->table); atomic_set(&net_pfkey->socks_nr, 0); rv = pfkey_init_proc(net); return rv; } static void __net_exit pfkey_net_exit(struct net *net) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); pfkey_exit_proc(net); WARN_ON(!hlist_empty(&net_pfkey->table)); } static struct pernet_operations pfkey_net_ops = { .init = pfkey_net_init, .exit = pfkey_net_exit, .id = &pfkey_net_id, .size = sizeof(struct netns_pfkey), }; static void __exit ipsec_pfkey_exit(void) { xfrm_unregister_km(&pfkeyv2_mgr); sock_unregister(PF_KEY); unregister_pernet_subsys(&pfkey_net_ops); proto_unregister(&key_proto); } static int __init ipsec_pfkey_init(void) { int err = proto_register(&key_proto, 0); pr_warn_once("PFKEY is deprecated and scheduled to be removed in 2027, " "please contact the netdev mailing list\n"); if (err != 0) goto out; err = register_pernet_subsys(&pfkey_net_ops); if (err != 0) goto out_unregister_key_proto; err = sock_register(&pfkey_family_ops); if (err != 0) goto out_unregister_pernet; xfrm_register_km(&pfkeyv2_mgr); out: return err; out_unregister_pernet: unregister_pernet_subsys(&pfkey_net_ops); out_unregister_key_proto: proto_unregister(&key_proto); goto out; } module_init(ipsec_pfkey_init); module_exit(ipsec_pfkey_exit); MODULE_DESCRIPTION("PF_KEY socket helpers"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_KEY);
7 7 2 3 3 21 21 8 8 21 8 8 8 8 8 20 3 17 86 87 1 86 2 81 75 2 3 80 3 54 23 3 23 57 19 36 26 2 37 47 47 30 18 47 47 47 20 7 35 17 1 9 20 47 47 8 17 25 4 4 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 // SPDX-License-Identifier: GPL-2.0-or-later /* L2TPv3 IP encapsulation support for IPv6 * * Copyright (c) 2012 Katalix Systems Ltd */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/icmp.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/random.h> #include <linux/socket.h> #include <linux/l2tp.h> #include <linux/in.h> #include <linux/in6.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/udp.h> #include <net/inet_common.h> #include <net/tcp_states.h> #include <net/protocol.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/transp_v6.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include "l2tp_core.h" /* per-net private data for this module */ static unsigned int l2tp_ip6_net_id; struct l2tp_ip6_net { rwlock_t l2tp_ip6_lock; struct hlist_head l2tp_ip6_table; struct hlist_head l2tp_ip6_bind_table; }; struct l2tp_ip6_sock { /* inet_sock has to be the first member of l2tp_ip6_sock */ struct inet_sock inet; u32 conn_id; u32 peer_conn_id; struct ipv6_pinfo inet6; }; static struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk) { return (struct l2tp_ip6_sock *)sk; } static struct l2tp_ip6_net *l2tp_ip6_pernet(const struct net *net) { return net_generic(net, l2tp_ip6_net_id); } static struct sock *__l2tp_ip6_bind_lookup(const struct net *net, const struct in6_addr *laddr, const struct in6_addr *raddr, int dif, u32 tunnel_id) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(net); struct sock *sk; sk_for_each_bound(sk, &pn->l2tp_ip6_bind_table) { const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk); const struct in6_addr *sk_raddr = &sk->sk_v6_daddr; const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk); int bound_dev_if; if (!net_eq(sock_net(sk), net)) continue; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && dif && bound_dev_if != dif) continue; if (sk_laddr && !ipv6_addr_any(sk_laddr) && !ipv6_addr_any(laddr) && !ipv6_addr_equal(sk_laddr, laddr)) continue; if (!ipv6_addr_any(sk_raddr) && raddr && !ipv6_addr_any(raddr) && !ipv6_addr_equal(sk_raddr, raddr)) continue; if (l2tp->conn_id != tunnel_id) continue; goto found; } sk = NULL; found: return sk; } /* When processing receive frames, there are two cases to * consider. Data frames consist of a non-zero session-id and an * optional cookie. Control frames consist of a regular L2TP header * preceded by 32-bits of zeros. * * L2TPv3 Session Header Over IP * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Session ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Cookie (optional, maximum 64 bits)... * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * L2TPv3 Control Message Header Over IP * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | (32 bits of zeros) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |T|L|x|x|S|x|x|x|x|x|x|x| Ver | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Control Connection ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Ns | Nr | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * All control frames are passed to userspace. */ static int l2tp_ip6_recv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct l2tp_ip6_net *pn; struct sock *sk; u32 session_id; u32 tunnel_id; unsigned char *ptr, *optr; struct l2tp_session *session; struct l2tp_tunnel *tunnel = NULL; struct ipv6hdr *iph; pn = l2tp_ip6_pernet(net); if (!pskb_may_pull(skb, 4)) goto discard; /* Point to L2TP header */ optr = skb->data; ptr = skb->data; session_id = ntohl(*((__be32 *)ptr)); ptr += 4; /* RFC3931: L2TP/IP packets have the first 4 bytes containing * the session_id. If it is 0, the packet is a L2TP control * frame and the session_id value can be discarded. */ if (session_id == 0) { __skb_pull(skb, 4); goto pass_up; } /* Ok, this is a data packet. Lookup the session. */ session = l2tp_v3_session_get(net, NULL, session_id); if (!session) goto discard; tunnel = session->tunnel; if (!tunnel) goto discard_sess; if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) goto discard_sess; l2tp_recv_common(session, skb, ptr, optr, 0, skb->len); l2tp_session_put(session); return 0; pass_up: /* Get the tunnel_id from the L2TP header */ if (!pskb_may_pull(skb, 12)) goto discard; if ((skb->data[0] & 0xc0) != 0xc0) goto discard; tunnel_id = ntohl(*(__be32 *)&skb->data[4]); iph = ipv6_hdr(skb); read_lock_bh(&pn->l2tp_ip6_lock); sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, &iph->saddr, inet6_iif(skb), tunnel_id); if (!sk) { read_unlock_bh(&pn->l2tp_ip6_lock); goto discard; } sock_hold(sk); read_unlock_bh(&pn->l2tp_ip6_lock); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; nf_reset_ct(skb); return sk_receive_skb(sk, skb, 1); discard_sess: l2tp_session_put(session); goto discard; discard_put: sock_put(sk); discard: kfree_skb(skb); return 0; } static int l2tp_ip6_hash(struct sock *sk) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk)); if (sk_unhashed(sk)) { write_lock_bh(&pn->l2tp_ip6_lock); sk_add_node(sk, &pn->l2tp_ip6_table); write_unlock_bh(&pn->l2tp_ip6_lock); } return 0; } static void l2tp_ip6_unhash(struct sock *sk) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk)); if (sk_unhashed(sk)) return; write_lock_bh(&pn->l2tp_ip6_lock); sk_del_node_init(sk); write_unlock_bh(&pn->l2tp_ip6_lock); } static int l2tp_ip6_open(struct sock *sk) { /* Prevent autobind. We don't have ports. */ inet_sk(sk)->inet_num = IPPROTO_L2TP; l2tp_ip6_hash(sk); return 0; } static void l2tp_ip6_close(struct sock *sk, long timeout) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk)); write_lock_bh(&pn->l2tp_ip6_lock); hlist_del_init(&sk->sk_bind_node); sk_del_node_init(sk); write_unlock_bh(&pn->l2tp_ip6_lock); sk_common_release(sk); } static void l2tp_ip6_destroy_sock(struct sock *sk) { struct l2tp_tunnel *tunnel; lock_sock(sk); ip6_flush_pending_frames(sk); release_sock(sk); tunnel = l2tp_sk_to_tunnel(sk); if (tunnel) { l2tp_tunnel_delete(tunnel); l2tp_tunnel_put(tunnel); } } static int l2tp_ip6_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *)uaddr; struct net *net = sock_net(sk); struct l2tp_ip6_net *pn; __be32 v4addr = 0; int bound_dev_if; int addr_type; int err; pn = l2tp_ip6_pernet(net); if (addr->l2tp_family != AF_INET6) return -EINVAL; if (addr_len < sizeof(*addr)) return -EINVAL; addr_type = ipv6_addr_type(&addr->l2tp_addr); /* l2tp_ip6 sockets are IPv6 only */ if (addr_type == IPV6_ADDR_MAPPED) return -EADDRNOTAVAIL; /* L2TP is point-point, not multicast */ if (addr_type & IPV6_ADDR_MULTICAST) return -EADDRNOTAVAIL; lock_sock(sk); err = -EINVAL; if (!sock_flag(sk, SOCK_ZAPPED)) goto out_unlock; if (sk->sk_state != TCP_CLOSE) goto out_unlock; bound_dev_if = sk->sk_bound_dev_if; /* Check if the address belongs to the host. */ rcu_read_lock(); if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; if (addr_type & IPV6_ADDR_LINKLOCAL) { if (addr->l2tp_scope_id) bound_dev_if = addr->l2tp_scope_id; /* Binding to link-local address requires an * interface. */ if (!bound_dev_if) goto out_unlock_rcu; err = -ENODEV; dev = dev_get_by_index_rcu(sock_net(sk), bound_dev_if); if (!dev) goto out_unlock_rcu; } /* ipv4 addr of the socket is invalid. Only the * unspecified and mapped address have a v4 equivalent. */ v4addr = LOOPBACK4_IPV6; err = -EADDRNOTAVAIL; if (!ipv6_chk_addr(sock_net(sk), &addr->l2tp_addr, dev, 0)) goto out_unlock_rcu; } rcu_read_unlock(); write_lock_bh(&pn->l2tp_ip6_lock); if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, NULL, bound_dev_if, addr->l2tp_conn_id)) { write_unlock_bh(&pn->l2tp_ip6_lock); err = -EADDRINUSE; goto out_unlock; } inet->inet_saddr = v4addr; inet->inet_rcv_saddr = v4addr; sk->sk_bound_dev_if = bound_dev_if; sk->sk_v6_rcv_saddr = addr->l2tp_addr; np->saddr = addr->l2tp_addr; l2tp_ip6_sk(sk)->conn_id = addr->l2tp_conn_id; sk_add_bind_node(sk, &pn->l2tp_ip6_bind_table); sk_del_node_init(sk); write_unlock_bh(&pn->l2tp_ip6_lock); sock_reset_flag(sk, SOCK_ZAPPED); release_sock(sk); return 0; out_unlock_rcu: rcu_read_unlock(); out_unlock: release_sock(sk); return err; } static int l2tp_ip6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr; struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr; struct in6_addr *daddr; int addr_type; int rc; struct l2tp_ip6_net *pn; if (addr_len < sizeof(*lsa)) return -EINVAL; if (usin->sin6_family != AF_INET6) return -EINVAL; addr_type = ipv6_addr_type(&usin->sin6_addr); if (addr_type & IPV6_ADDR_MULTICAST) return -EINVAL; if (addr_type & IPV6_ADDR_MAPPED) { daddr = &usin->sin6_addr; if (ipv4_is_multicast(daddr->s6_addr32[3])) return -EINVAL; } lock_sock(sk); /* Must bind first - autobinding does not work */ if (sock_flag(sk, SOCK_ZAPPED)) { rc = -EINVAL; goto out_sk; } rc = __ip6_datagram_connect(sk, uaddr, addr_len); if (rc < 0) goto out_sk; l2tp_ip6_sk(sk)->peer_conn_id = lsa->l2tp_conn_id; pn = l2tp_ip6_pernet(sock_net(sk)); write_lock_bh(&pn->l2tp_ip6_lock); hlist_del_init(&sk->sk_bind_node); sk_add_bind_node(sk, &pn->l2tp_ip6_bind_table); write_unlock_bh(&pn->l2tp_ip6_lock); out_sk: release_sock(sk); return rc; } static int l2tp_ip6_disconnect(struct sock *sk, int flags) { if (sock_flag(sk, SOCK_ZAPPED)) return 0; return __udp_disconnect(sk, flags); } static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr; struct sock *sk = sock->sk; struct ipv6_pinfo *np = inet6_sk(sk); struct l2tp_ip6_sock *lsk = l2tp_ip6_sk(sk); lsa->l2tp_family = AF_INET6; lsa->l2tp_flowinfo = 0; lsa->l2tp_scope_id = 0; lsa->l2tp_unused = 0; if (peer) { if (!lsk->peer_conn_id) return -ENOTCONN; lsa->l2tp_conn_id = lsk->peer_conn_id; lsa->l2tp_addr = sk->sk_v6_daddr; if (inet6_test_bit(SNDFLOW, sk)) lsa->l2tp_flowinfo = np->flow_label; } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) lsa->l2tp_addr = np->saddr; else lsa->l2tp_addr = sk->sk_v6_rcv_saddr; lsa->l2tp_conn_id = lsk->conn_id; } if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL) lsa->l2tp_scope_id = READ_ONCE(sk->sk_bound_dev_if); return sizeof(*lsa); } static int l2tp_ip6_backlog_recv(struct sock *sk, struct sk_buff *skb) { int rc; /* Charge it to the socket, dropping if the queue is full. */ rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) goto drop; return 0; drop: IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS); kfree_skb(skb); return -1; } static int l2tp_ip6_push_pending_frames(struct sock *sk) { struct sk_buff *skb; __be32 *transhdr = NULL; int err = 0; skb = skb_peek(&sk->sk_write_queue); if (!skb) goto out; transhdr = (__be32 *)skb_transport_header(skb); *transhdr = 0; err = ip6_push_pending_frames(sk); out: return err; } /* Userspace will call sendmsg() on the tunnel socket to send L2TP * control frames. */ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct ipv6_txoptions opt_space; DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; struct flowi6 fl6; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; int transhdrlen = 4; /* zero session-id */ int ulen; int err; /* Rough check on arithmetic overflow, * better check is made in ip6_append_data(). */ if (len > INT_MAX - transhdrlen) return -EMSGSIZE; /* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; /* Get and verify the address */ memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = READ_ONCE(sk->sk_mark); fl6.flowi6_uid = sk_uid(sk); ipcm6_init_sk(&ipc6, sk); if (lsa) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (lsa->l2tp_family && lsa->l2tp_family != AF_INET6) return -EAFNOSUPPORT; daddr = &lsa->l2tp_addr; if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = lsa->l2tp_flowinfo & IPV6_FLOWINFO_MASK; if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } } /* Otherwise it will be difficult to maintain * sk->sk_dst_cache. */ if (sk->sk_state == TCP_ESTABLISHED && ipv6_addr_equal(daddr, &sk->sk_v6_daddr)) daddr = &sk->sk_v6_daddr; if (addr_len >= sizeof(struct sockaddr_in6) && lsa->l2tp_scope_id && ipv6_addr_type(daddr) & IPV6_ADDR_LINKLOCAL) fl6.flowi6_oif = lsa->l2tp_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = &sk->sk_v6_daddr; fl6.flowlabel = np->flow_label; } if (fl6.flowi6_oif == 0) fl6.flowi6_oif = READ_ONCE(sk->sk_bound_dev_if); if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); ipc6.opt = opt; err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; } if ((fl6.flowlabel & IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen | opt->opt_flen)) opt = NULL; } if (!opt) { opt = txopt_get(np); opt_to_free = opt; } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); ipc6.opt = opt; fl6.flowi6_proto = sk->sk_protocol; if (!ipv6_addr_any(daddr)) fl6.daddr = *daddr; else fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) fl6.saddr = np->saddr; final_p = fl6_update_dst(&fl6, opt, &final); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) fl6.flowi6_oif = READ_ONCE(np->ucast_oif); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; } if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; back_from_confirm: lock_sock(sk); ulen = len + (skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0); err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen, transhdrlen, &ipc6, &fl6, dst_rt6_info(dst), msg->msg_flags); if (err) ip6_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) err = l2tp_ip6_push_pending_frames(sk); release_sock(sk); done: dst_release(dst); out: fl6_sock_release(flowlabel); txopt_put(opt_to_free); return err < 0 ? err : len; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(dst, &fl6.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; goto done; } static int l2tp_ip6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); size_t copied = 0; int err = -EOPNOTSUPP; struct sk_buff *skb; if (flags & MSG_OOB) goto out; if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_timestamp(msg, sk, skb); /* Copy the address. */ if (lsa) { lsa->l2tp_family = AF_INET6; lsa->l2tp_unused = 0; lsa->l2tp_addr = ipv6_hdr(skb)->saddr; lsa->l2tp_flowinfo = 0; lsa->l2tp_scope_id = 0; lsa->l2tp_conn_id = 0; if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL) lsa->l2tp_scope_id = inet6_iif(skb); *addr_len = sizeof(*lsa); } if (np->rxopt.all) ip6_datagram_recv_ctl(sk, msg, skb); if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: return err ? err : copied; } static struct proto l2tp_ip6_prot = { .name = "L2TP/IPv6", .owner = THIS_MODULE, .init = l2tp_ip6_open, .close = l2tp_ip6_close, .bind = l2tp_ip6_bind, .connect = l2tp_ip6_connect, .disconnect = l2tp_ip6_disconnect, .ioctl = l2tp_ioctl, .destroy = l2tp_ip6_destroy_sock, .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .sendmsg = l2tp_ip6_sendmsg, .recvmsg = l2tp_ip6_recvmsg, .backlog_rcv = l2tp_ip6_backlog_recv, .hash = l2tp_ip6_hash, .unhash = l2tp_ip6_unhash, .obj_size = sizeof(struct l2tp_ip6_sock), .ipv6_pinfo_offset = offsetof(struct l2tp_ip6_sock, inet6), }; static const struct proto_ops l2tp_ip6_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = l2tp_ip6_getname, .poll = datagram_poll, .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif }; static struct inet_protosw l2tp_ip6_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_L2TP, .prot = &l2tp_ip6_prot, .ops = &l2tp_ip6_ops, }; static struct inet6_protocol l2tp_ip6_protocol __read_mostly = { .handler = l2tp_ip6_recv, }; static __net_init int l2tp_ip6_init_net(struct net *net) { struct l2tp_ip6_net *pn = net_generic(net, l2tp_ip6_net_id); rwlock_init(&pn->l2tp_ip6_lock); INIT_HLIST_HEAD(&pn->l2tp_ip6_table); INIT_HLIST_HEAD(&pn->l2tp_ip6_bind_table); return 0; } static __net_exit void l2tp_ip6_exit_net(struct net *net) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(net); write_lock_bh(&pn->l2tp_ip6_lock); WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip6_table) != 0); WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip6_bind_table) != 0); write_unlock_bh(&pn->l2tp_ip6_lock); } static struct pernet_operations l2tp_ip6_net_ops = { .init = l2tp_ip6_init_net, .exit = l2tp_ip6_exit_net, .id = &l2tp_ip6_net_id, .size = sizeof(struct l2tp_ip6_net), }; static int __init l2tp_ip6_init(void) { int err; pr_info("L2TP IP encapsulation support for IPv6 (L2TPv3)\n"); err = register_pernet_device(&l2tp_ip6_net_ops); if (err) goto out; err = proto_register(&l2tp_ip6_prot, 1); if (err != 0) goto out1; err = inet6_add_protocol(&l2tp_ip6_protocol, IPPROTO_L2TP); if (err) goto out2; inet6_register_protosw(&l2tp_ip6_protosw); return 0; out2: proto_unregister(&l2tp_ip6_prot); out1: unregister_pernet_device(&l2tp_ip6_net_ops); out: return err; } static void __exit l2tp_ip6_exit(void) { inet6_unregister_protosw(&l2tp_ip6_protosw); inet6_del_protocol(&l2tp_ip6_protocol, IPPROTO_L2TP); proto_unregister(&l2tp_ip6_prot); unregister_pernet_device(&l2tp_ip6_net_ops); } module_init(l2tp_ip6_init); module_exit(l2tp_ip6_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Chris Elston <celston@katalix.com>"); MODULE_DESCRIPTION("L2TP IP encapsulation for IPv6"); MODULE_VERSION("1.0"); /* Use the values of SOCK_DGRAM (2) as type and IPPROTO_L2TP (115) as protocol, * because __stringify doesn't like enums */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 115, 2); MODULE_ALIAS_NET_PF_PROTO(PF_INET6, 115);
13 10 5 2 3 2 1 10 10 10 10 10 10 13 13 13 13 10 10 10 10 10 5 4 1 4 4 1 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 // SPDX-License-Identifier: GPL-2.0 /* * To speed up listener socket lookup, create an array to store all sockets * listening on the same port. This allows a decision to be made after finding * the first socket. An optional BPF program can also be configured for * selecting the socket index from the array of available sockets. */ #include <net/ip.h> #include <net/sock_reuseport.h> #include <linux/bpf.h> #include <linux/idr.h> #include <linux/filter.h> #include <linux/rcupdate.h> #define INIT_SOCKS 128 DEFINE_SPINLOCK(reuseport_lock); static DEFINE_IDA(reuseport_ida); static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, struct sock_reuseport *reuse, bool bind_inany); void reuseport_has_conns_set(struct sock *sk) { struct sock_reuseport *reuse; if (!rcu_access_pointer(sk->sk_reuseport_cb)) return; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); if (likely(reuse)) reuse->has_conns = 1; spin_unlock_bh(&reuseport_lock); } EXPORT_SYMBOL(reuseport_has_conns_set); static void __reuseport_get_incoming_cpu(struct sock_reuseport *reuse) { /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */ WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu + 1); } static void __reuseport_put_incoming_cpu(struct sock_reuseport *reuse) { /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */ WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu - 1); } static void reuseport_get_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse) { if (sk->sk_incoming_cpu >= 0) __reuseport_get_incoming_cpu(reuse); } static void reuseport_put_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse) { if (sk->sk_incoming_cpu >= 0) __reuseport_put_incoming_cpu(reuse); } void reuseport_update_incoming_cpu(struct sock *sk, int val) { struct sock_reuseport *reuse; int old_sk_incoming_cpu; if (unlikely(!rcu_access_pointer(sk->sk_reuseport_cb))) { /* Paired with REAE_ONCE() in sk_incoming_cpu_update() * and compute_score(). */ WRITE_ONCE(sk->sk_incoming_cpu, val); return; } spin_lock_bh(&reuseport_lock); /* This must be done under reuseport_lock to avoid a race with * reuseport_grow(), which accesses sk->sk_incoming_cpu without * lock_sock() when detaching a shutdown()ed sk. * * Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */ old_sk_incoming_cpu = sk->sk_incoming_cpu; WRITE_ONCE(sk->sk_incoming_cpu, val); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); /* reuseport_grow() has detached a closed sk. */ if (!reuse) goto out; if (old_sk_incoming_cpu < 0 && val >= 0) __reuseport_get_incoming_cpu(reuse); else if (old_sk_incoming_cpu >= 0 && val < 0) __reuseport_put_incoming_cpu(reuse); out: spin_unlock_bh(&reuseport_lock); } static int reuseport_sock_index(struct sock *sk, const struct sock_reuseport *reuse, bool closed) { int left, right; if (!closed) { left = 0; right = reuse->num_socks; } else { left = reuse->max_socks - reuse->num_closed_socks; right = reuse->max_socks; } for (; left < right; left++) if (reuse->socks[left] == sk) return left; return -1; } static void __reuseport_add_sock(struct sock *sk, struct sock_reuseport *reuse) { reuse->socks[reuse->num_socks] = sk; /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */ smp_wmb(); reuse->num_socks++; reuseport_get_incoming_cpu(sk, reuse); } static bool __reuseport_detach_sock(struct sock *sk, struct sock_reuseport *reuse) { int i = reuseport_sock_index(sk, reuse, false); if (i == -1) return false; reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; reuse->num_socks--; reuseport_put_incoming_cpu(sk, reuse); return true; } static void __reuseport_add_closed_sock(struct sock *sk, struct sock_reuseport *reuse) { reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk; /* paired with READ_ONCE() in inet_csk_bind_conflict() */ WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1); reuseport_get_incoming_cpu(sk, reuse); } static bool __reuseport_detach_closed_sock(struct sock *sk, struct sock_reuseport *reuse) { int i = reuseport_sock_index(sk, reuse, true); if (i == -1) return false; reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; /* paired with READ_ONCE() in inet_csk_bind_conflict() */ WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1); reuseport_put_incoming_cpu(sk, reuse); return true; } static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { struct sock_reuseport *reuse; reuse = kzalloc_flex(*reuse, socks, max_socks, GFP_ATOMIC); if (!reuse) return NULL; reuse->max_socks = max_socks; RCU_INIT_POINTER(reuse->prog, NULL); return reuse; } int reuseport_alloc(struct sock *sk, bool bind_inany) { struct sock_reuseport *reuse; int id, ret = 0; /* bh lock used since this function call may precede hlist lock in * soft irq of receive path or setsockopt from process context */ spin_lock_bh(&reuseport_lock); /* Allocation attempts can occur concurrently via the setsockopt path * and the bind/hash path. Nothing to do when we lose the race. */ reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); if (reuse) { if (reuse->num_closed_socks) { /* sk was shutdown()ed before */ ret = reuseport_resurrect(sk, reuse, NULL, bind_inany); goto out; } /* Only set reuse->bind_inany if the bind_inany is true. * Otherwise, it will overwrite the reuse->bind_inany * which was set by the bind/hash path. */ if (bind_inany) reuse->bind_inany = bind_inany; goto out; } reuse = __reuseport_alloc(INIT_SOCKS); if (!reuse) { ret = -ENOMEM; goto out; } id = ida_alloc(&reuseport_ida, GFP_ATOMIC); if (id < 0) { kfree(reuse); ret = id; goto out; } reuse->reuseport_id = id; reuse->bind_inany = bind_inany; reuse->socks[0] = sk; reuse->num_socks = 1; reuseport_get_incoming_cpu(sk, reuse); rcu_assign_pointer(sk->sk_reuseport_cb, reuse); out: spin_unlock_bh(&reuseport_lock); return ret; } EXPORT_SYMBOL(reuseport_alloc); static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) { struct sock_reuseport *more_reuse; u32 more_socks_size, i; more_socks_size = reuse->max_socks * 2U; if (more_socks_size > U16_MAX) { if (reuse->num_closed_socks) { /* Make room by removing a closed sk. * The child has already been migrated. * Only reqsk left at this point. */ struct sock *sk; sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL); __reuseport_detach_closed_sock(sk, reuse); return reuse; } return NULL; } more_reuse = __reuseport_alloc(more_socks_size); if (!more_reuse) return NULL; more_reuse->num_socks = reuse->num_socks; more_reuse->num_closed_socks = reuse->num_closed_socks; more_reuse->prog = reuse->prog; more_reuse->reuseport_id = reuse->reuseport_id; more_reuse->bind_inany = reuse->bind_inany; more_reuse->has_conns = reuse->has_conns; more_reuse->incoming_cpu = reuse->incoming_cpu; memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); memcpy(more_reuse->socks + (more_reuse->max_socks - more_reuse->num_closed_socks), reuse->socks + (reuse->max_socks - reuse->num_closed_socks), reuse->num_closed_socks * sizeof(struct sock *)); more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts); for (i = 0; i < reuse->max_socks; ++i) rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, more_reuse); /* Note: we use kfree_rcu here instead of reuseport_free_rcu so * that reuse and more_reuse can temporarily share a reference * to prog. */ kfree_rcu(reuse, rcu); return more_reuse; } static void reuseport_free_rcu(struct rcu_head *head) { struct sock_reuseport *reuse; reuse = container_of(head, struct sock_reuseport, rcu); sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1)); ida_free(&reuseport_ida, reuse->reuseport_id); kfree(reuse); } /** * reuseport_add_sock - Add a socket to the reuseport group of another. * @sk: New socket to add to the group. * @sk2: Socket belonging to the existing reuseport group. * @bind_inany: Whether or not the group is bound to a local INANY address. * * May return ENOMEM and not add socket to group under memory pressure. */ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) { struct sock_reuseport *old_reuse, *reuse; if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { int err = reuseport_alloc(sk2, bind_inany); if (err) return err; } spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); if (old_reuse && old_reuse->num_closed_socks) { /* sk was shutdown()ed before */ int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany); spin_unlock_bh(&reuseport_lock); return err; } if (old_reuse && old_reuse->num_socks != 1) { spin_unlock_bh(&reuseport_lock); return -EBUSY; } if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { reuse = reuseport_grow(reuse); if (!reuse) { spin_unlock_bh(&reuseport_lock); return -ENOMEM; } } __reuseport_add_sock(sk, reuse); rcu_assign_pointer(sk->sk_reuseport_cb, reuse); spin_unlock_bh(&reuseport_lock); if (old_reuse) call_rcu(&old_reuse->rcu, reuseport_free_rcu); return 0; } EXPORT_SYMBOL(reuseport_add_sock); static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, struct sock_reuseport *reuse, bool bind_inany) { if (old_reuse == reuse) { /* If sk was in the same reuseport group, just pop sk out of * the closed section and push sk into the listening section. */ __reuseport_detach_closed_sock(sk, old_reuse); __reuseport_add_sock(sk, old_reuse); return 0; } if (!reuse) { /* In bind()/listen() path, we cannot carry over the eBPF prog * for the shutdown()ed socket. In setsockopt() path, we should * not change the eBPF prog of listening sockets by attaching a * prog to the shutdown()ed socket. Thus, we will allocate a new * reuseport group and detach sk from the old group. */ int id; reuse = __reuseport_alloc(INIT_SOCKS); if (!reuse) return -ENOMEM; id = ida_alloc(&reuseport_ida, GFP_ATOMIC); if (id < 0) { kfree(reuse); return id; } reuse->reuseport_id = id; reuse->bind_inany = bind_inany; } else { /* Move sk from the old group to the new one if * - all the other listeners in the old group were close()d or * shutdown()ed, and then sk2 has listen()ed on the same port * OR * - sk listen()ed without bind() (or with autobind), was * shutdown()ed, and then listen()s on another port which * sk2 listen()s on. */ if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { reuse = reuseport_grow(reuse); if (!reuse) return -ENOMEM; } } __reuseport_detach_closed_sock(sk, old_reuse); __reuseport_add_sock(sk, reuse); rcu_assign_pointer(sk->sk_reuseport_cb, reuse); if (old_reuse->num_socks + old_reuse->num_closed_socks == 0) call_rcu(&old_reuse->rcu, reuseport_free_rcu); return 0; } void reuseport_detach_sock(struct sock *sk) { struct sock_reuseport *reuse; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); /* reuseport_grow() has detached a closed sk */ if (!reuse) goto out; /* Notify the bpf side. The sk may be added to a sockarray * map. If so, sockarray logic will remove it from the map. * * Other bpf map types that work with reuseport, like sockmap, * don't need an explicit callback from here. They override sk * unhash/close ops to remove the sk from the map before we * get to this point. */ bpf_sk_reuseport_detach(sk); rcu_assign_pointer(sk->sk_reuseport_cb, NULL); if (!__reuseport_detach_closed_sock(sk, reuse)) __reuseport_detach_sock(sk, reuse); if (reuse->num_socks + reuse->num_closed_socks == 0) call_rcu(&reuse->rcu, reuseport_free_rcu); out: spin_unlock_bh(&reuseport_lock); } EXPORT_SYMBOL(reuseport_detach_sock); void reuseport_stop_listen_sock(struct sock *sk) { if (sk->sk_protocol == IPPROTO_TCP) { struct sock_reuseport *reuse; struct bpf_prog *prog; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); prog = rcu_dereference_protected(reuse->prog, lockdep_is_held(&reuseport_lock)); if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req) || (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) { /* Migration capable, move sk from the listening section * to the closed section. */ bpf_sk_reuseport_detach(sk); __reuseport_detach_sock(sk, reuse); __reuseport_add_closed_sock(sk, reuse); spin_unlock_bh(&reuseport_lock); return; } spin_unlock_bh(&reuseport_lock); } /* Not capable to do migration, detach immediately */ reuseport_detach_sock(sk); } EXPORT_SYMBOL(reuseport_stop_listen_sock); static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, struct bpf_prog *prog, struct sk_buff *skb, int hdr_len) { struct sk_buff *nskb = NULL; u32 index; if (skb_shared(skb)) { nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return NULL; skb = nskb; } /* temporarily advance data past protocol header */ if (!pskb_pull(skb, hdr_len)) { kfree_skb(nskb); return NULL; } index = bpf_prog_run_save_cb(prog, skb); __skb_push(skb, hdr_len); consume_skb(nskb); if (index >= socks) return NULL; return reuse->socks[index]; } static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse, u32 hash, u16 num_socks) { struct sock *first_valid_sk = NULL; int i, j; i = j = reciprocal_scale(hash, num_socks); do { struct sock *sk = reuse->socks[i]; if (sk->sk_state != TCP_ESTABLISHED) { /* Paired with WRITE_ONCE() in __reuseport_(get|put)_incoming_cpu(). */ if (!READ_ONCE(reuse->incoming_cpu)) return sk; /* Paired with WRITE_ONCE() in reuseport_update_incoming_cpu(). */ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) return sk; if (!first_valid_sk) first_valid_sk = sk; } i++; if (i >= num_socks) i = 0; } while (i != j); return first_valid_sk; } /** * reuseport_select_sock - Select a socket from an SO_REUSEPORT group. * @sk: First socket in the group. * @hash: When no BPF filter is available, use this hash to select. * @skb: skb to run through BPF filter. * @hdr_len: BPF filter expects skb data pointer at payload data. If * the skb does not yet point at the payload, this parameter represents * how far the pointer needs to advance to reach the payload. * Returns a socket that should receive the packet (or NULL on error). */ struct sock *reuseport_select_sock(struct sock *sk, u32 hash, struct sk_buff *skb, int hdr_len) { struct sock_reuseport *reuse; struct bpf_prog *prog; struct sock *sk2 = NULL; u16 socks; rcu_read_lock(); reuse = rcu_dereference(sk->sk_reuseport_cb); /* if memory allocation failed or add call is not yet complete */ if (!reuse) goto out; prog = rcu_dereference(reuse->prog); socks = READ_ONCE(reuse->num_socks); if (likely(socks)) { /* paired with smp_wmb() in __reuseport_add_sock() */ smp_rmb(); if (!prog || !skb) goto select_by_hash; if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash); else sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len); select_by_hash: /* no bpf or invalid bpf result: fall back to hash usage */ if (!sk2) sk2 = reuseport_select_sock_by_hash(reuse, hash, socks); } out: rcu_read_unlock(); return sk2; } EXPORT_SYMBOL(reuseport_select_sock); /** * reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group. * @sk: close()ed or shutdown()ed socket in the group. * @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or * NEW_SYN_RECV request socket during 3WHS. * @skb: skb to run through BPF filter. * Returns a socket (with sk_refcnt +1) that should accept the child socket * (or NULL on error). */ struct sock *reuseport_migrate_sock(struct sock *sk, struct sock *migrating_sk, struct sk_buff *skb) { struct sock_reuseport *reuse; struct sock *nsk = NULL; bool allocated = false; struct bpf_prog *prog; u16 socks; u32 hash; rcu_read_lock(); reuse = rcu_dereference(sk->sk_reuseport_cb); if (!reuse) goto out; socks = READ_ONCE(reuse->num_socks); if (unlikely(!socks)) goto failure; /* paired with smp_wmb() in __reuseport_add_sock() */ smp_rmb(); hash = migrating_sk->sk_hash; prog = rcu_dereference(reuse->prog); if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) { if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req)) goto select_by_hash; goto failure; } if (!skb) { skb = alloc_skb(0, GFP_ATOMIC); if (!skb) goto failure; allocated = true; } nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash); if (allocated) kfree_skb(skb); select_by_hash: if (!nsk) nsk = reuseport_select_sock_by_hash(reuse, hash, socks); if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) { nsk = NULL; goto failure; } out: rcu_read_unlock(); return nsk; failure: __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); goto out; } EXPORT_SYMBOL(reuseport_migrate_sock); int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) { struct sock_reuseport *reuse; struct bpf_prog *old_prog; if (sk_unhashed(sk)) { int err; if (!sk->sk_reuseport) return -EINVAL; err = reuseport_alloc(sk, false); if (err) return err; } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { /* The socket wasn't bound with SO_REUSEPORT */ return -EINVAL; } spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); old_prog = rcu_dereference_protected(reuse->prog, lockdep_is_held(&reuseport_lock)); rcu_assign_pointer(reuse->prog, prog); spin_unlock_bh(&reuseport_lock); sk_reuseport_prog_free(old_prog); return 0; } EXPORT_SYMBOL(reuseport_attach_prog); int reuseport_detach_prog(struct sock *sk) { struct sock_reuseport *reuse; struct bpf_prog *old_prog; old_prog = NULL; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); /* reuse must be checked after acquiring the reuseport_lock * because reuseport_grow() can detach a closed sk. */ if (!reuse) { spin_unlock_bh(&reuseport_lock); return sk->sk_reuseport ? -ENOENT : -EINVAL; } if (sk_unhashed(sk) && reuse->num_closed_socks) { spin_unlock_bh(&reuseport_lock); return -ENOENT; } old_prog = rcu_replace_pointer(reuse->prog, old_prog, lockdep_is_held(&reuseport_lock)); spin_unlock_bh(&reuseport_lock); if (!old_prog) return -ENOENT; sk_reuseport_prog_free(old_prog); return 0; } EXPORT_SYMBOL(reuseport_detach_prog);
15 3 12 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 // SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC key management * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * RxRPC keys should have a description of describing their purpose: * "afs@CAMBRIDGE.REDHAT.COM> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/skcipher.h> #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/key-type.h> #include <linux/ctype.h> #include <linux/slab.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <keys/rxrpc-type.h> #include <keys/user-type.h> #include "ar-internal.h" static int rxrpc_vet_description_s(const char *); static int rxrpc_preparse_s(struct key_preparsed_payload *); static void rxrpc_free_preparse_s(struct key_preparsed_payload *); static void rxrpc_destroy_s(struct key *); static void rxrpc_describe_s(const struct key *, struct seq_file *); /* * rxrpc server keys take "<serviceId>:<securityIndex>[:<sec-specific>]" as the * description and the key material as the payload. */ struct key_type key_type_rxrpc_s = { .name = "rxrpc_s", .flags = KEY_TYPE_NET_DOMAIN, .vet_description = rxrpc_vet_description_s, .preparse = rxrpc_preparse_s, .free_preparse = rxrpc_free_preparse_s, .instantiate = generic_key_instantiate, .destroy = rxrpc_destroy_s, .describe = rxrpc_describe_s, }; /* * Vet the description for an RxRPC server key. */ static int rxrpc_vet_description_s(const char *desc) { unsigned long service, sec_class; char *p; service = simple_strtoul(desc, &p, 10); if (*p != ':' || service > 65535) return -EINVAL; sec_class = simple_strtoul(p + 1, &p, 10); if ((*p && *p != ':') || sec_class < 1 || sec_class > 255) return -EINVAL; return 0; } /* * Preparse a server secret key. */ static int rxrpc_preparse_s(struct key_preparsed_payload *prep) { const struct rxrpc_security *sec; unsigned int service, sec_class; int n; _enter("%zu", prep->datalen); if (!prep->orig_description) return -EINVAL; if (sscanf(prep->orig_description, "%u:%u%n", &service, &sec_class, &n) != 2) return -EINVAL; sec = rxrpc_security_lookup(sec_class); if (!sec) return -ENOPKG; prep->payload.data[1] = (struct rxrpc_security *)sec; if (!sec->preparse_server_key) return -EINVAL; return sec->preparse_server_key(prep); } static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep) { const struct rxrpc_security *sec = prep->payload.data[1]; if (sec && sec->free_preparse_server_key) sec->free_preparse_server_key(prep); } static void rxrpc_destroy_s(struct key *key) { const struct rxrpc_security *sec = key->payload.data[1]; if (sec && sec->destroy_server_key) sec->destroy_server_key(key); } static void rxrpc_describe_s(const struct key *key, struct seq_file *m) { const struct rxrpc_security *sec = key->payload.data[1]; seq_puts(m, key->description); if (sec && sec->describe_server_key) sec->describe_server_key(key, m); } /* * grab the security keyring for a server socket */ int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) { struct key *key; char *description; _enter(""); if (optlen <= 0 || optlen > PAGE_SIZE - 1) return -EINVAL; description = memdup_sockptr_nul(optval, optlen); if (IS_ERR(description)) return PTR_ERR(description); key = request_key(&key_type_keyring, description, NULL); if (IS_ERR(key)) { kfree(description); _leave(" = %ld", PTR_ERR(key)); return PTR_ERR(key); } rx->securities = key; kfree(description); _leave(" = 0 [key %x]", key->serial); return 0; } /** * rxrpc_sock_set_security_keyring - Set the security keyring for a kernel service * @sk: The socket to set the keyring on * @keyring: The keyring to set * * Set the server security keyring on an rxrpc socket. This is used to provide * the encryption keys for a kernel service. * * Return: %0 if successful and a negative error code otherwise. */ int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring) { struct rxrpc_sock *rx = rxrpc_sk(sk); int ret = 0; lock_sock(sk); if (rx->securities) ret = -EINVAL; else if (rx->sk.sk_state != RXRPC_UNBOUND) ret = -EISCONN; else rx->securities = key_get(keyring); release_sock(sk); return ret; } EXPORT_SYMBOL(rxrpc_sock_set_security_keyring); /** * rxrpc_sock_set_manage_response - Set the manage-response flag for a kernel service * @sk: The socket to set the keyring on * @set: True to set, false to clear the flag * * Set the flag on an rxrpc socket to say that the caller wants to manage the * RESPONSE packet and the user-defined data it may contain. Setting this * means that recvmsg() will return messages with RXRPC_CHALLENGED in the * control message buffer containing information about the challenge. * * The user should respond to the challenge by passing RXRPC_RESPOND or * RXRPC_RESPOND_ABORT control messages with sendmsg() to the same call. * Supplementary control messages, such as RXRPC_RESP_RXGK_APPDATA, may be * included to indicate the parts the user wants to supply. * * The server will be passed the response data with a RXRPC_RESPONDED control * message when it gets the first data from each call. * * Note that this is only honoured by security classes that need auxiliary data * (e.g. RxGK). Those that don't offer the facility (e.g. RxKAD) respond * without consulting userspace. * * Return: The previous setting. */ int rxrpc_sock_set_manage_response(struct sock *sk, bool set) { struct rxrpc_sock *rx = rxrpc_sk(sk); int ret; lock_sock(sk); ret = !!test_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); if (set) set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); else clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); release_sock(sk); return ret; } EXPORT_SYMBOL(rxrpc_sock_set_manage_response);
25 25 25 25 2 2 2 3 3 3 6 6 6 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2008, Intel Corporation. * * Author: Alexander Duyck <alexander.h.duyck@intel.com> */ #ifndef __NET_TC_SKBEDIT_H #define __NET_TC_SKBEDIT_H #include <net/act_api.h> #include <linux/tc_act/tc_skbedit.h> struct tcf_skbedit_params { int action; u32 flags; u32 priority; u32 mark; u32 mask; u16 queue_mapping; u16 mapping_mod; u16 ptype; struct rcu_head rcu; }; struct tcf_skbedit { struct tc_action common; struct tcf_skbedit_params __rcu *params; }; #define to_skbedit(a) ((struct tcf_skbedit *)a) /* Return true iff action is the one identified by FLAG. */ static inline bool is_tcf_skbedit_with_flag(const struct tc_action *a, u32 flag) { #ifdef CONFIG_NET_CLS_ACT u32 flags; if (a->ops && a->ops->id == TCA_ID_SKBEDIT) { rcu_read_lock(); flags = rcu_dereference(to_skbedit(a)->params)->flags; rcu_read_unlock(); return flags == flag; } #endif return false; } /* Return true iff action is mark */ static inline bool is_tcf_skbedit_mark(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_MARK); } static inline u32 tcf_skbedit_mark(const struct tc_action *a) { u32 mark; rcu_read_lock(); mark = rcu_dereference(to_skbedit(a)->params)->mark; rcu_read_unlock(); return mark; } /* Return true iff action is ptype */ static inline bool is_tcf_skbedit_ptype(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_PTYPE); } static inline u32 tcf_skbedit_ptype(const struct tc_action *a) { u16 ptype; rcu_read_lock(); ptype = rcu_dereference(to_skbedit(a)->params)->ptype; rcu_read_unlock(); return ptype; } /* Return true iff action is priority */ static inline bool is_tcf_skbedit_priority(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_PRIORITY); } static inline u32 tcf_skbedit_priority(const struct tc_action *a) { u32 priority; rcu_read_lock(); priority = rcu_dereference(to_skbedit(a)->params)->priority; rcu_read_unlock(); return priority; } static inline u16 tcf_skbedit_rx_queue_mapping(const struct tc_action *a) { u16 rx_queue; rcu_read_lock(); rx_queue = rcu_dereference(to_skbedit(a)->params)->queue_mapping; rcu_read_unlock(); return rx_queue; } /* Return true iff action is queue_mapping */ static inline bool is_tcf_skbedit_queue_mapping(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_QUEUE_MAPPING); } /* Return true if action is on ingress traffic */ static inline bool is_tcf_skbedit_ingress(u32 flags) { return flags & TCA_ACT_FLAGS_AT_INGRESS; } static inline bool is_tcf_skbedit_tx_queue_mapping(const struct tc_action *a) { return is_tcf_skbedit_queue_mapping(a) && !is_tcf_skbedit_ingress(a->tcfa_flags); } static inline bool is_tcf_skbedit_rx_queue_mapping(const struct tc_action *a) { return is_tcf_skbedit_queue_mapping(a) && is_tcf_skbedit_ingress(a->tcfa_flags); } /* Return true iff action is inheritdsfield */ static inline bool is_tcf_skbedit_inheritdsfield(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_INHERITDSFIELD); } #endif /* __NET_TC_SKBEDIT_H */
17 12 9 17 9 6 2 2 10 10 1 1 6 6 1 5 5 3 2 2 9 2 6 2 10 10 7 7 3 7 7 7 7 7 7 7 2 5 2 5 7 15 15 15 1 15 1 15 3 1 2 2 2 3 4 1 3 9 6 4 2 2 3 15 15 9 9 1 15 15 15 4 1 3 4 9 9 4 8 26 26 1 18 9 34 8 7 8 8 8 8 10 10 10 10 35 27 27 27 19 10 1139 1143 17 17 10 2 1 1 1 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2007 Patrick McHardy <kaber@trash.net> * * The code this is based on carried the following copyright notice: * --- * (C) Copyright 2001-2006 * Alex Zeffertt, Cambridge Broadband Ltd, ajz@cambridgebroadband.com * Re-worked by Ben Greear <greearb@candelatech.com> * --- */ #include <linux/kernel.h> #include <linux/types.h> #include <linux/module.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/rculist.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/net_tstamp.h> #include <linux/ethtool.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include <linux/if_link.h> #include <linux/if_macvlan.h> #include <linux/hash.h> #include <linux/workqueue.h> #include <net/netdev_lock.h> #include <net/rtnetlink.h> #include <net/xfrm.h> #include <linux/netpoll.h> #include <linux/phy.h> #define MACVLAN_HASH_BITS 8 #define MACVLAN_HASH_SIZE (1<<MACVLAN_HASH_BITS) #define MACVLAN_DEFAULT_BC_QUEUE_LEN 1000 #define MACVLAN_F_PASSTHRU 1 #define MACVLAN_F_ADDRCHANGE 2 struct macvlan_port { struct net_device *dev; struct hlist_head vlan_hash[MACVLAN_HASH_SIZE]; struct list_head vlans; struct sk_buff_head bc_queue; struct work_struct bc_work; u32 bc_queue_len_used; int bc_cutoff; u32 flags; int count; struct hlist_head vlan_source_hash[MACVLAN_HASH_SIZE]; DECLARE_BITMAP(bc_filter, MACVLAN_MC_FILTER_SZ); DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ); unsigned char perm_addr[ETH_ALEN]; }; struct macvlan_source_entry { struct hlist_node hlist; struct macvlan_dev __rcu *vlan; unsigned char addr[6+2] __aligned(sizeof(u16)); struct rcu_head rcu; }; struct macvlan_skb_cb { const struct macvlan_dev *src; }; #define MACVLAN_SKB_CB(__skb) ((struct macvlan_skb_cb *)&((__skb)->cb[0])) static void macvlan_port_destroy(struct net_device *dev); static void update_port_bc_queue_len(struct macvlan_port *port); static inline bool macvlan_passthru(const struct macvlan_port *port) { return port->flags & MACVLAN_F_PASSTHRU; } static inline void macvlan_set_passthru(struct macvlan_port *port) { port->flags |= MACVLAN_F_PASSTHRU; } static inline bool macvlan_addr_change(const struct macvlan_port *port) { return port->flags & MACVLAN_F_ADDRCHANGE; } static inline void macvlan_set_addr_change(struct macvlan_port *port) { port->flags |= MACVLAN_F_ADDRCHANGE; } static inline void macvlan_clear_addr_change(struct macvlan_port *port) { port->flags &= ~MACVLAN_F_ADDRCHANGE; } /* Hash Ethernet address */ static u32 macvlan_eth_hash(const unsigned char *addr) { u64 value = get_unaligned((u64 *)addr); /* only want 6 bytes */ #ifdef __BIG_ENDIAN value >>= 16; #else value <<= 16; #endif return hash_64(value, MACVLAN_HASH_BITS); } static struct macvlan_port *macvlan_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } static struct macvlan_port *macvlan_port_get_rtnl(const struct net_device *dev) { return rtnl_dereference(dev->rx_handler_data); } static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port, const unsigned char *addr) { struct macvlan_dev *vlan; u32 idx = macvlan_eth_hash(addr); hlist_for_each_entry_rcu(vlan, &port->vlan_hash[idx], hlist, lockdep_rtnl_is_held()) { if (ether_addr_equal_64bits(vlan->dev->dev_addr, addr)) return vlan; } return NULL; } static struct macvlan_source_entry *macvlan_hash_lookup_source( const struct macvlan_dev *vlan, const unsigned char *addr) { struct macvlan_source_entry *entry; u32 idx = macvlan_eth_hash(addr); struct hlist_head *h = &vlan->port->vlan_source_hash[idx]; hlist_for_each_entry_rcu(entry, h, hlist, lockdep_rtnl_is_held()) { if (ether_addr_equal_64bits(entry->addr, addr) && rcu_access_pointer(entry->vlan) == vlan) return entry; } return NULL; } static int macvlan_hash_add_source(struct macvlan_dev *vlan, const unsigned char *addr) { struct macvlan_port *port = vlan->port; struct macvlan_source_entry *entry; struct hlist_head *h; entry = macvlan_hash_lookup_source(vlan, addr); if (entry) return 0; entry = kmalloc_obj(*entry); if (!entry) return -ENOMEM; ether_addr_copy(entry->addr, addr); RCU_INIT_POINTER(entry->vlan, vlan); h = &port->vlan_source_hash[macvlan_eth_hash(addr)]; hlist_add_head_rcu(&entry->hlist, h); vlan->macaddr_count++; return 0; } static void macvlan_hash_add(struct macvlan_dev *vlan) { struct macvlan_port *port = vlan->port; const unsigned char *addr = vlan->dev->dev_addr; u32 idx = macvlan_eth_hash(addr); hlist_add_head_rcu(&vlan->hlist, &port->vlan_hash[idx]); } static void macvlan_hash_del_source(struct macvlan_source_entry *entry) { RCU_INIT_POINTER(entry->vlan, NULL); hlist_del_rcu(&entry->hlist); kfree_rcu(entry, rcu); } static void macvlan_hash_del(struct macvlan_dev *vlan, bool sync) { hlist_del_rcu(&vlan->hlist); if (sync) synchronize_rcu(); } static void macvlan_hash_change_addr(struct macvlan_dev *vlan, const unsigned char *addr) { macvlan_hash_del(vlan, true); /* Now that we are unhashed it is safe to change the device * address without confusing packet delivery. */ eth_hw_addr_set(vlan->dev, addr); macvlan_hash_add(vlan); } static bool macvlan_addr_busy(const struct macvlan_port *port, const unsigned char *addr) { /* Test to see if the specified address is * currently in use by the underlying device or * another macvlan. */ if (!macvlan_passthru(port) && !macvlan_addr_change(port) && ether_addr_equal_64bits(port->dev->dev_addr, addr)) return true; if (macvlan_hash_lookup(port, addr)) return true; return false; } static int macvlan_broadcast_one(struct sk_buff *skb, const struct macvlan_dev *vlan, const struct ethhdr *eth, bool local) { struct net_device *dev = vlan->dev; if (local) return __dev_forward_skb(dev, skb); skb->dev = dev; if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_MULTICAST; return 0; } static u32 macvlan_hash_mix(const struct macvlan_dev *vlan) { return (u32)(((unsigned long)vlan) >> L1_CACHE_SHIFT); } static unsigned int mc_hash(const struct macvlan_dev *vlan, const unsigned char *addr) { u32 val = get_unaligned((u32 *)(addr + 2)); val ^= macvlan_hash_mix(vlan); return hash_32(val, MACVLAN_MC_FILTER_BITS); } static void macvlan_broadcast(struct sk_buff *skb, const struct macvlan_port *port, struct net_device *src, enum macvlan_mode mode) { const struct ethhdr *eth = eth_hdr(skb); const struct macvlan_dev *vlan; struct sk_buff *nskb; unsigned int i; int err; unsigned int hash; if (skb->protocol == htons(ETH_P_PAUSE)) return; hash_for_each_rcu(port->vlan_hash, i, vlan, hlist) { if (vlan->dev == src || !(vlan->mode & mode)) continue; hash = mc_hash(vlan, eth->h_dest); if (!test_bit(hash, vlan->mc_filter)) continue; err = NET_RX_DROP; nskb = skb_clone(skb, GFP_ATOMIC); if (likely(nskb)) err = macvlan_broadcast_one(nskb, vlan, eth, mode == MACVLAN_MODE_BRIDGE) ?: netif_rx(nskb); macvlan_count_rx(vlan, skb->len + ETH_HLEN, err == NET_RX_SUCCESS, true); } } static void macvlan_multicast_rx(const struct macvlan_port *port, const struct macvlan_dev *src, struct sk_buff *skb) { if (!src) /* frame comes from an external address */ macvlan_broadcast(skb, port, NULL, MACVLAN_MODE_PRIVATE | MACVLAN_MODE_VEPA | MACVLAN_MODE_PASSTHRU| MACVLAN_MODE_BRIDGE); else if (src->mode == MACVLAN_MODE_VEPA) /* flood to everyone except source */ macvlan_broadcast(skb, port, src->dev, MACVLAN_MODE_VEPA | MACVLAN_MODE_BRIDGE); else /* * flood only to VEPA ports, bridge ports * already saw the frame on the way out. */ macvlan_broadcast(skb, port, src->dev, MACVLAN_MODE_VEPA); } static void macvlan_process_broadcast(struct work_struct *w) { struct macvlan_port *port = container_of(w, struct macvlan_port, bc_work); struct sk_buff *skb; struct sk_buff_head list; __skb_queue_head_init(&list); spin_lock_bh(&port->bc_queue.lock); skb_queue_splice_tail_init(&port->bc_queue, &list); spin_unlock_bh(&port->bc_queue.lock); while ((skb = __skb_dequeue(&list))) { const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src; rcu_read_lock(); macvlan_multicast_rx(port, src, skb); rcu_read_unlock(); if (src) dev_put(src->dev); consume_skb(skb); cond_resched(); } } static void macvlan_broadcast_enqueue(struct macvlan_port *port, const struct macvlan_dev *src, struct sk_buff *skb) { struct sk_buff *nskb; int err = -ENOMEM; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) goto err; MACVLAN_SKB_CB(nskb)->src = src; spin_lock(&port->bc_queue.lock); if (skb_queue_len(&port->bc_queue) < port->bc_queue_len_used) { if (src) dev_hold(src->dev); __skb_queue_tail(&port->bc_queue, nskb); err = 0; } spin_unlock(&port->bc_queue.lock); queue_work(system_dfl_wq, &port->bc_work); if (err) goto free_nskb; return; free_nskb: kfree_skb(nskb); err: dev_core_stats_rx_dropped_inc(skb->dev); } static void macvlan_flush_sources(struct macvlan_port *port, struct macvlan_dev *vlan) { struct macvlan_source_entry *entry; struct hlist_node *next; int i; hash_for_each_safe(port->vlan_source_hash, i, next, entry, hlist) if (rcu_access_pointer(entry->vlan) == vlan) macvlan_hash_del_source(entry); vlan->macaddr_count = 0; } static void macvlan_forward_source_one(struct sk_buff *skb, struct macvlan_dev *vlan) { struct sk_buff *nskb; struct net_device *dev; int len; int ret; dev = vlan->dev; if (unlikely(!(dev->flags & IFF_UP))) return; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return; len = nskb->len + ETH_HLEN; nskb->dev = dev; if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest, dev->dev_addr)) nskb->pkt_type = PACKET_HOST; ret = __netif_rx(nskb); macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false); } static bool macvlan_forward_source(struct sk_buff *skb, struct macvlan_port *port, const unsigned char *addr) { struct macvlan_source_entry *entry; u32 idx = macvlan_eth_hash(addr); struct hlist_head *h = &port->vlan_source_hash[idx]; bool consume = false; hlist_for_each_entry_rcu(entry, h, hlist) { if (ether_addr_equal_64bits(entry->addr, addr)) { struct macvlan_dev *vlan = rcu_dereference(entry->vlan); if (!vlan) continue; if (vlan->flags & MACVLAN_FLAG_NODST) consume = true; macvlan_forward_source_one(skb, vlan); } } return consume; } /* called under rcu_read_lock() from netif_receive_skb */ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) { struct macvlan_port *port; struct sk_buff *skb = *pskb; const struct ethhdr *eth = eth_hdr(skb); const struct macvlan_dev *vlan; const struct macvlan_dev *src; struct net_device *dev; unsigned int len = 0; int ret; rx_handler_result_t handle_res; /* Packets from dev_loopback_xmit() do not have L2 header, bail out */ if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; port = macvlan_port_get_rcu(skb->dev); if (is_multicast_ether_addr(eth->h_dest)) { unsigned int hash; skb = ip_check_defrag(dev_net(skb->dev), skb, IP_DEFRAG_MACVLAN); if (!skb) return RX_HANDLER_CONSUMED; *pskb = skb; eth = eth_hdr(skb); if (macvlan_forward_source(skb, port, eth->h_source)) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } src = macvlan_hash_lookup(port, eth->h_source); if (src && src->mode != MACVLAN_MODE_VEPA && src->mode != MACVLAN_MODE_BRIDGE) { /* forward to original port. */ vlan = src; ret = macvlan_broadcast_one(skb, vlan, eth, 0) ?: __netif_rx(skb); handle_res = RX_HANDLER_CONSUMED; goto out; } hash = mc_hash(NULL, eth->h_dest); if (test_bit(hash, port->bc_filter)) macvlan_broadcast_enqueue(port, src, skb); else if (test_bit(hash, port->mc_filter)) macvlan_multicast_rx(port, src, skb); return RX_HANDLER_PASS; } if (macvlan_forward_source(skb, port, eth->h_source)) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } if (macvlan_passthru(port)) vlan = list_first_or_null_rcu(&port->vlans, struct macvlan_dev, list); else vlan = macvlan_hash_lookup(port, eth->h_dest); if (!vlan || vlan->mode == MACVLAN_MODE_SOURCE) return RX_HANDLER_PASS; dev = vlan->dev; if (unlikely(!(dev->flags & IFF_UP))) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } len = skb->len + ETH_HLEN; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { ret = NET_RX_DROP; handle_res = RX_HANDLER_CONSUMED; goto out; } *pskb = skb; skb->dev = dev; skb->pkt_type = PACKET_HOST; ret = NET_RX_SUCCESS; handle_res = RX_HANDLER_ANOTHER; out: macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false); return handle_res; } static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) { const struct macvlan_dev *vlan = netdev_priv(dev); const struct macvlan_port *port = vlan->port; const struct macvlan_dev *dest; if (vlan->mode == MACVLAN_MODE_BRIDGE) { const struct ethhdr *eth = skb_eth_hdr(skb); /* send to other bridge ports directly */ if (is_multicast_ether_addr(eth->h_dest)) { skb_reset_mac_header(skb); macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE); goto xmit_world; } dest = macvlan_hash_lookup(port, eth->h_dest); if (dest && dest->mode == MACVLAN_MODE_BRIDGE) { /* send to lowerdev first for its network taps */ dev_forward_skb(vlan->lowerdev, skb); return NET_XMIT_SUCCESS; } } xmit_world: skb->dev = vlan->lowerdev; return dev_queue_xmit_accel(skb, netdev_get_sb_channel(dev) ? dev : NULL); } static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); return NETDEV_TX_OK; #endif } static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); unsigned int len = skb->len; int ret; if (unlikely(netpoll_tx_running(dev))) return macvlan_netpoll_send_skb(vlan, skb); ret = macvlan_queue_xmit(skb, dev); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct vlan_pcpu_stats *pcpu_stats; pcpu_stats = this_cpu_ptr(vlan->pcpu_stats); u64_stats_update_begin(&pcpu_stats->syncp); u64_stats_inc(&pcpu_stats->tx_packets); u64_stats_add(&pcpu_stats->tx_bytes, len); u64_stats_update_end(&pcpu_stats->syncp); } else { this_cpu_inc(vlan->pcpu_stats->tx_dropped); } return ret; } static int macvlan_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len) { const struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; return dev_hard_header(skb, lowerdev, type, daddr, saddr ? : dev->dev_addr, len); } static const struct header_ops macvlan_hard_header_ops = { .create = macvlan_hard_header, .parse = eth_header_parse, .cache = eth_header_cache, .cache_update = eth_header_cache_update, .parse_protocol = eth_header_parse_protocol, }; static int macvlan_open(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; int err; if (macvlan_passthru(vlan->port)) { if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) { err = dev_set_promiscuity(lowerdev, 1); if (err < 0) goto out; } goto hash_add; } err = -EADDRINUSE; if (macvlan_addr_busy(vlan->port, dev->dev_addr)) goto out; /* Attempt to populate accel_priv which is used to offload the L2 * forwarding requests for unicast packets. */ if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD) vlan->accel_priv = lowerdev->netdev_ops->ndo_dfwd_add_station(lowerdev, dev); /* If earlier attempt to offload failed, or accel_priv is not * populated we must add the unicast address to the lower device. */ if (IS_ERR_OR_NULL(vlan->accel_priv)) { vlan->accel_priv = NULL; err = dev_uc_add(lowerdev, dev->dev_addr); if (err < 0) goto out; } if (dev->flags & IFF_ALLMULTI) { err = dev_set_allmulti(lowerdev, 1); if (err < 0) goto del_unicast; } if (dev->flags & IFF_PROMISC) { err = dev_set_promiscuity(lowerdev, 1); if (err < 0) goto clear_multi; } hash_add: macvlan_hash_add(vlan); return 0; clear_multi: if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(lowerdev, -1); del_unicast: if (vlan->accel_priv) { lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev, vlan->accel_priv); vlan->accel_priv = NULL; } else { dev_uc_del(lowerdev, dev->dev_addr); } out: return err; } static int macvlan_stop(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; if (vlan->accel_priv) { lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev, vlan->accel_priv); vlan->accel_priv = NULL; } dev_uc_unsync(lowerdev, dev); dev_mc_unsync(lowerdev, dev); if (macvlan_passthru(vlan->port)) { if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) dev_set_promiscuity(lowerdev, -1); goto hash_del; } if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(lowerdev, -1); if (dev->flags & IFF_PROMISC) dev_set_promiscuity(lowerdev, -1); dev_uc_del(lowerdev, dev->dev_addr); hash_del: macvlan_hash_del(vlan, !dev->dismantle); return 0; } static int macvlan_sync_address(struct net_device *dev, const unsigned char *addr) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; struct macvlan_port *port = vlan->port; int err; if (!(dev->flags & IFF_UP)) { /* Just copy in the new address */ eth_hw_addr_set(dev, addr); } else { /* Rehash and update the device filters */ if (macvlan_addr_busy(vlan->port, addr)) return -EADDRINUSE; if (!macvlan_passthru(port)) { err = dev_uc_add(lowerdev, addr); if (err) return err; dev_uc_del(lowerdev, dev->dev_addr); } macvlan_hash_change_addr(vlan, addr); } if (macvlan_passthru(port) && !macvlan_addr_change(port)) { /* Since addr_change isn't set, we are here due to lower * device change. Save the lower-dev address so we can * restore it later. */ ether_addr_copy(vlan->port->perm_addr, lowerdev->dev_addr); } macvlan_clear_addr_change(port); return 0; } static int macvlan_set_mac_address(struct net_device *dev, void *p) { struct macvlan_dev *vlan = netdev_priv(dev); struct sockaddr_storage *addr = p; if (!is_valid_ether_addr(addr->__data)) return -EADDRNOTAVAIL; /* If the addresses are the same, this is a no-op */ if (ether_addr_equal(dev->dev_addr, addr->__data)) return 0; if (vlan->mode == MACVLAN_MODE_PASSTHRU) { macvlan_set_addr_change(vlan->port); return dev_set_mac_address(vlan->lowerdev, addr, NULL); } if (macvlan_addr_busy(vlan->port, addr->__data)) return -EADDRINUSE; return macvlan_sync_address(dev, addr->__data); } static void macvlan_change_rx_flags(struct net_device *dev, int change) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; if (dev->flags & IFF_UP) { if (change & IFF_ALLMULTI) dev_set_allmulti(lowerdev, dev->flags & IFF_ALLMULTI ? 1 : -1); if (!macvlan_passthru(vlan->port) && change & IFF_PROMISC) dev_set_promiscuity(lowerdev, dev->flags & IFF_PROMISC ? 1 : -1); } } static void macvlan_compute_filter(unsigned long *mc_filter, struct net_device *dev, struct macvlan_dev *vlan, int cutoff) { if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) { bitmap_fill(mc_filter, MACVLAN_MC_FILTER_SZ); } else { DECLARE_BITMAP(filter, MACVLAN_MC_FILTER_SZ); struct netdev_hw_addr *ha; bitmap_zero(filter, MACVLAN_MC_FILTER_SZ); netdev_for_each_mc_addr(ha, dev) { if (!vlan && ha->synced <= cutoff) continue; __set_bit(mc_hash(vlan, ha->addr), filter); } __set_bit(mc_hash(vlan, dev->broadcast), filter); bitmap_copy(mc_filter, filter, MACVLAN_MC_FILTER_SZ); } } static void macvlan_recompute_bc_filter(struct macvlan_dev *vlan) { if (vlan->port->bc_cutoff < 0) { bitmap_zero(vlan->port->bc_filter, MACVLAN_MC_FILTER_SZ); return; } macvlan_compute_filter(vlan->port->bc_filter, vlan->lowerdev, NULL, vlan->port->bc_cutoff); } static void macvlan_set_mac_lists(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); macvlan_compute_filter(vlan->mc_filter, dev, vlan, 0); dev_uc_sync(vlan->lowerdev, dev); dev_mc_sync(vlan->lowerdev, dev); /* This is slightly inaccurate as we're including the subscription * list of vlan->lowerdev too. * * Bug alert: This only works if everyone has the same broadcast * address as lowerdev. As soon as someone changes theirs this * will break. * * However, this is already broken as when you change your broadcast * address we don't get called. * * The solution is to maintain a list of broadcast addresses like * we do for uc/mc, if you care. */ macvlan_compute_filter(vlan->port->mc_filter, vlan->lowerdev, NULL, 0); macvlan_recompute_bc_filter(vlan); } static void update_port_bc_cutoff(struct macvlan_dev *vlan, int cutoff) { if (vlan->port->bc_cutoff == cutoff) return; vlan->port->bc_cutoff = cutoff; macvlan_recompute_bc_filter(vlan); } static int macvlan_change_mtu(struct net_device *dev, int new_mtu) { struct macvlan_dev *vlan = netdev_priv(dev); if (vlan->lowerdev->mtu < new_mtu) return -EINVAL; WRITE_ONCE(dev->mtu, new_mtu); return 0; } static int macvlan_hwtstamp_get(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { struct net_device *real_dev = macvlan_dev_real_dev(dev); return generic_hwtstamp_get_lower(real_dev, cfg); } static int macvlan_hwtstamp_set(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { struct net_device *real_dev = macvlan_dev_real_dev(dev); if (!net_eq(dev_net(dev), &init_net)) return -EOPNOTSUPP; return generic_hwtstamp_set_lower(real_dev, cfg, extack); } /* * macvlan network devices have devices nesting below it and are a special * "super class" of normal network devices; split their locks off into a * separate class since they always nest. */ static struct lock_class_key macvlan_netdev_addr_lock_key; #define ALWAYS_ON_OFFLOADS \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | \ NETIF_F_GSO_ROBUST | NETIF_F_GSO_ENCAP_ALL) #define ALWAYS_ON_FEATURES ALWAYS_ON_OFFLOADS #define MACVLAN_FEATURES \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ NETIF_F_GSO | NETIF_F_TSO | NETIF_F_LRO | \ NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) #define MACVLAN_STATE_MASK \ ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT)) static void macvlan_set_lockdep_class(struct net_device *dev) { netdev_lockdep_set_classes(dev); lockdep_set_class(&dev->addr_list_lock, &macvlan_netdev_addr_lock_key); } static int macvlan_init(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; struct macvlan_port *port = vlan->port; dev->state = (dev->state & ~MACVLAN_STATE_MASK) | (lowerdev->state & MACVLAN_STATE_MASK); dev->features = lowerdev->features & MACVLAN_FEATURES; dev->features |= ALWAYS_ON_FEATURES; dev->hw_features |= NETIF_F_LRO; dev->vlan_features = lowerdev->vlan_features & MACVLAN_FEATURES; dev->vlan_features |= ALWAYS_ON_OFFLOADS; dev->hw_enc_features |= dev->features; dev->lltx = true; netif_inherit_tso_max(dev, lowerdev); dev->hard_header_len = lowerdev->hard_header_len; macvlan_set_lockdep_class(dev); vlan->pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->pcpu_stats) return -ENOMEM; port->count += 1; /* Get macvlan's reference to lowerdev */ netdev_hold(lowerdev, &vlan->dev_tracker, GFP_KERNEL); return 0; } static void macvlan_uninit(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port = vlan->port; free_percpu(vlan->pcpu_stats); macvlan_flush_sources(port, vlan); port->count -= 1; if (!port->count) macvlan_port_destroy(port->dev); } static void macvlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct macvlan_dev *vlan = netdev_priv(dev); if (vlan->pcpu_stats) { struct vlan_pcpu_stats *p; u64 rx_packets, rx_bytes, rx_multicast, tx_packets, tx_bytes; u32 rx_errors = 0, tx_dropped = 0; unsigned int start; int i; for_each_possible_cpu(i) { p = per_cpu_ptr(vlan->pcpu_stats, i); do { start = u64_stats_fetch_begin(&p->syncp); rx_packets = u64_stats_read(&p->rx_packets); rx_bytes = u64_stats_read(&p->rx_bytes); rx_multicast = u64_stats_read(&p->rx_multicast); tx_packets = u64_stats_read(&p->tx_packets); tx_bytes = u64_stats_read(&p->tx_bytes); } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; stats->multicast += rx_multicast; stats->tx_packets += tx_packets; stats->tx_bytes += tx_bytes; /* rx_errors & tx_dropped are u32, updated * without syncp protection. */ rx_errors += READ_ONCE(p->rx_errors); tx_dropped += READ_ONCE(p->tx_dropped); } stats->rx_errors = rx_errors; stats->rx_dropped = rx_errors; stats->tx_dropped = tx_dropped; } } static int macvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; return vlan_vid_add(lowerdev, proto, vid); } static int macvlan_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; vlan_vid_del(lowerdev, proto, vid); return 0; } static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags, bool *notified, struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); int err = -EINVAL; /* Support unicast filter only on passthru devices. * Multicast filter should be allowed on all devices. */ if (!macvlan_passthru(vlan->port) && is_unicast_ether_addr(addr)) return -EOPNOTSUPP; if (flags & NLM_F_REPLACE) return -EOPNOTSUPP; if (is_unicast_ether_addr(addr)) err = dev_uc_add_excl(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_add_excl(dev, addr); return err; } static int macvlan_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); int err = -EINVAL; /* Support unicast filter only on passthru devices. * Multicast filter should be allowed on all devices. */ if (!macvlan_passthru(vlan->port) && is_unicast_ether_addr(addr)) return -EOPNOTSUPP; if (is_unicast_ether_addr(addr)) err = dev_uc_del(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_del(dev, addr); return err; } static void macvlan_ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, "macvlan", sizeof(drvinfo->driver)); strscpy(drvinfo->version, "0.1", sizeof(drvinfo->version)); } static int macvlan_ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { const struct macvlan_dev *vlan = netdev_priv(dev); return __ethtool_get_link_ksettings(vlan->lowerdev, cmd); } static int macvlan_ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info) { struct net_device *real_dev = macvlan_dev_real_dev(dev); return ethtool_get_ts_info_by_layer(real_dev, info); } static netdev_features_t macvlan_fix_features(struct net_device *dev, netdev_features_t features) { struct macvlan_dev *vlan = netdev_priv(dev); netdev_features_t lowerdev_features = vlan->lowerdev->features; netdev_features_t mask; features |= NETIF_F_ALL_FOR_ALL; features &= (vlan->set_features | ~MACVLAN_FEATURES); mask = features; lowerdev_features &= (features | ~NETIF_F_LRO); features = netdev_increment_features(lowerdev_features, features, mask); features |= ALWAYS_ON_FEATURES; features &= (ALWAYS_ON_FEATURES | MACVLAN_FEATURES); return features; } #ifdef CONFIG_NET_POLL_CONTROLLER static void macvlan_dev_poll_controller(struct net_device *dev) { return; } static int macvlan_dev_netpoll_setup(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *real_dev = vlan->lowerdev; struct netpoll *netpoll; int err; netpoll = kzalloc_obj(*netpoll); err = -ENOMEM; if (!netpoll) goto out; err = __netpoll_setup(netpoll, real_dev); if (err) { kfree(netpoll); goto out; } vlan->netpoll = netpoll; out: return err; } static void macvlan_dev_netpoll_cleanup(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct netpoll *netpoll = vlan->netpoll; if (!netpoll) return; vlan->netpoll = NULL; __netpoll_free(netpoll); } #endif /* CONFIG_NET_POLL_CONTROLLER */ static int macvlan_dev_get_iflink(const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); return READ_ONCE(vlan->lowerdev->ifindex); } static const struct ethtool_ops macvlan_ethtool_ops = { .get_link = ethtool_op_get_link, .get_link_ksettings = macvlan_ethtool_get_link_ksettings, .get_drvinfo = macvlan_ethtool_get_drvinfo, .get_ts_info = macvlan_ethtool_get_ts_info, }; static const struct net_device_ops macvlan_netdev_ops = { .ndo_init = macvlan_init, .ndo_uninit = macvlan_uninit, .ndo_open = macvlan_open, .ndo_stop = macvlan_stop, .ndo_start_xmit = macvlan_start_xmit, .ndo_change_mtu = macvlan_change_mtu, .ndo_fix_features = macvlan_fix_features, .ndo_change_rx_flags = macvlan_change_rx_flags, .ndo_set_mac_address = macvlan_set_mac_address, .ndo_set_rx_mode = macvlan_set_mac_lists, .ndo_get_stats64 = macvlan_dev_get_stats64, .ndo_validate_addr = eth_validate_addr, .ndo_vlan_rx_add_vid = macvlan_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = macvlan_vlan_rx_kill_vid, .ndo_fdb_add = macvlan_fdb_add, .ndo_fdb_del = macvlan_fdb_del, .ndo_fdb_dump = ndo_dflt_fdb_dump, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = macvlan_dev_poll_controller, .ndo_netpoll_setup = macvlan_dev_netpoll_setup, .ndo_netpoll_cleanup = macvlan_dev_netpoll_cleanup, #endif .ndo_get_iflink = macvlan_dev_get_iflink, .ndo_features_check = passthru_features_check, .ndo_hwtstamp_get = macvlan_hwtstamp_get, .ndo_hwtstamp_set = macvlan_hwtstamp_set, }; static void macvlan_dev_free(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); /* Get rid of the macvlan's reference to lowerdev */ netdev_put(vlan->lowerdev, &vlan->dev_tracker); } void macvlan_common_setup(struct net_device *dev) { ether_setup(dev); /* ether_setup() has set dev->min_mtu to ETH_MIN_MTU. */ dev->max_mtu = ETH_MAX_MTU; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); dev->priv_flags |= IFF_UNICAST_FLT; dev->change_proto_down = true; dev->netdev_ops = &macvlan_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = macvlan_dev_free; dev->header_ops = &macvlan_hard_header_ops; dev->ethtool_ops = &macvlan_ethtool_ops; } EXPORT_SYMBOL_GPL(macvlan_common_setup); static void macvlan_setup(struct net_device *dev) { macvlan_common_setup(dev); dev->priv_flags |= IFF_NO_QUEUE; } static int macvlan_port_create(struct net_device *dev) { struct macvlan_port *port; unsigned int i; int err; if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK) return -EINVAL; if (netdev_is_rx_handler_busy(dev)) return -EBUSY; port = kzalloc_obj(*port); if (port == NULL) return -ENOMEM; port->dev = dev; ether_addr_copy(port->perm_addr, dev->dev_addr); INIT_LIST_HEAD(&port->vlans); for (i = 0; i < MACVLAN_HASH_SIZE; i++) INIT_HLIST_HEAD(&port->vlan_hash[i]); for (i = 0; i < MACVLAN_HASH_SIZE; i++) INIT_HLIST_HEAD(&port->vlan_source_hash[i]); port->bc_queue_len_used = 0; port->bc_cutoff = 1; skb_queue_head_init(&port->bc_queue); INIT_WORK(&port->bc_work, macvlan_process_broadcast); err = netdev_rx_handler_register(dev, macvlan_handle_frame, port); if (err) kfree(port); else dev->priv_flags |= IFF_MACVLAN_PORT; return err; } static void macvlan_port_destroy(struct net_device *dev) { struct macvlan_port *port = macvlan_port_get_rtnl(dev); struct sk_buff *skb; dev->priv_flags &= ~IFF_MACVLAN_PORT; netdev_rx_handler_unregister(dev); /* After this point, no packet can schedule bc_work anymore, * but we need to cancel it and purge left skbs if any. */ cancel_work_sync(&port->bc_work); while ((skb = __skb_dequeue(&port->bc_queue))) { const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src; if (src) dev_put(src->dev); kfree_skb(skb); } /* If the lower device address has been changed by passthru * macvlan, put it back. */ if (macvlan_passthru(port) && !ether_addr_equal(port->dev->dev_addr, port->perm_addr)) { struct sockaddr_storage ss; ss.ss_family = port->dev->type; memcpy(&ss.__data, port->perm_addr, port->dev->addr_len); dev_set_mac_address(port->dev, &ss, NULL); } kfree(port); } static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct nlattr *nla, *head; int rem, len; if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } if (!data) return 0; if (data[IFLA_MACVLAN_FLAGS] && nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~(MACVLAN_FLAG_NOPROMISC | MACVLAN_FLAG_NODST)) return -EINVAL; if (data[IFLA_MACVLAN_MODE]) { switch (nla_get_u32(data[IFLA_MACVLAN_MODE])) { case MACVLAN_MODE_PRIVATE: case MACVLAN_MODE_VEPA: case MACVLAN_MODE_BRIDGE: case MACVLAN_MODE_PASSTHRU: case MACVLAN_MODE_SOURCE: break; default: return -EINVAL; } } if (data[IFLA_MACVLAN_MACADDR_MODE]) { switch (nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE])) { case MACVLAN_MACADDR_ADD: case MACVLAN_MACADDR_DEL: case MACVLAN_MACADDR_FLUSH: case MACVLAN_MACADDR_SET: break; default: return -EINVAL; } } if (data[IFLA_MACVLAN_MACADDR]) { if (nla_len(data[IFLA_MACVLAN_MACADDR]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(data[IFLA_MACVLAN_MACADDR]))) return -EADDRNOTAVAIL; } if (data[IFLA_MACVLAN_MACADDR_DATA]) { head = nla_data(data[IFLA_MACVLAN_MACADDR_DATA]); len = nla_len(data[IFLA_MACVLAN_MACADDR_DATA]); nla_for_each_attr(nla, head, len, rem) { if (nla_type(nla) != IFLA_MACVLAN_MACADDR || nla_len(nla) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(nla))) return -EADDRNOTAVAIL; } } if (data[IFLA_MACVLAN_MACADDR_COUNT]) return -EINVAL; return 0; } /* * reconfigure list of remote source mac address * (only for macvlan devices in source mode) * Note regarding alignment: all netlink data is aligned to 4 Byte, which * suffices for both ether_addr_copy and ether_addr_equal_64bits usage. */ static int macvlan_changelink_sources(struct macvlan_dev *vlan, u32 mode, struct nlattr *data[]) { char *addr = NULL; int ret, rem, len; struct nlattr *nla, *head; struct macvlan_source_entry *entry; if (data[IFLA_MACVLAN_MACADDR]) addr = nla_data(data[IFLA_MACVLAN_MACADDR]); if (mode == MACVLAN_MACADDR_ADD) { if (!addr) return -EINVAL; return macvlan_hash_add_source(vlan, addr); } else if (mode == MACVLAN_MACADDR_DEL) { if (!addr) return -EINVAL; entry = macvlan_hash_lookup_source(vlan, addr); if (entry) { macvlan_hash_del_source(entry); vlan->macaddr_count--; } } else if (mode == MACVLAN_MACADDR_FLUSH) { macvlan_flush_sources(vlan->port, vlan); } else if (mode == MACVLAN_MACADDR_SET) { macvlan_flush_sources(vlan->port, vlan); if (addr) { ret = macvlan_hash_add_source(vlan, addr); if (ret) return ret; } if (!data[IFLA_MACVLAN_MACADDR_DATA]) return 0; head = nla_data(data[IFLA_MACVLAN_MACADDR_DATA]); len = nla_len(data[IFLA_MACVLAN_MACADDR_DATA]); nla_for_each_attr(nla, head, len, rem) { addr = nla_data(nla); ret = macvlan_hash_add_source(vlan, addr); if (ret) return ret; } } else { return -EINVAL; } return 0; } int macvlan_common_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *link_net = rtnl_newlink_link_net(params); struct macvlan_dev *vlan = netdev_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct net_device *lowerdev; struct macvlan_port *port; bool create = false; int macmode; int err; if (!tb[IFLA_LINK]) return -EINVAL; lowerdev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (lowerdev == NULL) return -ENODEV; /* When creating macvlans or macvtaps on top of other macvlans - use * the real device as the lowerdev. */ if (netif_is_macvlan(lowerdev)) lowerdev = macvlan_dev_real_dev(lowerdev); if (!tb[IFLA_MTU]) dev->mtu = lowerdev->mtu; else if (dev->mtu > lowerdev->mtu) return -EINVAL; /* MTU range: 68 - lowerdev->max_mtu */ dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = lowerdev->max_mtu; if (!tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); if (!netif_is_macvlan_port(lowerdev)) { err = macvlan_port_create(lowerdev); if (err < 0) return err; create = true; } port = macvlan_port_get_rtnl(lowerdev); /* Only 1 macvlan device can be created in passthru mode */ if (macvlan_passthru(port)) { /* The macvlan port must be not created this time, * still goto destroy_macvlan_port for readability. */ err = -EINVAL; goto destroy_macvlan_port; } vlan->lowerdev = lowerdev; vlan->dev = dev; vlan->port = port; vlan->set_features = MACVLAN_FEATURES; vlan->mode = MACVLAN_MODE_VEPA; if (data && data[IFLA_MACVLAN_MODE]) vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]); if (data && data[IFLA_MACVLAN_FLAGS]) vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]); if (vlan->mode == MACVLAN_MODE_PASSTHRU) { if (port->count) { err = -EINVAL; goto destroy_macvlan_port; } macvlan_set_passthru(port); eth_hw_addr_inherit(dev, lowerdev); } if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { if (vlan->mode != MACVLAN_MODE_SOURCE) { err = -EINVAL; goto destroy_macvlan_port; } macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); err = macvlan_changelink_sources(vlan, macmode, data); if (err) goto destroy_macvlan_port; } vlan->bc_queue_len_req = MACVLAN_DEFAULT_BC_QUEUE_LEN; if (data && data[IFLA_MACVLAN_BC_QUEUE_LEN]) vlan->bc_queue_len_req = nla_get_u32(data[IFLA_MACVLAN_BC_QUEUE_LEN]); if (data && data[IFLA_MACVLAN_BC_CUTOFF]) update_port_bc_cutoff( vlan, nla_get_s32(data[IFLA_MACVLAN_BC_CUTOFF])); err = register_netdevice(dev); if (err < 0) goto destroy_macvlan_port; dev->priv_flags |= IFF_MACVLAN; err = netdev_upper_dev_link(lowerdev, dev, extack); if (err) goto unregister_netdev; list_add_tail_rcu(&vlan->list, &port->vlans); update_port_bc_queue_len(vlan->port); netif_stacked_transfer_operstate(lowerdev, dev); linkwatch_fire_event(dev); return 0; unregister_netdev: /* macvlan_uninit would free the macvlan port */ unregister_netdevice(dev); return err; destroy_macvlan_port: /* the macvlan port may be freed by macvlan_uninit when fail to register. * so we destroy the macvlan port only when it's valid. */ if (macvlan_port_get_rtnl(lowerdev)) { macvlan_flush_sources(port, vlan); if (create) macvlan_port_destroy(port->dev); } /* @dev might have been made visible before an error was detected. * Make sure to observe an RCU grace period before our caller * (rtnl_newlink()) frees it. */ synchronize_net(); return err; } EXPORT_SYMBOL_GPL(macvlan_common_newlink); static int macvlan_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { return macvlan_common_newlink(dev, params, extack); } void macvlan_dellink(struct net_device *dev, struct list_head *head) { struct macvlan_dev *vlan = netdev_priv(dev); if (vlan->mode == MACVLAN_MODE_SOURCE) macvlan_flush_sources(vlan->port, vlan); list_del_rcu(&vlan->list); update_port_bc_queue_len(vlan->port); unregister_netdevice_queue(dev, head); netdev_upper_dev_unlink(vlan->lowerdev, dev); } EXPORT_SYMBOL_GPL(macvlan_dellink); static int macvlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); enum macvlan_mode mode; bool set_mode = false; enum macvlan_macaddr_mode macmode; int ret; /* Validate mode, but don't set yet: setting flags may fail. */ if (data && data[IFLA_MACVLAN_MODE]) { set_mode = true; mode = nla_get_u32(data[IFLA_MACVLAN_MODE]); /* Passthrough mode can't be set or cleared dynamically */ if ((mode == MACVLAN_MODE_PASSTHRU) != (vlan->mode == MACVLAN_MODE_PASSTHRU)) return -EINVAL; if (vlan->mode == MACVLAN_MODE_SOURCE && vlan->mode != mode) macvlan_flush_sources(vlan->port, vlan); } if (data && data[IFLA_MACVLAN_FLAGS]) { __u16 flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]); bool promisc = (flags ^ vlan->flags) & MACVLAN_FLAG_NOPROMISC; if (macvlan_passthru(vlan->port) && promisc) { int err; if (flags & MACVLAN_FLAG_NOPROMISC) err = dev_set_promiscuity(vlan->lowerdev, -1); else err = dev_set_promiscuity(vlan->lowerdev, 1); if (err < 0) return err; } vlan->flags = flags; } if (data && data[IFLA_MACVLAN_BC_QUEUE_LEN]) { vlan->bc_queue_len_req = nla_get_u32(data[IFLA_MACVLAN_BC_QUEUE_LEN]); update_port_bc_queue_len(vlan->port); } if (data && data[IFLA_MACVLAN_BC_CUTOFF]) update_port_bc_cutoff( vlan, nla_get_s32(data[IFLA_MACVLAN_BC_CUTOFF])); if (set_mode) vlan->mode = mode; if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { if (vlan->mode != MACVLAN_MODE_SOURCE) return -EINVAL; macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); ret = macvlan_changelink_sources(vlan, macmode, data); if (ret) return ret; } return 0; } static size_t macvlan_get_size_mac(const struct macvlan_dev *vlan) { if (vlan->macaddr_count == 0) return 0; return nla_total_size(0) /* IFLA_MACVLAN_MACADDR_DATA */ + vlan->macaddr_count * nla_total_size(sizeof(u8) * ETH_ALEN); } static size_t macvlan_get_size(const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); return (0 + nla_total_size(4) /* IFLA_MACVLAN_MODE */ + nla_total_size(2) /* IFLA_MACVLAN_FLAGS */ + nla_total_size(4) /* IFLA_MACVLAN_MACADDR_COUNT */ + macvlan_get_size_mac(vlan) /* IFLA_MACVLAN_MACADDR */ + nla_total_size(4) /* IFLA_MACVLAN_BC_QUEUE_LEN */ + nla_total_size(4) /* IFLA_MACVLAN_BC_QUEUE_LEN_USED */ ); } static int macvlan_fill_info_macaddr(struct sk_buff *skb, const struct macvlan_dev *vlan, const int i) { struct hlist_head *h = &vlan->port->vlan_source_hash[i]; struct macvlan_source_entry *entry; hlist_for_each_entry_rcu(entry, h, hlist, lockdep_rtnl_is_held()) { if (rcu_access_pointer(entry->vlan) != vlan) continue; if (nla_put(skb, IFLA_MACVLAN_MACADDR, ETH_ALEN, entry->addr)) return 1; } return 0; } static int macvlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port = vlan->port; int i; struct nlattr *nest; if (nla_put_u32(skb, IFLA_MACVLAN_MODE, vlan->mode)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_MACVLAN_FLAGS, vlan->flags)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_MACVLAN_MACADDR_COUNT, vlan->macaddr_count)) goto nla_put_failure; if (vlan->macaddr_count > 0) { nest = nla_nest_start_noflag(skb, IFLA_MACVLAN_MACADDR_DATA); if (nest == NULL) goto nla_put_failure; for (i = 0; i < MACVLAN_HASH_SIZE; i++) { if (macvlan_fill_info_macaddr(skb, vlan, i)) goto nla_put_failure; } nla_nest_end(skb, nest); } if (nla_put_u32(skb, IFLA_MACVLAN_BC_QUEUE_LEN, vlan->bc_queue_len_req)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_MACVLAN_BC_QUEUE_LEN_USED, port->bc_queue_len_used)) goto nla_put_failure; if (port->bc_cutoff != 1 && nla_put_s32(skb, IFLA_MACVLAN_BC_CUTOFF, port->bc_cutoff)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = { [IFLA_MACVLAN_MODE] = { .type = NLA_U32 }, [IFLA_MACVLAN_FLAGS] = { .type = NLA_U16 }, [IFLA_MACVLAN_MACADDR_MODE] = { .type = NLA_U32 }, [IFLA_MACVLAN_MACADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_MACVLAN_MACADDR_DATA] = { .type = NLA_NESTED }, [IFLA_MACVLAN_MACADDR_COUNT] = { .type = NLA_U32 }, [IFLA_MACVLAN_BC_QUEUE_LEN] = { .type = NLA_U32 }, [IFLA_MACVLAN_BC_QUEUE_LEN_USED] = { .type = NLA_REJECT }, [IFLA_MACVLAN_BC_CUTOFF] = { .type = NLA_S32 }, }; int macvlan_link_register(struct rtnl_link_ops *ops) { /* common fields */ ops->validate = macvlan_validate; ops->maxtype = IFLA_MACVLAN_MAX; ops->policy = macvlan_policy; ops->changelink = macvlan_changelink; ops->get_size = macvlan_get_size; ops->fill_info = macvlan_fill_info; return rtnl_link_register(ops); }; EXPORT_SYMBOL_GPL(macvlan_link_register); static struct net *macvlan_get_link_net(const struct net_device *dev) { return dev_net(macvlan_dev_real_dev(dev)); } static struct rtnl_link_ops macvlan_link_ops = { .kind = "macvlan", .setup = macvlan_setup, .newlink = macvlan_newlink, .dellink = macvlan_dellink, .get_link_net = macvlan_get_link_net, .priv_size = sizeof(struct macvlan_dev), }; static void update_port_bc_queue_len(struct macvlan_port *port) { u32 max_bc_queue_len_req = 0; struct macvlan_dev *vlan; list_for_each_entry(vlan, &port->vlans, list) { if (vlan->bc_queue_len_req > max_bc_queue_len_req) max_bc_queue_len_req = vlan->bc_queue_len_req; } port->bc_queue_len_used = max_bc_queue_len_req; } static int macvlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct macvlan_dev *vlan, *next; struct macvlan_port *port; LIST_HEAD(list_kill); if (!netif_is_macvlan_port(dev)) return NOTIFY_DONE; port = macvlan_port_get_rtnl(dev); switch (event) { case NETDEV_UP: case NETDEV_DOWN: case NETDEV_CHANGE: list_for_each_entry(vlan, &port->vlans, list) netif_stacked_transfer_operstate(vlan->lowerdev, vlan->dev); break; case NETDEV_FEAT_CHANGE: list_for_each_entry(vlan, &port->vlans, list) { netif_inherit_tso_max(vlan->dev, dev); netdev_update_features(vlan->dev); } break; case NETDEV_CHANGEMTU: list_for_each_entry(vlan, &port->vlans, list) { if (vlan->dev->mtu <= dev->mtu) continue; dev_set_mtu(vlan->dev, dev->mtu); } break; case NETDEV_CHANGEADDR: if (!macvlan_passthru(port)) return NOTIFY_DONE; vlan = list_first_entry_or_null(&port->vlans, struct macvlan_dev, list); if (vlan && macvlan_sync_address(vlan->dev, dev->dev_addr)) return NOTIFY_BAD; break; case NETDEV_UNREGISTER: /* twiddle thumbs on netns device moves */ if (dev->reg_state != NETREG_UNREGISTERING) break; list_for_each_entry_safe(vlan, next, &port->vlans, list) vlan->dev->rtnl_link_ops->dellink(vlan->dev, &list_kill); unregister_netdevice_many(&list_kill); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlying device to change its type. */ return NOTIFY_BAD; case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: case NETDEV_RESEND_IGMP: /* Propagate to all vlans */ list_for_each_entry(vlan, &port->vlans, list) call_netdevice_notifiers(event, vlan->dev); } return NOTIFY_DONE; } static struct notifier_block macvlan_notifier_block __read_mostly = { .notifier_call = macvlan_device_event, }; static int __init macvlan_init_module(void) { int err; register_netdevice_notifier(&macvlan_notifier_block); err = macvlan_link_register(&macvlan_link_ops); if (err < 0) goto err1; return 0; err1: unregister_netdevice_notifier(&macvlan_notifier_block); return err; } static void __exit macvlan_cleanup_module(void) { rtnl_link_unregister(&macvlan_link_ops); unregister_netdevice_notifier(&macvlan_notifier_block); } module_init(macvlan_init_module); module_exit(macvlan_cleanup_module); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Driver for MAC address based VLANs"); MODULE_ALIAS_RTNL_LINK("macvlan");
7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 // SPDX-License-Identifier: GPL-2.0-only /* * sd.c Copyright (C) 1992 Drew Eckhardt * Copyright (C) 1993, 1994, 1995, 1999 Eric Youngdale * * Linux scsi disk driver * Initial versions: Drew Eckhardt * Subsequent revisions: Eric Youngdale * Modification history: * - Drew Eckhardt <drew@colorado.edu> original * - Eric Youngdale <eric@andante.org> add scatter-gather, multiple * outstanding request, and other enhancements. * Support loadable low-level scsi drivers. * - Jirka Hanika <geo@ff.cuni.cz> support more scsi disks using * eight major numbers. * - Richard Gooch <rgooch@atnf.csiro.au> support devfs. * - Torben Mathiasen <tmm@image.dk> Resource allocation fixes in * sd_init and cleanups. * - Alex Davis <letmein@erols.com> Fix problem where partition info * not being read in sd_open. Fix problem where removable media * could be ejected after sd_open. * - Douglas Gilbert <dgilbert@interlog.com> cleanup for lk 2.5.x * - Badari Pulavarty <pbadari@us.ibm.com>, Matthew Wilcox * <willy@debian.org>, Kurt Garloff <garloff@suse.de>: * Support 32k/1M disks. * * Logging policy (needs CONFIG_SCSI_LOGGING defined): * - setting up transfer: SCSI_LOG_HLQUEUE levels 1 and 2 * - end of transfer (bh + scsi_lib): SCSI_LOG_HLCOMPLETE level 1 * - entering sd_ioctl: SCSI_LOG_IOCTL level 1 * - entering other commands: SCSI_LOG_HLQUEUE level 3 * Note: when the logging level is set by the user, it must be greater * than the level indicated above to trigger output. */ #include <linux/bio-integrity.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/hdreg.h> #include <linux/errno.h> #include <linux/idr.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/blkpg.h> #include <linux/blk-pm.h> #include <linux/delay.h> #include <linux/rw_hint.h> #include <linux/major.h> #include <linux/mutex.h> #include <linux/string_helpers.h> #include <linux/slab.h> #include <linux/sed-opal.h> #include <linux/pm_runtime.h> #include <linux/pr.h> #include <linux/t10-pi.h> #include <linux/uaccess.h> #include <linux/unaligned.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_device.h> #include <scsi/scsi_devinfo.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_host.h> #include <scsi/scsi_ioctl.h> #include <scsi/scsicam.h> #include <scsi/scsi_common.h> #include "sd.h" #include "scsi_priv.h" #include "scsi_logging.h" MODULE_AUTHOR("Eric Youngdale"); MODULE_DESCRIPTION("SCSI disk (sd) driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK0_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK1_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK2_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK3_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK4_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK5_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK6_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK7_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK8_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK9_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK10_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK11_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK12_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK13_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK14_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK15_MAJOR); MODULE_ALIAS_SCSI_DEVICE(TYPE_DISK); MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD); MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC); MODULE_ALIAS_SCSI_DEVICE(TYPE_ZBC); #define SD_MINORS 16 static void sd_config_write_same(struct scsi_disk *sdkp, struct queue_limits *lim); static void sd_revalidate_disk(struct gendisk *); static DEFINE_IDA(sd_index_ida); static mempool_t *sd_page_pool; static struct lock_class_key sd_bio_compl_lkclass; static const char *sd_cache_types[] = { "write through", "none", "write back", "write back, no read (daft)" }; static void sd_disable_discard(struct scsi_disk *sdkp) { sdkp->provisioning_mode = SD_LBP_DISABLE; blk_queue_disable_discard(sdkp->disk->queue); } static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim, unsigned int mode) { unsigned int logical_block_size = sdkp->device->sector_size; unsigned int max_blocks = 0; lim->discard_alignment = sdkp->unmap_alignment * logical_block_size; lim->discard_granularity = max(sdkp->physical_block_size, sdkp->unmap_granularity * logical_block_size); sdkp->provisioning_mode = mode; switch (mode) { case SD_LBP_FULL: case SD_LBP_DISABLE: break; case SD_LBP_UNMAP: max_blocks = min_not_zero(sdkp->max_unmap_blocks, (u32)SD_MAX_WS16_BLOCKS); break; case SD_LBP_WS16: if (sdkp->device->unmap_limit_for_ws) max_blocks = sdkp->max_unmap_blocks; else max_blocks = sdkp->max_ws_blocks; max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS16_BLOCKS); break; case SD_LBP_WS10: if (sdkp->device->unmap_limit_for_ws) max_blocks = sdkp->max_unmap_blocks; else max_blocks = sdkp->max_ws_blocks; max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS10_BLOCKS); break; case SD_LBP_ZERO: max_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS10_BLOCKS); break; } lim->max_hw_discard_sectors = max_blocks * (logical_block_size >> SECTOR_SHIFT); } static void sd_set_flush_flag(struct scsi_disk *sdkp, struct queue_limits *lim) { if (sdkp->WCE) { lim->features |= BLK_FEAT_WRITE_CACHE; if (sdkp->DPOFUA) lim->features |= BLK_FEAT_FUA; else lim->features &= ~BLK_FEAT_FUA; } else { lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA); } } static ssize_t cache_type_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ct, rcd, wce, sp; struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; char buffer[64]; char *buffer_data; struct scsi_mode_data data; struct scsi_sense_hdr sshdr; static const char temp[] = "temporary "; int len, ret; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) /* no cache control on RBC devices; theoretically they * can do it, but there's probably so many exceptions * it's not worth the risk */ return -EINVAL; if (strncmp(buf, temp, sizeof(temp) - 1) == 0) { buf += sizeof(temp) - 1; sdkp->cache_override = 1; } else { sdkp->cache_override = 0; } ct = sysfs_match_string(sd_cache_types, buf); if (ct < 0) return -EINVAL; rcd = ct & 0x01 ? 1 : 0; wce = (ct & 0x02) && !sdkp->write_prot ? 1 : 0; if (sdkp->cache_override) { struct queue_limits lim; sdkp->WCE = wce; sdkp->RCD = rcd; lim = queue_limits_start_update(sdkp->disk->queue); sd_set_flush_flag(sdkp, &lim); ret = queue_limits_commit_update_frozen(sdkp->disk->queue, &lim); if (ret) return ret; return count; } if (scsi_mode_sense(sdp, 0x08, 8, 0, buffer, sizeof(buffer), SD_TIMEOUT, sdkp->max_retries, &data, NULL)) return -EINVAL; len = min_t(size_t, sizeof(buffer), data.length - data.header_length - data.block_descriptor_length); buffer_data = buffer + data.header_length + data.block_descriptor_length; buffer_data[2] &= ~0x05; buffer_data[2] |= wce << 2 | rcd; sp = buffer_data[0] & 0x80 ? 1 : 0; buffer_data[0] &= ~0x80; /* * Ensure WP, DPOFUA, and RESERVED fields are cleared in * received mode parameter buffer before doing MODE SELECT. */ data.device_specific = 0; ret = scsi_mode_select(sdp, 1, sp, buffer_data, len, SD_TIMEOUT, sdkp->max_retries, &data, &sshdr); if (ret) { if (ret > 0 && scsi_sense_valid(&sshdr)) sd_print_sense_hdr(sdkp, &sshdr); return -EINVAL; } sd_revalidate_disk(sdkp->disk); return count; } static ssize_t manage_start_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_system_start_stop && sdp->manage_runtime_start_stop && sdp->manage_shutdown); } static DEVICE_ATTR_RO(manage_start_stop); static ssize_t manage_system_start_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_system_start_stop); } static ssize_t manage_system_start_stop_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; bool v; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (kstrtobool(buf, &v)) return -EINVAL; sdp->manage_system_start_stop = v; return count; } static DEVICE_ATTR_RW(manage_system_start_stop); static ssize_t manage_runtime_start_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_runtime_start_stop); } static ssize_t manage_runtime_start_stop_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; bool v; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (kstrtobool(buf, &v)) return -EINVAL; sdp->manage_runtime_start_stop = v; return count; } static DEVICE_ATTR_RW(manage_runtime_start_stop); static ssize_t manage_shutdown_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_shutdown); } static ssize_t manage_shutdown_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; bool v; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (kstrtobool(buf, &v)) return -EINVAL; sdp->manage_shutdown = v; return count; } static DEVICE_ATTR_RW(manage_shutdown); static ssize_t manage_restart_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_restart); } static ssize_t manage_restart_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; bool v; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (kstrtobool(buf, &v)) return -EINVAL; sdp->manage_restart = v; return count; } static DEVICE_ATTR_RW(manage_restart); static ssize_t allow_restart_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->device->allow_restart); } static ssize_t allow_restart_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { bool v; struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) return -EINVAL; if (kstrtobool(buf, &v)) return -EINVAL; sdp->allow_restart = v; return count; } static DEVICE_ATTR_RW(allow_restart); static ssize_t cache_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); int ct = sdkp->RCD + 2*sdkp->WCE; return sprintf(buf, "%s\n", sd_cache_types[ct]); } static DEVICE_ATTR_RW(cache_type); static ssize_t FUA_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->DPOFUA); } static DEVICE_ATTR_RO(FUA); static ssize_t protection_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->protection_type); } static ssize_t protection_type_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); unsigned int val; int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; err = kstrtouint(buf, 10, &val); if (err) return err; if (val <= T10_PI_TYPE3_PROTECTION) sdkp->protection_type = val; return count; } static DEVICE_ATTR_RW(protection_type); static ssize_t protection_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; unsigned int dif, dix; dif = scsi_host_dif_capable(sdp->host, sdkp->protection_type); dix = scsi_host_dix_capable(sdp->host, sdkp->protection_type); if (!dix && scsi_host_dix_capable(sdp->host, T10_PI_TYPE0_PROTECTION)) { dif = 0; dix = 1; } if (!dif && !dix) return sprintf(buf, "none\n"); return sprintf(buf, "%s%u\n", dix ? "dix" : "dif", dif); } static DEVICE_ATTR_RO(protection_mode); static ssize_t app_tag_own_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->ATO); } static DEVICE_ATTR_RO(app_tag_own); static ssize_t thin_provisioning_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->lbpme); } static DEVICE_ATTR_RO(thin_provisioning); /* sysfs_match_string() requires dense arrays */ static const char *lbp_mode[] = { [SD_LBP_FULL] = "full", [SD_LBP_UNMAP] = "unmap", [SD_LBP_WS16] = "writesame_16", [SD_LBP_WS10] = "writesame_10", [SD_LBP_ZERO] = "writesame_zero", [SD_LBP_DISABLE] = "disabled", }; static ssize_t provisioning_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%s\n", lbp_mode[sdkp->provisioning_mode]); } static ssize_t provisioning_mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; struct queue_limits lim; int mode, err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (sdp->type != TYPE_DISK) return -EINVAL; mode = sysfs_match_string(lbp_mode, buf); if (mode < 0) return -EINVAL; lim = queue_limits_start_update(sdkp->disk->queue); sd_config_discard(sdkp, &lim, mode); err = queue_limits_commit_update_frozen(sdkp->disk->queue, &lim); if (err) return err; return count; } static DEVICE_ATTR_RW(provisioning_mode); /* sysfs_match_string() requires dense arrays */ static const char *zeroing_mode[] = { [SD_ZERO_WRITE] = "write", [SD_ZERO_WS] = "writesame", [SD_ZERO_WS16_UNMAP] = "writesame_16_unmap", [SD_ZERO_WS10_UNMAP] = "writesame_10_unmap", }; static ssize_t zeroing_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%s\n", zeroing_mode[sdkp->zeroing_mode]); } static ssize_t zeroing_mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); int mode; if (!capable(CAP_SYS_ADMIN)) return -EACCES; mode = sysfs_match_string(zeroing_mode, buf); if (mode < 0) return -EINVAL; sdkp->zeroing_mode = mode; return count; } static DEVICE_ATTR_RW(zeroing_mode); static ssize_t max_medium_access_timeouts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->max_medium_access_timeouts); } static ssize_t max_medium_access_timeouts_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; err = kstrtouint(buf, 10, &sdkp->max_medium_access_timeouts); return err ? err : count; } static DEVICE_ATTR_RW(max_medium_access_timeouts); static ssize_t max_write_same_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->max_ws_blocks); } static ssize_t max_write_same_blocks_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; struct queue_limits lim; unsigned long max; int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) return -EINVAL; err = kstrtoul(buf, 10, &max); if (err) return err; if (max == 0) sdp->no_write_same = 1; else if (max <= SD_MAX_WS16_BLOCKS) { sdp->no_write_same = 0; sdkp->max_ws_blocks = max; } lim = queue_limits_start_update(sdkp->disk->queue); sd_config_write_same(sdkp, &lim); err = queue_limits_commit_update_frozen(sdkp->disk->queue, &lim); if (err) return err; return count; } static DEVICE_ATTR_RW(max_write_same_blocks); static ssize_t zoned_cap_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); if (sdkp->device->type == TYPE_ZBC) return sprintf(buf, "host-managed\n"); if (sdkp->zoned == 1) return sprintf(buf, "host-aware\n"); if (sdkp->zoned == 2) return sprintf(buf, "drive-managed\n"); return sprintf(buf, "none\n"); } static DEVICE_ATTR_RO(zoned_cap); static ssize_t max_retries_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdev = sdkp->device; int retries, err; err = kstrtoint(buf, 10, &retries); if (err) return err; if (retries == SCSI_CMD_RETRIES_NO_LIMIT || retries <= SD_MAX_RETRIES) { sdkp->max_retries = retries; return count; } sdev_printk(KERN_ERR, sdev, "max_retries must be between -1 and %d\n", SD_MAX_RETRIES); return -EINVAL; } static ssize_t max_retries_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%d\n", sdkp->max_retries); } static DEVICE_ATTR_RW(max_retries); static struct attribute *sd_disk_attrs[] = { &dev_attr_cache_type.attr, &dev_attr_FUA.attr, &dev_attr_allow_restart.attr, &dev_attr_manage_start_stop.attr, &dev_attr_manage_system_start_stop.attr, &dev_attr_manage_runtime_start_stop.attr, &dev_attr_manage_shutdown.attr, &dev_attr_manage_restart.attr, &dev_attr_protection_type.attr, &dev_attr_protection_mode.attr, &dev_attr_app_tag_own.attr, &dev_attr_thin_provisioning.attr, &dev_attr_provisioning_mode.attr, &dev_attr_zeroing_mode.attr, &dev_attr_max_write_same_blocks.attr, &dev_attr_max_medium_access_timeouts.attr, &dev_attr_zoned_cap.attr, &dev_attr_max_retries.attr, NULL, }; ATTRIBUTE_GROUPS(sd_disk); static void scsi_disk_release(struct device *dev) { struct scsi_disk *sdkp = to_scsi_disk(dev); ida_free(&sd_index_ida, sdkp->index); put_device(&sdkp->device->sdev_gendev); free_opal_dev(sdkp->opal_dev); kfree(sdkp); } static struct class sd_disk_class = { .name = "scsi_disk", .dev_release = scsi_disk_release, .dev_groups = sd_disk_groups, }; /* * Don't request a new module, as that could deadlock in multipath * environment. */ static void sd_default_probe(dev_t devt) { } /* * Device no to disk mapping: * * major disc2 disc p1 * |............|.............|....|....| <- dev_t * 31 20 19 8 7 4 3 0 * * Inside a major, we have 16k disks, however mapped non- * contiguously. The first 16 disks are for major0, the next * ones with major1, ... Disk 256 is for major0 again, disk 272 * for major1, ... * As we stay compatible with our numbering scheme, we can reuse * the well-know SCSI majors 8, 65--71, 136--143. */ static int sd_major(int major_idx) { switch (major_idx) { case 0: return SCSI_DISK0_MAJOR; case 1 ... 7: return SCSI_DISK1_MAJOR + major_idx - 1; case 8 ... 15: return SCSI_DISK8_MAJOR + major_idx - 8; default: BUG(); return 0; /* shut up gcc */ } } #ifdef CONFIG_BLK_SED_OPAL static int sd_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, bool send) { struct scsi_disk *sdkp = data; struct scsi_device *sdev = sdkp->device; u8 cdb[12] = { 0, }; const struct scsi_exec_args exec_args = { .req_flags = BLK_MQ_REQ_PM, }; int ret; cdb[0] = send ? SECURITY_PROTOCOL_OUT : SECURITY_PROTOCOL_IN; cdb[1] = secp; put_unaligned_be16(spsp, &cdb[2]); put_unaligned_be32(len, &cdb[6]); ret = scsi_execute_cmd(sdev, cdb, send ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, buffer, len, SD_TIMEOUT, sdkp->max_retries, &exec_args); return ret <= 0 ? ret : -EIO; } #endif /* CONFIG_BLK_SED_OPAL */ /* * Look up the DIX operation based on whether the command is read or * write and whether dix and dif are enabled. */ static unsigned int sd_prot_op(bool write, bool dix, bool dif) { /* Lookup table: bit 2 (write), bit 1 (dix), bit 0 (dif) */ static const unsigned int ops[] = { /* wrt dix dif */ SCSI_PROT_NORMAL, /* 0 0 0 */ SCSI_PROT_READ_STRIP, /* 0 0 1 */ SCSI_PROT_READ_INSERT, /* 0 1 0 */ SCSI_PROT_READ_PASS, /* 0 1 1 */ SCSI_PROT_NORMAL, /* 1 0 0 */ SCSI_PROT_WRITE_INSERT, /* 1 0 1 */ SCSI_PROT_WRITE_STRIP, /* 1 1 0 */ SCSI_PROT_WRITE_PASS, /* 1 1 1 */ }; return ops[write << 2 | dix << 1 | dif]; } /* * Returns a mask of the protection flags that are valid for a given DIX * operation. */ static unsigned int sd_prot_flag_mask(unsigned int prot_op) { static const unsigned int flag_mask[] = { [SCSI_PROT_NORMAL] = 0, [SCSI_PROT_READ_STRIP] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT, [SCSI_PROT_READ_INSERT] = SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, [SCSI_PROT_READ_PASS] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, [SCSI_PROT_WRITE_INSERT] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_REF_INCREMENT, [SCSI_PROT_WRITE_STRIP] = SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, [SCSI_PROT_WRITE_PASS] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, }; return flag_mask[prot_op]; } static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, unsigned int dix, unsigned int dif) { struct request *rq = scsi_cmd_to_rq(scmd); struct bio *bio = rq->bio; unsigned int prot_op = sd_prot_op(rq_data_dir(rq), dix, dif); unsigned int protect = 0; if (dix) { /* DIX Type 0, 1, 2, 3 */ if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM)) scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM; if (bio_integrity_flagged(bio, BIP_CHECK_GUARD)) scmd->prot_flags |= SCSI_PROT_GUARD_CHECK; } if (dif != T10_PI_TYPE3_PROTECTION) { /* DIX/DIF Type 0, 1, 2 */ scmd->prot_flags |= SCSI_PROT_REF_INCREMENT; if (bio_integrity_flagged(bio, BIP_CHECK_REFTAG)) scmd->prot_flags |= SCSI_PROT_REF_CHECK; } if (dif) { /* DIX/DIF Type 1, 2, 3 */ scmd->prot_flags |= SCSI_PROT_TRANSFER_PI; if (bio_integrity_flagged(bio, BIP_DISK_NOCHECK)) protect = 3 << 5; /* Disable target PI checking */ else protect = 1 << 5; /* Enable target PI checking */ } scsi_set_prot_op(scmd, prot_op); scsi_set_prot_type(scmd, dif); scmd->prot_flags &= sd_prot_flag_mask(prot_op); return protect; } static void *sd_set_special_bvec(struct request *rq, unsigned int data_len) { struct page *page; page = mempool_alloc(sd_page_pool, GFP_ATOMIC); if (!page) return NULL; clear_highpage(page); bvec_set_page(&rq->special_vec, page, data_len, 0); rq->rq_flags |= RQF_SPECIAL_PAYLOAD; return bvec_virt(&rq->special_vec); } static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); unsigned int data_len = 24; char *buf; buf = sd_set_special_bvec(rq, data_len); if (!buf) return BLK_STS_RESOURCE; cmd->cmd_len = 10; cmd->cmnd[0] = UNMAP; cmd->cmnd[8] = 24; put_unaligned_be16(6 + 16, &buf[0]); put_unaligned_be16(16, &buf[2]); put_unaligned_be64(lba, &buf[8]); put_unaligned_be32(nr_blocks, &buf[16]); cmd->allowed = sdkp->max_retries; cmd->transfersize = data_len; rq->timeout = SD_TIMEOUT; return scsi_alloc_sgtables(cmd); } static void sd_config_atomic(struct scsi_disk *sdkp, struct queue_limits *lim) { unsigned int logical_block_size = sdkp->device->sector_size, physical_block_size_sectors, max_atomic, unit_min, unit_max; if ((!sdkp->max_atomic && !sdkp->max_atomic_with_boundary) || sdkp->protection_type == T10_PI_TYPE2_PROTECTION) return; physical_block_size_sectors = sdkp->physical_block_size / sdkp->device->sector_size; unit_min = rounddown_pow_of_two(sdkp->atomic_granularity ? sdkp->atomic_granularity : physical_block_size_sectors); /* * Only use atomic boundary when we have the odd scenario of * sdkp->max_atomic == 0, which the spec does permit. */ if (sdkp->max_atomic) { max_atomic = sdkp->max_atomic; unit_max = rounddown_pow_of_two(sdkp->max_atomic); sdkp->use_atomic_write_boundary = 0; } else { max_atomic = sdkp->max_atomic_with_boundary; unit_max = rounddown_pow_of_two(sdkp->max_atomic_boundary); sdkp->use_atomic_write_boundary = 1; } /* * Ensure compliance with granularity and alignment. For now, keep it * simple and just don't support atomic writes for values mismatched * with max_{boundary}atomic, physical block size, and * atomic_granularity itself. * * We're really being distrustful by checking unit_max also... */ if (sdkp->atomic_granularity > 1) { if (unit_min > 1 && unit_min % sdkp->atomic_granularity) return; if (unit_max > 1 && unit_max % sdkp->atomic_granularity) return; } if (sdkp->atomic_alignment > 1) { if (unit_min > 1 && unit_min % sdkp->atomic_alignment) return; if (unit_max > 1 && unit_max % sdkp->atomic_alignment) return; } lim->atomic_write_hw_max = max_atomic * logical_block_size; lim->atomic_write_hw_boundary = 0; lim->atomic_write_hw_unit_min = unit_min * logical_block_size; lim->atomic_write_hw_unit_max = unit_max * logical_block_size; lim->features |= BLK_FEAT_ATOMIC_WRITES; } static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; if (!sd_set_special_bvec(rq, data_len)) return BLK_STS_RESOURCE; cmd->cmd_len = 16; cmd->cmnd[0] = WRITE_SAME_16; if (unmap) cmd->cmnd[1] = 0x8; /* UNMAP */ put_unaligned_be64(lba, &cmd->cmnd[2]); put_unaligned_be32(nr_blocks, &cmd->cmnd[10]); cmd->allowed = sdkp->max_retries; cmd->transfersize = data_len; rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT; return scsi_alloc_sgtables(cmd); } static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; if (!sd_set_special_bvec(rq, data_len)) return BLK_STS_RESOURCE; cmd->cmd_len = 10; cmd->cmnd[0] = WRITE_SAME; if (unmap) cmd->cmnd[1] = 0x8; /* UNMAP */ put_unaligned_be32(lba, &cmd->cmnd[2]); put_unaligned_be16(nr_blocks, &cmd->cmnd[7]); cmd->allowed = sdkp->max_retries; cmd->transfersize = data_len; rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT; return scsi_alloc_sgtables(cmd); } static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); if (!(rq->cmd_flags & REQ_NOUNMAP)) { switch (sdkp->zeroing_mode) { case SD_ZERO_WS16_UNMAP: return sd_setup_write_same16_cmnd(cmd, true); case SD_ZERO_WS10_UNMAP: return sd_setup_write_same10_cmnd(cmd, true); } } if (sdp->no_write_same) { rq->rq_flags |= RQF_QUIET; return BLK_STS_TARGET; } if (sdkp->ws16 || lba > 0xffffffff || nr_blocks > 0xffff) return sd_setup_write_same16_cmnd(cmd, false); return sd_setup_write_same10_cmnd(cmd, false); } static void sd_disable_write_same(struct scsi_disk *sdkp) { sdkp->device->no_write_same = 1; sdkp->max_ws_blocks = 0; blk_queue_disable_write_zeroes(sdkp->disk->queue); } static void sd_config_write_same(struct scsi_disk *sdkp, struct queue_limits *lim) { unsigned int logical_block_size = sdkp->device->sector_size; if (sdkp->device->no_write_same) { sdkp->max_ws_blocks = 0; goto out; } /* Some devices can not handle block counts above 0xffff despite * supporting WRITE SAME(16). Consequently we default to 64k * blocks per I/O unless the device explicitly advertises a * bigger limit. */ if (sdkp->max_ws_blocks > SD_MAX_WS10_BLOCKS) sdkp->max_ws_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS16_BLOCKS); else if (sdkp->ws16 || sdkp->ws10 || sdkp->device->no_report_opcodes) sdkp->max_ws_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS10_BLOCKS); else { sdkp->device->no_write_same = 1; sdkp->max_ws_blocks = 0; } if (sdkp->lbprz && sdkp->lbpws) sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP; else if (sdkp->lbprz && sdkp->lbpws10) sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP; else if (sdkp->max_ws_blocks) sdkp->zeroing_mode = SD_ZERO_WS; else sdkp->zeroing_mode = SD_ZERO_WRITE; if (sdkp->max_ws_blocks && sdkp->physical_block_size > logical_block_size) { /* * Reporting a maximum number of blocks that is not aligned * on the device physical size would cause a large write same * request to be split into physically unaligned chunks by * __blkdev_issue_write_zeroes() even if the caller of this * functions took care to align the large request. So make sure * the maximum reported is aligned to the device physical block * size. This is only an optional optimization for regular * disks, but this is mandatory to avoid failure of large write * same requests directed at sequential write required zones of * host-managed ZBC disks. */ sdkp->max_ws_blocks = round_down(sdkp->max_ws_blocks, bytes_to_logical(sdkp->device, sdkp->physical_block_size)); } out: lim->max_write_zeroes_sectors = sdkp->max_ws_blocks * (logical_block_size >> SECTOR_SHIFT); if (sdkp->zeroing_mode == SD_ZERO_WS16_UNMAP || sdkp->zeroing_mode == SD_ZERO_WS10_UNMAP) lim->max_hw_wzeroes_unmap_sectors = lim->max_write_zeroes_sectors; } static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); /* flush requests don't perform I/O, zero the S/G table */ memset(&cmd->sdb, 0, sizeof(cmd->sdb)); if (cmd->device->use_16_for_sync) { cmd->cmnd[0] = SYNCHRONIZE_CACHE_16; cmd->cmd_len = 16; } else { cmd->cmnd[0] = SYNCHRONIZE_CACHE; cmd->cmd_len = 10; } cmd->transfersize = 0; cmd->allowed = sdkp->max_retries; rq->timeout = rq->q->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; return BLK_STS_OK; } /** * sd_group_number() - Compute the GROUP NUMBER field * @cmd: SCSI command for which to compute the value of the six-bit GROUP NUMBER * field. * * From SBC-5 r05 (https://www.t10.org/cgi-bin/ac.pl?t=f&f=sbc5r05.pdf): * 0: no relative lifetime. * 1: shortest relative lifetime. * 2: second shortest relative lifetime. * 3 - 0x3d: intermediate relative lifetimes. * 0x3e: second longest relative lifetime. * 0x3f: longest relative lifetime. */ static u8 sd_group_number(struct scsi_cmnd *cmd) { const struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); if (!sdkp->rscs) return 0; return min3((u32)rq->bio->bi_write_hint, (u32)sdkp->permanent_stream_count, 0x3fu); } static blk_status_t sd_setup_rw32_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags, unsigned int dld) { cmd->cmd_len = SD_EXT_CDB_SIZE; cmd->cmnd[0] = VARIABLE_LENGTH_CMD; cmd->cmnd[6] = sd_group_number(cmd); cmd->cmnd[7] = 0x18; /* Additional CDB len */ cmd->cmnd[9] = write ? WRITE_32 : READ_32; cmd->cmnd[10] = flags; cmd->cmnd[11] = dld & 0x07; put_unaligned_be64(lba, &cmd->cmnd[12]); put_unaligned_be32(lba, &cmd->cmnd[20]); /* Expected Indirect LBA */ put_unaligned_be32(nr_blocks, &cmd->cmnd[28]); return BLK_STS_OK; } static blk_status_t sd_setup_rw16_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags, unsigned int dld) { cmd->cmd_len = 16; cmd->cmnd[0] = write ? WRITE_16 : READ_16; cmd->cmnd[1] = flags | ((dld >> 2) & 0x01); cmd->cmnd[14] = ((dld & 0x03) << 6) | sd_group_number(cmd); cmd->cmnd[15] = 0; put_unaligned_be64(lba, &cmd->cmnd[2]); put_unaligned_be32(nr_blocks, &cmd->cmnd[10]); return BLK_STS_OK; } static blk_status_t sd_setup_rw10_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags) { cmd->cmd_len = 10; cmd->cmnd[0] = write ? WRITE_10 : READ_10; cmd->cmnd[1] = flags; cmd->cmnd[6] = sd_group_number(cmd); cmd->cmnd[9] = 0; put_unaligned_be32(lba, &cmd->cmnd[2]); put_unaligned_be16(nr_blocks, &cmd->cmnd[7]); return BLK_STS_OK; } static blk_status_t sd_setup_rw6_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags) { /* Avoid that 0 blocks gets translated into 256 blocks. */ if (WARN_ON_ONCE(nr_blocks == 0)) return BLK_STS_IOERR; if (unlikely(flags & 0x8)) { /* * This happens only if this drive failed 10byte rw * command with ILLEGAL_REQUEST during operation and * thus turned off use_10_for_rw. */ scmd_printk(KERN_ERR, cmd, "FUA write on READ/WRITE(6) drive\n"); return BLK_STS_IOERR; } cmd->cmd_len = 6; cmd->cmnd[0] = write ? WRITE_6 : READ_6; cmd->cmnd[1] = (lba >> 16) & 0x1f; cmd->cmnd[2] = (lba >> 8) & 0xff; cmd->cmnd[3] = lba & 0xff; cmd->cmnd[4] = nr_blocks; cmd->cmnd[5] = 0; return BLK_STS_OK; } /* * Check if a command has a duration limit set. If it does, and the target * device supports CDL and the feature is enabled, return the limit * descriptor index to use. Return 0 (no limit) otherwise. */ static int sd_cdl_dld(struct scsi_disk *sdkp, struct scsi_cmnd *scmd) { struct scsi_device *sdp = sdkp->device; int hint; if (!sdp->cdl_supported || !sdp->cdl_enable) return 0; /* * Use "no limit" if the request ioprio does not specify a duration * limit hint. */ hint = IOPRIO_PRIO_HINT(req_get_ioprio(scsi_cmd_to_rq(scmd))); if (hint < IOPRIO_HINT_DEV_DURATION_LIMIT_1 || hint > IOPRIO_HINT_DEV_DURATION_LIMIT_7) return 0; return (hint - IOPRIO_HINT_DEV_DURATION_LIMIT_1) + 1; } static blk_status_t sd_setup_atomic_cmnd(struct scsi_cmnd *cmd, sector_t lba, unsigned int nr_blocks, bool boundary, unsigned char flags) { cmd->cmd_len = 16; cmd->cmnd[0] = WRITE_ATOMIC_16; cmd->cmnd[1] = flags; put_unaligned_be64(lba, &cmd->cmnd[2]); put_unaligned_be16(nr_blocks, &cmd->cmnd[12]); if (boundary) put_unaligned_be16(nr_blocks, &cmd->cmnd[10]); else put_unaligned_be16(0, &cmd->cmnd[10]); put_unaligned_be16(nr_blocks, &cmd->cmnd[12]); cmd->cmnd[14] = 0; cmd->cmnd[15] = 0; return BLK_STS_OK; } static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; struct scsi_disk *sdkp = scsi_disk(rq->q->disk); sector_t lba = sectors_to_logical(sdp, blk_rq_pos(rq)); sector_t threshold; unsigned int nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); unsigned int mask = logical_to_sectors(sdp, 1) - 1; bool write = rq_data_dir(rq) == WRITE; unsigned char protect, fua; unsigned int dld; blk_status_t ret; unsigned int dif; bool dix; ret = scsi_alloc_sgtables(cmd); if (ret != BLK_STS_OK) return ret; ret = BLK_STS_IOERR; if (!scsi_device_online(sdp) || sdp->changed) { scmd_printk(KERN_ERR, cmd, "device offline or changed\n"); goto fail; } if (blk_rq_pos(rq) + blk_rq_sectors(rq) > get_capacity(rq->q->disk)) { scmd_printk(KERN_ERR, cmd, "access beyond end of device\n"); goto fail; } if ((blk_rq_pos(rq) & mask) || (blk_rq_sectors(rq) & mask)) { scmd_printk(KERN_ERR, cmd, "request not aligned to the logical block size\n"); goto fail; } /* * Some SD card readers can't handle accesses which touch the * last one or two logical blocks. Split accesses as needed. */ threshold = sdkp->capacity - SD_LAST_BUGGY_SECTORS; if (unlikely(sdp->last_sector_bug && lba + nr_blocks > threshold)) { if (lba < threshold) { /* Access up to the threshold but not beyond */ nr_blocks = threshold - lba; } else { /* Access only a single logical block */ nr_blocks = 1; } } fua = rq->cmd_flags & REQ_FUA ? 0x8 : 0; dix = scsi_prot_sg_count(cmd); dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type); dld = sd_cdl_dld(sdkp, cmd); if (dif || dix) protect = sd_setup_protect_cmnd(cmd, dix, dif); else protect = 0; if (protect && sdkp->protection_type == T10_PI_TYPE2_PROTECTION) { ret = sd_setup_rw32_cmnd(cmd, write, lba, nr_blocks, protect | fua, dld); } else if (rq->cmd_flags & REQ_ATOMIC) { ret = sd_setup_atomic_cmnd(cmd, lba, nr_blocks, sdkp->use_atomic_write_boundary, protect | fua); } else if (sdp->use_16_for_rw || (nr_blocks > 0xffff)) { ret = sd_setup_rw16_cmnd(cmd, write, lba, nr_blocks, protect | fua, dld); } else if ((nr_blocks > 0xff) || (lba > 0x1fffff) || sdp->use_10_for_rw || protect || rq->bio->bi_write_hint) { ret = sd_setup_rw10_cmnd(cmd, write, lba, nr_blocks, protect | fua); } else { ret = sd_setup_rw6_cmnd(cmd, write, lba, nr_blocks, protect | fua); } if (unlikely(ret != BLK_STS_OK)) goto fail; /* * We shouldn't disconnect in the middle of a sector, so with a dumb * host adapter, it's safe to assume that we can at least transfer * this many bytes between each connect / disconnect. */ cmd->transfersize = sdp->sector_size; cmd->underflow = nr_blocks << 9; cmd->allowed = sdkp->max_retries; cmd->sdb.length = nr_blocks * sdp->sector_size; SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, cmd, "%s: block=%llu, count=%d\n", __func__, (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq))); SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, cmd, "%s %d/%u 512 byte blocks.\n", write ? "writing" : "reading", nr_blocks, blk_rq_sectors(rq))); /* * This indicates that the command is ready from our end to be queued. */ return BLK_STS_OK; fail: scsi_free_sgtables(cmd); return ret; } static blk_status_t sd_init_command(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); switch (req_op(rq)) { case REQ_OP_DISCARD: switch (scsi_disk(rq->q->disk)->provisioning_mode) { case SD_LBP_UNMAP: return sd_setup_unmap_cmnd(cmd); case SD_LBP_WS16: return sd_setup_write_same16_cmnd(cmd, true); case SD_LBP_WS10: return sd_setup_write_same10_cmnd(cmd, true); case SD_LBP_ZERO: return sd_setup_write_same10_cmnd(cmd, false); default: return BLK_STS_TARGET; } case REQ_OP_WRITE_ZEROES: return sd_setup_write_zeroes_cmnd(cmd); case REQ_OP_FLUSH: return sd_setup_flush_cmnd(cmd); case REQ_OP_READ: case REQ_OP_WRITE: return sd_setup_read_write_cmnd(cmd); case REQ_OP_ZONE_RESET: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER, false); case REQ_OP_ZONE_RESET_ALL: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER, true); case REQ_OP_ZONE_OPEN: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_OPEN_ZONE, false); case REQ_OP_ZONE_CLOSE: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_CLOSE_ZONE, false); case REQ_OP_ZONE_FINISH: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_FINISH_ZONE, false); default: WARN_ON_ONCE(1); return BLK_STS_NOTSUPP; } } static void sd_uninit_command(struct scsi_cmnd *SCpnt) { struct request *rq = scsi_cmd_to_rq(SCpnt); if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) mempool_free(rq->special_vec.bv_page, sd_page_pool); } static bool sd_need_revalidate(struct gendisk *disk, struct scsi_disk *sdkp) { if (sdkp->device->removable || sdkp->write_prot) { if (disk_check_media_change(disk)) return true; } /* * Force a full rescan after ioctl(BLKRRPART). While the disk state has * nothing to do with partitions, BLKRRPART is used to force a full * revalidate after things like a format for historical reasons. */ return test_bit(GD_NEED_PART_SCAN, &disk->state); } /** * sd_open - open a scsi disk device * @disk: disk to open * @mode: open mode * * Returns 0 if successful. Returns a negated errno value in case * of error. * * Note: This can be called from a user context (e.g. fsck(1) ) * or from within the kernel (e.g. as a result of a mount(1) ). * In the latter case @inode and @filp carry an abridged amount * of information as noted above. * * Locking: called with disk->open_mutex held. **/ static int sd_open(struct gendisk *disk, blk_mode_t mode) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdev = sdkp->device; int retval; if (scsi_device_get(sdev)) return -ENXIO; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_open\n")); /* * If the device is in error recovery, wait until it is done. * If the device is offline, then disallow any access to it. */ retval = -ENXIO; if (!scsi_block_when_processing_errors(sdev)) goto error_out; if (sd_need_revalidate(disk, sdkp)) sd_revalidate_disk(disk); /* * If the drive is empty, just let the open fail. */ retval = -ENOMEDIUM; if (sdev->removable && !sdkp->media_present && !(mode & BLK_OPEN_NDELAY)) goto error_out; /* * If the device has the write protect tab set, have the open fail * if the user expects to be able to write to the thing. */ retval = -EROFS; if (sdkp->write_prot && (mode & BLK_OPEN_WRITE)) goto error_out; /* * It is possible that the disk changing stuff resulted in * the device being taken offline. If this is the case, * report this to the user, and don't pretend that the * open actually succeeded. */ retval = -ENXIO; if (!scsi_device_online(sdev)) goto error_out; if ((atomic_inc_return(&sdkp->openers) == 1) && sdev->removable) { if (scsi_block_when_processing_errors(sdev)) scsi_set_medium_removal(sdev, SCSI_REMOVAL_PREVENT); } return 0; error_out: scsi_device_put(sdev); return retval; } /** * sd_release - invoked when the (last) close(2) is called on this * scsi disk. * @disk: disk to release * * Returns 0. * * Note: may block (uninterruptible) if error recovery is underway * on this disk. * * Locking: called with disk->open_mutex held. **/ static void sd_release(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdev = sdkp->device; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_release\n")); if (atomic_dec_return(&sdkp->openers) == 0 && sdev->removable) { if (scsi_block_when_processing_errors(sdev)) scsi_set_medium_removal(sdev, SCSI_REMOVAL_ALLOW); } scsi_device_put(sdev); } static int sd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; struct Scsi_Host *host = sdp->host; sector_t capacity = logical_to_sectors(sdp, sdkp->capacity); int diskinfo[4]; /* default to most commonly used values */ diskinfo[0] = 0x40; /* 1 << 6 */ diskinfo[1] = 0x20; /* 1 << 5 */ diskinfo[2] = capacity >> 11; /* override with calculated, extended default, or driver values */ if (host->hostt->bios_param) host->hostt->bios_param(sdp, disk, capacity, diskinfo); else scsicam_bios_param(disk, capacity, diskinfo); geo->heads = diskinfo[0]; geo->sectors = diskinfo[1]; geo->cylinders = diskinfo[2]; return 0; } /** * sd_ioctl - process an ioctl * @bdev: target block device * @mode: open mode * @cmd: ioctl command number * @arg: this is third argument given to ioctl(2) system call. * Often contains a pointer. * * Returns 0 if successful (some ioctls return positive numbers on * success as well). Returns a negated errno value in case of error. * * Note: most ioctls are forward onto the block subsystem or further * down in the scsi subsystem. **/ static int sd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct gendisk *disk = bdev->bd_disk; struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; void __user *p = (void __user *)arg; int error; SCSI_LOG_IOCTL(1, sd_printk(KERN_INFO, sdkp, "sd_ioctl: disk=%s, cmd=0x%x\n", disk->disk_name, cmd)); if (bdev_is_partition(bdev) && !capable(CAP_SYS_RAWIO)) return -ENOIOCTLCMD; /* * If we are in the middle of error recovery, don't let anyone * else try and use this device. Also, if error recovery fails, it * may try and take the device offline, in which case all further * access to the device is prohibited. */ error = scsi_ioctl_block_when_processing_errors(sdp, cmd, (mode & BLK_OPEN_NDELAY)); if (error) return error; if (is_sed_ioctl(cmd)) return sed_ioctl(sdkp->opal_dev, cmd, p); return scsi_ioctl(sdp, mode & BLK_OPEN_WRITE, cmd, p); } static void set_media_not_present(struct scsi_disk *sdkp) { if (sdkp->media_present) sdkp->device->changed = 1; if (sdkp->device->removable) { sdkp->media_present = 0; sdkp->capacity = 0; } } static int media_not_present(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) { if (!scsi_sense_valid(sshdr)) return 0; /* not invoked for commands that could return deferred errors */ switch (sshdr->sense_key) { case UNIT_ATTENTION: case NOT_READY: /* medium not present */ if (sshdr->asc == 0x3A) { set_media_not_present(sdkp); return 1; } } return 0; } /** * sd_check_events - check media events * @disk: kernel device descriptor * @clearing: disk events currently being cleared * * Returns mask of DISK_EVENT_*. * * Note: this function is invoked from the block subsystem. **/ static unsigned int sd_check_events(struct gendisk *disk, unsigned int clearing) { struct scsi_disk *sdkp = disk->private_data; struct scsi_device *sdp; int retval; bool disk_changed; if (!sdkp) return 0; sdp = sdkp->device; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_check_events\n")); /* * If the device is offline, don't send any commands - just pretend as * if the command failed. If the device ever comes back online, we * can deal with it then. It is only because of unrecoverable errors * that we would ever take a device offline in the first place. */ if (!scsi_device_online(sdp)) { set_media_not_present(sdkp); goto out; } /* * Using TEST_UNIT_READY enables differentiation between drive with * no cartridge loaded - NOT READY, drive with changed cartridge - * UNIT ATTENTION, or with same cartridge - GOOD STATUS. * * Drives that auto spin down. eg iomega jaz 1G, will be started * by sd_spinup_disk() from sd_revalidate_disk(), which happens whenever * sd_revalidate() is called. */ if (scsi_block_when_processing_errors(sdp)) { struct scsi_sense_hdr sshdr = { 0, }; retval = scsi_test_unit_ready(sdp, SD_TIMEOUT, sdkp->max_retries, &sshdr); /* failed to execute TUR, assume media not present */ if (retval < 0 || host_byte(retval)) { set_media_not_present(sdkp); goto out; } if (media_not_present(sdkp, &sshdr)) goto out; } /* * For removable scsi disk we have to recognise the presence * of a disk in the drive. */ if (!sdkp->media_present) sdp->changed = 1; sdkp->media_present = 1; out: /* * sdp->changed is set under the following conditions: * * Medium present state has changed in either direction. * Device has indicated UNIT_ATTENTION. */ disk_changed = sdp->changed; sdp->changed = 0; return disk_changed ? DISK_EVENT_MEDIA_CHANGE : 0; } static int sd_sync_cache(struct scsi_disk *sdkp) { int res; struct scsi_device *sdp = sdkp->device; const int timeout = sdp->request_queue->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; /* Leave the rest of the command zero to indicate flush everything. */ const unsigned char cmd[16] = { sdp->use_16_for_sync ? SYNCHRONIZE_CACHE_16 : SYNCHRONIZE_CACHE }; struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { { .allowed = 3, .result = SCMD_FAILURE_RESULT_ANY, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .req_flags = BLK_MQ_REQ_PM, .sshdr = &sshdr, .failures = &failures, }; if (!scsi_device_online(sdp)) return -ENODEV; res = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, NULL, 0, timeout, sdkp->max_retries, &exec_args); if (res) { sd_print_result(sdkp, "Synchronize Cache(10) failed", res); if (res < 0) return res; if (scsi_status_is_check_condition(res) && scsi_sense_valid(&sshdr)) { sd_print_sense_hdr(sdkp, &sshdr); /* we need to evaluate the error return */ if (sshdr.asc == 0x3a || /* medium not present */ sshdr.asc == 0x20 || /* invalid command */ (sshdr.asc == 0x74 && sshdr.ascq == 0x71)) /* drive is password locked */ /* this is no error here */ return 0; /* * If a format is in progress or if the drive does not * support sync, there is not much we can do because * this is called during shutdown or suspend so just * return success so those operations can proceed. */ if ((sshdr.asc == 0x04 && sshdr.ascq == 0x04) || sshdr.sense_key == ILLEGAL_REQUEST) return 0; } switch (host_byte(res)) { /* ignore errors due to racing a disconnection */ case DID_BAD_TARGET: case DID_NO_CONNECT: return 0; /* signal the upper layer it might try again */ case DID_BUS_BUSY: case DID_IMM_RETRY: case DID_REQUEUE: case DID_SOFT_ERROR: return -EBUSY; default: return -EIO; } } return 0; } static void sd_rescan(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); sd_revalidate_disk(sdkp->disk); } static int sd_get_unique_id(struct gendisk *disk, u8 id[16], enum blk_unique_id type) { struct scsi_device *sdev = scsi_disk(disk)->device; const struct scsi_vpd *vpd; const unsigned char *d; int ret = -ENXIO, len; rcu_read_lock(); vpd = rcu_dereference(sdev->vpd_pg83); if (!vpd) goto out_unlock; ret = -EINVAL; for (d = vpd->data + 4; d < vpd->data + vpd->len; d += d[3] + 4) { /* we only care about designators with LU association */ if (((d[1] >> 4) & 0x3) != 0x00) continue; if ((d[1] & 0xf) != type) continue; /* * Only exit early if a 16-byte descriptor was found. Otherwise * keep looking as one with more entropy might still show up. */ len = d[3]; if (len != 8 && len != 12 && len != 16) continue; ret = len; memcpy(id, d + 4, len); if (len == 16) break; } out_unlock: rcu_read_unlock(); return ret; } static int sd_scsi_to_pr_err(struct scsi_sense_hdr *sshdr, int result) { switch (host_byte(result)) { case DID_TRANSPORT_MARGINAL: case DID_TRANSPORT_DISRUPTED: case DID_BUS_BUSY: return PR_STS_RETRY_PATH_FAILURE; case DID_NO_CONNECT: return PR_STS_PATH_FAILED; case DID_TRANSPORT_FAILFAST: return PR_STS_PATH_FAST_FAILED; } switch (status_byte(result)) { case SAM_STAT_RESERVATION_CONFLICT: return PR_STS_RESERVATION_CONFLICT; case SAM_STAT_CHECK_CONDITION: if (!scsi_sense_valid(sshdr)) return PR_STS_IOERR; if (sshdr->sense_key == ILLEGAL_REQUEST && (sshdr->asc == 0x26 || sshdr->asc == 0x24)) return -EINVAL; fallthrough; default: return PR_STS_IOERR; } } static int sd_pr_in_command(struct block_device *bdev, u8 sa, unsigned char *data, int data_len) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdev = sdkp->device; struct scsi_sense_hdr sshdr; u8 cmd[10] = { PERSISTENT_RESERVE_IN, sa }; struct scsi_failure failure_defs[] = { { .sense = UNIT_ATTENTION, .asc = SCMD_FAILURE_ASC_ANY, .ascq = SCMD_FAILURE_ASCQ_ANY, .allowed = 5, .result = SAM_STAT_CHECK_CONDITION, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .failures = &failures, }; int result; put_unaligned_be16(data_len, &cmd[7]); result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, data, data_len, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (scsi_status_is_check_condition(result) && scsi_sense_valid(&sshdr)) { sdev_printk(KERN_INFO, sdev, "PR command failed: %d\n", result); scsi_print_sense_hdr(sdev, NULL, &sshdr); } if (result <= 0) return result; return sd_scsi_to_pr_err(&sshdr, result); } static int sd_pr_read_keys(struct block_device *bdev, struct pr_keys *keys_info) { int result, i, data_offset, num_copy_keys; u32 num_keys = keys_info->num_keys; int data_len; u8 *data; /* * Each reservation key takes 8 bytes and there is an 8-byte header * before the reservation key list. The total size must fit into the * 16-bit ALLOCATION LENGTH field. */ if (check_mul_overflow(num_keys, 8, &data_len) || check_add_overflow(data_len, 8, &data_len) || data_len > USHRT_MAX) return -EINVAL; data = kzalloc(data_len, GFP_KERNEL); if (!data) return -ENOMEM; result = sd_pr_in_command(bdev, READ_KEYS, data, data_len); if (result) goto free_data; keys_info->generation = get_unaligned_be32(&data[0]); keys_info->num_keys = get_unaligned_be32(&data[4]) / 8; data_offset = 8; num_copy_keys = min(num_keys, keys_info->num_keys); for (i = 0; i < num_copy_keys; i++) { keys_info->keys[i] = get_unaligned_be64(&data[data_offset]); data_offset += 8; } free_data: kfree(data); return result; } static int sd_pr_read_reservation(struct block_device *bdev, struct pr_held_reservation *rsv) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdev = sdkp->device; u8 data[24] = { }; int result, len; result = sd_pr_in_command(bdev, READ_RESERVATION, data, sizeof(data)); if (result) return result; len = get_unaligned_be32(&data[4]); if (!len) return 0; /* Make sure we have at least the key and type */ if (len < 14) { sdev_printk(KERN_INFO, sdev, "READ RESERVATION failed due to short return buffer of %d bytes\n", len); return -EINVAL; } rsv->generation = get_unaligned_be32(&data[0]); rsv->key = get_unaligned_be64(&data[8]); rsv->type = scsi_pr_type_to_block(data[21] & 0x0f); return 0; } static int sd_pr_out_command(struct block_device *bdev, u8 sa, u64 key, u64 sa_key, enum scsi_pr_type type, u8 flags) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdev = sdkp->device; struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { { .sense = UNIT_ATTENTION, .asc = SCMD_FAILURE_ASC_ANY, .ascq = SCMD_FAILURE_ASCQ_ANY, .allowed = 5, .result = SAM_STAT_CHECK_CONDITION, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .failures = &failures, }; int result; u8 cmd[16] = { 0, }; u8 data[24] = { 0, }; cmd[0] = PERSISTENT_RESERVE_OUT; cmd[1] = sa; cmd[2] = type; put_unaligned_be32(sizeof(data), &cmd[5]); put_unaligned_be64(key, &data[0]); put_unaligned_be64(sa_key, &data[8]); data[20] = flags; result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_OUT, &data, sizeof(data), SD_TIMEOUT, sdkp->max_retries, &exec_args); if (scsi_status_is_check_condition(result) && scsi_sense_valid(&sshdr)) { sdev_printk(KERN_INFO, sdev, "PR command failed: %d\n", result); scsi_print_sense_hdr(sdev, NULL, &sshdr); } if (result <= 0) return result; return sd_scsi_to_pr_err(&sshdr, result); } static int sd_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, u32 flags) { if (flags & ~PR_FL_IGNORE_KEY) return -EOPNOTSUPP; return sd_pr_out_command(bdev, (flags & PR_FL_IGNORE_KEY) ? 0x06 : 0x00, old_key, new_key, 0, (1 << 0) /* APTPL */); } static int sd_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, u32 flags) { if (flags) return -EOPNOTSUPP; return sd_pr_out_command(bdev, 0x01, key, 0, block_pr_type_to_scsi(type), 0); } static int sd_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { return sd_pr_out_command(bdev, 0x02, key, 0, block_pr_type_to_scsi(type), 0); } static int sd_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, enum pr_type type, bool abort) { return sd_pr_out_command(bdev, abort ? 0x05 : 0x04, old_key, new_key, block_pr_type_to_scsi(type), 0); } static int sd_pr_clear(struct block_device *bdev, u64 key) { return sd_pr_out_command(bdev, 0x03, key, 0, 0, 0); } static const struct pr_ops sd_pr_ops = { .pr_register = sd_pr_register, .pr_reserve = sd_pr_reserve, .pr_release = sd_pr_release, .pr_preempt = sd_pr_preempt, .pr_clear = sd_pr_clear, .pr_read_keys = sd_pr_read_keys, .pr_read_reservation = sd_pr_read_reservation, }; static void scsi_disk_free_disk(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); put_device(&sdkp->disk_dev); } /** * sd_eh_reset - reset error handling callback * @scmd: sd-issued command that has failed * * This function is called by the SCSI midlayer before starting * SCSI EH. When counting medium access failures we have to be * careful to register it only only once per device and SCSI EH run; * there might be several timed out commands which will cause the * 'max_medium_access_timeouts' counter to trigger after the first * SCSI EH run already and set the device to offline. * So this function resets the internal counter before starting SCSI EH. **/ static void sd_eh_reset(struct scsi_cmnd *scmd) { struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->q->disk); /* New SCSI EH run, reset gate variable */ sdkp->ignore_medium_access_errors = false; } /** * sd_eh_action - error handling callback * @scmd: sd-issued command that has failed * @eh_disp: The recovery disposition suggested by the midlayer * * This function is called by the SCSI midlayer upon completion of an * error test command (currently TEST UNIT READY). The result of sending * the eh command is passed in eh_disp. We're looking for devices that * fail medium access commands but are OK with non access commands like * test unit ready (so wrongly see the device as having a successful * recovery) **/ static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp) { struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->q->disk); struct scsi_device *sdev = scmd->device; if (!scsi_device_online(sdev) || !scsi_medium_access_command(scmd) || host_byte(scmd->result) != DID_TIME_OUT || eh_disp != SUCCESS) return eh_disp; /* * The device has timed out executing a medium access command. * However, the TEST UNIT READY command sent during error * handling completed successfully. Either the device is in the * process of recovering or has it suffered an internal failure * that prevents access to the storage medium. */ if (!sdkp->ignore_medium_access_errors) { sdkp->medium_access_timed_out++; sdkp->ignore_medium_access_errors = true; } /* * If the device keeps failing read/write commands but TEST UNIT * READY always completes successfully we assume that medium * access is no longer possible and take the device offline. */ if (sdkp->medium_access_timed_out >= sdkp->max_medium_access_timeouts) { scmd_printk(KERN_ERR, scmd, "Medium access timeout failure. Offlining disk!\n"); mutex_lock(&sdev->state_mutex); scsi_device_set_state(sdev, SDEV_OFFLINE); mutex_unlock(&sdev->state_mutex); return SUCCESS; } return eh_disp; } static unsigned int sd_completed_bytes(struct scsi_cmnd *scmd) { struct request *req = scsi_cmd_to_rq(scmd); struct scsi_device *sdev = scmd->device; unsigned int transferred, good_bytes; u64 start_lba, end_lba, bad_lba; /* * Some commands have a payload smaller than the device logical * block size (e.g. INQUIRY on a 4K disk). */ if (scsi_bufflen(scmd) <= sdev->sector_size) return 0; /* Check if we have a 'bad_lba' information */ if (!scsi_get_sense_info_fld(scmd->sense_buffer, SCSI_SENSE_BUFFERSIZE, &bad_lba)) return 0; /* * If the bad lba was reported incorrectly, we have no idea where * the error is. */ start_lba = sectors_to_logical(sdev, blk_rq_pos(req)); end_lba = start_lba + bytes_to_logical(sdev, scsi_bufflen(scmd)); if (bad_lba < start_lba || bad_lba >= end_lba) return 0; /* * resid is optional but mostly filled in. When it's unused, * its value is zero, so we assume the whole buffer transferred */ transferred = scsi_bufflen(scmd) - scsi_get_resid(scmd); /* This computation should always be done in terms of the * resolution of the device's medium. */ good_bytes = logical_to_bytes(sdev, bad_lba - start_lba); return min(good_bytes, transferred); } /** * sd_done - bottom half handler: called when the lower level * driver has completed (successfully or otherwise) a scsi command. * @SCpnt: mid-level's per command structure. * * Note: potentially run from within an ISR. Must not block. **/ static int sd_done(struct scsi_cmnd *SCpnt) { int result = SCpnt->result; unsigned int good_bytes = result ? 0 : scsi_bufflen(SCpnt); unsigned int sector_size = SCpnt->device->sector_size; unsigned int resid; struct scsi_sense_hdr sshdr; struct request *req = scsi_cmd_to_rq(SCpnt); struct scsi_disk *sdkp = scsi_disk(req->q->disk); int sense_valid = 0; int sense_deferred = 0; switch (req_op(req)) { case REQ_OP_DISCARD: case REQ_OP_WRITE_ZEROES: case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_RESET_ALL: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: if (!result) { good_bytes = blk_rq_bytes(req); scsi_set_resid(SCpnt, 0); } else { good_bytes = 0; scsi_set_resid(SCpnt, blk_rq_bytes(req)); } break; default: /* * In case of bogus fw or device, we could end up having * an unaligned partial completion. Check this here and force * alignment. */ resid = scsi_get_resid(SCpnt); if (resid & (sector_size - 1)) { sd_printk(KERN_INFO, sdkp, "Unaligned partial completion (resid=%u, sector_sz=%u)\n", resid, sector_size); scsi_print_command(SCpnt); resid = min(scsi_bufflen(SCpnt), round_up(resid, sector_size)); scsi_set_resid(SCpnt, resid); } } if (result) { sense_valid = scsi_command_normalize_sense(SCpnt, &sshdr); if (sense_valid) sense_deferred = scsi_sense_is_deferred(&sshdr); } sdkp->medium_access_timed_out = 0; if (!scsi_status_is_check_condition(result) && (!sense_valid || sense_deferred)) goto out; switch (sshdr.sense_key) { case HARDWARE_ERROR: case MEDIUM_ERROR: good_bytes = sd_completed_bytes(SCpnt); break; case RECOVERED_ERROR: good_bytes = scsi_bufflen(SCpnt); break; case NO_SENSE: /* This indicates a false check condition, so ignore it. An * unknown amount of data was transferred so treat it as an * error. */ SCpnt->result = 0; memset(SCpnt->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); break; case ABORTED_COMMAND: if (sshdr.asc == 0x10) /* DIF: Target detected corruption */ good_bytes = sd_completed_bytes(SCpnt); break; case ILLEGAL_REQUEST: switch (sshdr.asc) { case 0x10: /* DIX: Host detected corruption */ good_bytes = sd_completed_bytes(SCpnt); break; case 0x20: /* INVALID COMMAND OPCODE */ case 0x24: /* INVALID FIELD IN CDB */ switch (SCpnt->cmnd[0]) { case UNMAP: sd_disable_discard(sdkp); break; case WRITE_SAME_16: case WRITE_SAME: if (SCpnt->cmnd[1] & 8) { /* UNMAP */ sd_disable_discard(sdkp); } else { sd_disable_write_same(sdkp); req->rq_flags |= RQF_QUIET; } break; } } break; default: break; } out: if (sdkp->device->type == TYPE_ZBC) good_bytes = sd_zbc_complete(SCpnt, good_bytes, &sshdr); SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt, "sd_done: completed %d of %d bytes\n", good_bytes, scsi_bufflen(SCpnt))); return good_bytes; } /* * spinup disk - called only in sd_revalidate_disk() */ static void sd_spinup_disk(struct scsi_disk *sdkp) { static const u8 cmd[10] = { TEST_UNIT_READY }; unsigned long spintime_expire = 0; int spintime, sense_valid = 0; unsigned int the_result; struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { /* Do not retry Medium Not Present */ { .sense = UNIT_ATTENTION, .asc = 0x3A, .ascq = SCMD_FAILURE_ASCQ_ANY, .result = SAM_STAT_CHECK_CONDITION, }, { .sense = NOT_READY, .asc = 0x3A, .ascq = SCMD_FAILURE_ASCQ_ANY, .result = SAM_STAT_CHECK_CONDITION, }, /* Retry when scsi_status_is_good would return false 3 times */ { .result = SCMD_FAILURE_STAT_ANY, .allowed = 3, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .failures = &failures, }; spintime = 0; /* Spin up drives, as required. Only do this at boot time */ /* Spinup needs to be done for module loads too. */ do { bool media_was_present = sdkp->media_present; scsi_failures_reset_retries(&failures); the_result = scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN, NULL, 0, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (the_result > 0) { /* * If the drive has indicated to us that it doesn't * have any media in it, don't bother with any more * polling. */ if (media_not_present(sdkp, &sshdr)) { if (media_was_present) sd_printk(KERN_NOTICE, sdkp, "Media removed, stopped polling\n"); return; } sense_valid = scsi_sense_valid(&sshdr); } if (!scsi_status_is_check_condition(the_result)) { /* no sense, TUR either succeeded or failed * with a status error */ if(!spintime && !scsi_status_is_good(the_result)) { sd_print_result(sdkp, "Test Unit Ready failed", the_result); } break; } /* * The device does not want the automatic start to be issued. */ if (sdkp->device->no_start_on_add) break; if (sense_valid && sshdr.sense_key == NOT_READY) { if (sshdr.asc == 4 && sshdr.ascq == 3) break; /* manual intervention required */ if (sshdr.asc == 4 && sshdr.ascq == 0xb) break; /* standby */ if (sshdr.asc == 4 && sshdr.ascq == 0xc) break; /* unavailable */ if (sshdr.asc == 4 && sshdr.ascq == 0x1b) break; /* sanitize in progress */ if (sshdr.asc == 4 && sshdr.ascq == 0x24) break; /* depopulation in progress */ if (sshdr.asc == 4 && sshdr.ascq == 0x25) break; /* depopulation restoration in progress */ /* * Issue command to spin up drive when not ready */ if (!spintime) { /* Return immediately and start spin cycle */ const u8 start_cmd[10] = { [0] = START_STOP, [1] = 1, [4] = sdkp->device->start_stop_pwr_cond ? 0x11 : 1, }; sd_printk(KERN_NOTICE, sdkp, "Spinning up disk..."); scsi_execute_cmd(sdkp->device, start_cmd, REQ_OP_DRV_IN, NULL, 0, SD_TIMEOUT, sdkp->max_retries, &exec_args); spintime_expire = jiffies + 100 * HZ; spintime = 1; } /* Wait 1 second for next try */ msleep(1000); printk(KERN_CONT "."); /* * Wait for USB flash devices with slow firmware. * Yes, this sense key/ASC combination shouldn't * occur here. It's characteristic of these devices. */ } else if (sense_valid && sshdr.sense_key == UNIT_ATTENTION && sshdr.asc == 0x28) { if (!spintime) { spintime_expire = jiffies + 5 * HZ; spintime = 1; } /* Wait 1 second for next try */ msleep(1000); } else { /* we don't understand the sense code, so it's * probably pointless to loop */ if(!spintime) { sd_printk(KERN_NOTICE, sdkp, "Unit Not Ready\n"); sd_print_sense_hdr(sdkp, &sshdr); } break; } } while (spintime && time_before_eq(jiffies, spintime_expire)); if (spintime) { if (scsi_status_is_good(the_result)) printk(KERN_CONT "ready\n"); else printk(KERN_CONT "not responding...\n"); } } /* * Determine whether disk supports Data Integrity Field. */ static int sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdp = sdkp->device; u8 type; if (scsi_device_protection(sdp) == 0 || (buffer[12] & 1) == 0) { sdkp->protection_type = 0; return 0; } type = ((buffer[12] >> 1) & 7) + 1; /* P_TYPE 0 = Type 1 */ if (type > T10_PI_TYPE3_PROTECTION) { sd_printk(KERN_ERR, sdkp, "formatted with unsupported protection type %u. Disabling disk!\n", type); sdkp->protection_type = 0; return -ENODEV; } sdkp->protection_type = type; return 0; } static void sd_config_protection(struct scsi_disk *sdkp, struct queue_limits *lim) { struct scsi_device *sdp = sdkp->device; if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) sd_dif_config_host(sdkp, lim); if (!sdkp->protection_type) return; if (!scsi_host_dif_capable(sdp->host, sdkp->protection_type)) { sd_first_printk(KERN_NOTICE, sdkp, "Disabling DIF Type %u protection\n", sdkp->protection_type); sdkp->protection_type = 0; } sd_first_printk(KERN_NOTICE, sdkp, "Enabling DIF Type %u protection\n", sdkp->protection_type); } static void read_capacity_error(struct scsi_disk *sdkp, struct scsi_device *sdp, struct scsi_sense_hdr *sshdr, int sense_valid, int the_result) { if (sense_valid) sd_print_sense_hdr(sdkp, sshdr); else sd_printk(KERN_NOTICE, sdkp, "Sense not available.\n"); /* * Set dirty bit for removable devices if not ready - * sometimes drives will not report this properly. */ if (sdp->removable && sense_valid && sshdr->sense_key == NOT_READY) set_media_not_present(sdkp); /* * We used to set media_present to 0 here to indicate no media * in the drive, but some drives fail read capacity even with * media present, so we can't do that. */ sdkp->capacity = 0; /* unknown mapped to zero - as usual */ } #define RC16_LEN 32 #if RC16_LEN > SD_BUF_SIZE #error RC16_LEN must not be more than SD_BUF_SIZE #endif #define READ_CAPACITY_RETRIES_ON_RESET 10 static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, struct queue_limits *lim, unsigned char *buffer) { unsigned char cmd[16]; struct scsi_sense_hdr sshdr; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, }; int sense_valid = 0; int the_result; int retries = 3, reset_retries = READ_CAPACITY_RETRIES_ON_RESET; unsigned int alignment; unsigned long long lba; unsigned sector_size; if (sdp->no_read_capacity_16) return -EINVAL; do { memset(cmd, 0, 16); cmd[0] = SERVICE_ACTION_IN_16; cmd[1] = SAI_READ_CAPACITY_16; cmd[13] = RC16_LEN; memset(buffer, 0, RC16_LEN); the_result = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, buffer, RC16_LEN, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (the_result > 0) { if (media_not_present(sdkp, &sshdr)) return -ENODEV; sense_valid = scsi_sense_valid(&sshdr); if (sense_valid && sshdr.sense_key == ILLEGAL_REQUEST && (sshdr.asc == 0x20 || sshdr.asc == 0x24) && sshdr.ascq == 0x00) /* Invalid Command Operation Code or * Invalid Field in CDB, just retry * silently with RC10 */ return -EINVAL; if (sense_valid && sshdr.sense_key == UNIT_ATTENTION && sshdr.asc == 0x29 && sshdr.ascq == 0x00) /* Device reset might occur several times, * give it one more chance */ if (--reset_retries > 0) continue; } retries--; } while (the_result && retries); if (the_result) { sd_print_result(sdkp, "Read Capacity(16) failed", the_result); read_capacity_error(sdkp, sdp, &sshdr, sense_valid, the_result); return -EINVAL; } sector_size = get_unaligned_be32(&buffer[8]); lba = get_unaligned_be64(&buffer[0]); if (sd_read_protection_type(sdkp, buffer) < 0) { sdkp->capacity = 0; return -ENODEV; } /* Logical blocks per physical block exponent */ sdkp->physical_block_size = (1 << (buffer[13] & 0xf)) * sector_size; /* RC basis */ sdkp->rc_basis = (buffer[12] >> 4) & 0x3; /* Lowest aligned logical block */ alignment = ((buffer[14] & 0x3f) << 8 | buffer[15]) * sector_size; lim->alignment_offset = alignment; if (alignment && sdkp->first_scan) sd_printk(KERN_NOTICE, sdkp, "physical block alignment offset: %u\n", alignment); if (buffer[14] & 0x80) { /* LBPME */ sdkp->lbpme = 1; if (buffer[14] & 0x40) /* LBPRZ */ sdkp->lbprz = 1; } sdkp->capacity = lba + 1; return sector_size; } static int read_capacity_10(struct scsi_disk *sdkp, struct scsi_device *sdp, unsigned char *buffer) { static const u8 cmd[10] = { READ_CAPACITY }; struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { /* Do not retry Medium Not Present */ { .sense = UNIT_ATTENTION, .asc = 0x3A, .result = SAM_STAT_CHECK_CONDITION, }, { .sense = NOT_READY, .asc = 0x3A, .result = SAM_STAT_CHECK_CONDITION, }, /* Device reset might occur several times so retry a lot */ { .sense = UNIT_ATTENTION, .asc = 0x29, .allowed = READ_CAPACITY_RETRIES_ON_RESET, .result = SAM_STAT_CHECK_CONDITION, }, /* Any other error not listed above retry 3 times */ { .result = SCMD_FAILURE_RESULT_ANY, .allowed = 3, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .failures = &failures, }; int sense_valid = 0; int the_result; sector_t lba; unsigned sector_size; memset(buffer, 0, 8); the_result = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, buffer, 8, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (the_result > 0) { sense_valid = scsi_sense_valid(&sshdr); if (media_not_present(sdkp, &sshdr)) return -ENODEV; } if (the_result) { sd_print_result(sdkp, "Read Capacity(10) failed", the_result); read_capacity_error(sdkp, sdp, &sshdr, sense_valid, the_result); return -EINVAL; } sector_size = get_unaligned_be32(&buffer[4]); lba = get_unaligned_be32(&buffer[0]); if (sdp->no_read_capacity_16 && (lba == 0xffffffff)) { /* Some buggy (usb cardreader) devices return an lba of 0xffffffff when the want to report a size of 0 (with which they really mean no media is present) */ sdkp->capacity = 0; sdkp->physical_block_size = sector_size; return sector_size; } sdkp->capacity = lba + 1; sdkp->physical_block_size = sector_size; return sector_size; } static int sd_try_rc16_first(struct scsi_device *sdp) { if (sdp->host->max_cmd_len < 16) return 0; if (sdp->try_rc_10_first) return 0; if (sdp->scsi_level > SCSI_SPC_2) return 1; if (scsi_device_protection(sdp)) return 1; return 0; } /* * read disk capacity */ static void sd_read_capacity(struct scsi_disk *sdkp, struct queue_limits *lim, unsigned char *buffer) { int sector_size; struct scsi_device *sdp = sdkp->device; if (sd_try_rc16_first(sdp)) { sector_size = read_capacity_16(sdkp, sdp, lim, buffer); if (sector_size == -EOVERFLOW) goto got_data; if (sector_size == -ENODEV) return; if (sector_size < 0) sector_size = read_capacity_10(sdkp, sdp, buffer); if (sector_size < 0) return; } else { sector_size = read_capacity_10(sdkp, sdp, buffer); if (sector_size == -EOVERFLOW) goto got_data; if (sector_size < 0) return; if ((sizeof(sdkp->capacity) > 4) && (sdkp->capacity > 0xffffffffULL)) { int old_sector_size = sector_size; sd_printk(KERN_NOTICE, sdkp, "Very big device. Trying to use READ CAPACITY(16).\n"); sector_size = read_capacity_16(sdkp, sdp, lim, buffer); if (sector_size < 0) { sd_printk(KERN_NOTICE, sdkp, "Using 0xffffffff as device size\n"); sdkp->capacity = 1 + (sector_t) 0xffffffff; sector_size = old_sector_size; goto got_data; } /* Remember that READ CAPACITY(16) succeeded */ sdp->try_rc_10_first = 0; } } /* Some devices are known to return the total number of blocks, * not the highest block number. Some devices have versions * which do this and others which do not. Some devices we might * suspect of doing this but we don't know for certain. * * If we know the reported capacity is wrong, decrement it. If * we can only guess, then assume the number of blocks is even * (usually true but not always) and err on the side of lowering * the capacity. */ if (sdp->fix_capacity || (sdp->guess_capacity && (sdkp->capacity & 0x01))) { sd_printk(KERN_INFO, sdkp, "Adjusting the sector count from its reported value: %llu\n", (unsigned long long) sdkp->capacity); --sdkp->capacity; } got_data: if (sector_size == 0) { sector_size = 512; sd_printk(KERN_NOTICE, sdkp, "Sector size 0 reported, assuming 512.\n"); } if (sector_size != 512 && sector_size != 1024 && sector_size != 2048 && sector_size != 4096) { sd_printk(KERN_NOTICE, sdkp, "Unsupported sector size %d.\n", sector_size); /* * The user might want to re-format the drive with * a supported sectorsize. Once this happens, it * would be relatively trivial to set the thing up. * For this reason, we leave the thing in the table. */ sdkp->capacity = 0; /* * set a bogus sector size so the normal read/write * logic in the block layer will eventually refuse any * request on this device without tripping over power * of two sector size assumptions */ sector_size = 512; } lim->logical_block_size = sector_size; lim->physical_block_size = sdkp->physical_block_size; sdkp->device->sector_size = sector_size; if (sdkp->capacity > 0xffffffff) sdp->use_16_for_rw = 1; } /* * Print disk capacity */ static void sd_print_capacity(struct scsi_disk *sdkp, sector_t old_capacity) { int sector_size = sdkp->device->sector_size; char cap_str_2[10], cap_str_10[10]; if (!sdkp->first_scan && old_capacity == sdkp->capacity) return; string_get_size(sdkp->capacity, sector_size, STRING_UNITS_2, cap_str_2, sizeof(cap_str_2)); string_get_size(sdkp->capacity, sector_size, STRING_UNITS_10, cap_str_10, sizeof(cap_str_10)); sd_printk(KERN_NOTICE, sdkp, "%llu %d-byte logical blocks: (%s/%s)\n", (unsigned long long)sdkp->capacity, sector_size, cap_str_10, cap_str_2); if (sdkp->physical_block_size != sector_size) sd_printk(KERN_NOTICE, sdkp, "%u-byte physical blocks\n", sdkp->physical_block_size); } /* called with buffer of length 512 */ static inline int sd_do_mode_sense(struct scsi_disk *sdkp, int dbd, int modepage, unsigned char *buffer, int len, struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr) { /* * If we must use MODE SENSE(10), make sure that the buffer length * is at least 8 bytes so that the mode sense header fits. */ if (sdkp->device->use_10_for_ms && len < 8) len = 8; return scsi_mode_sense(sdkp->device, dbd, modepage, 0, buffer, len, SD_TIMEOUT, sdkp->max_retries, data, sshdr); } /* * read write protect setting, if possible - called only in sd_revalidate_disk() * called with buffer of length SD_BUF_SIZE */ static void sd_read_write_protect_flag(struct scsi_disk *sdkp, unsigned char *buffer) { int res; struct scsi_device *sdp = sdkp->device; struct scsi_mode_data data; int old_wp = sdkp->write_prot; set_disk_ro(sdkp->disk, 0); if (sdp->skip_ms_page_3f) { sd_first_printk(KERN_NOTICE, sdkp, "Assuming Write Enabled\n"); return; } if (sdp->use_192_bytes_for_3f) { res = sd_do_mode_sense(sdkp, 0, 0x3F, buffer, 192, &data, NULL); } else { /* * First attempt: ask for all pages (0x3F), but only 4 bytes. * We have to start carefully: some devices hang if we ask * for more than is available. */ res = sd_do_mode_sense(sdkp, 0, 0x3F, buffer, 4, &data, NULL); /* * Second attempt: ask for page 0 When only page 0 is * implemented, a request for page 3F may return Sense Key * 5: Illegal Request, Sense Code 24: Invalid field in * CDB. */ if (res < 0) res = sd_do_mode_sense(sdkp, 0, 0, buffer, 4, &data, NULL); /* * Third attempt: ask 255 bytes, as we did earlier. */ if (res < 0) res = sd_do_mode_sense(sdkp, 0, 0x3F, buffer, 255, &data, NULL); } if (res < 0) { sd_first_printk(KERN_WARNING, sdkp, "Test WP failed, assume Write Enabled\n"); } else { sdkp->write_prot = ((data.device_specific & 0x80) != 0); set_disk_ro(sdkp->disk, sdkp->write_prot); if (sdkp->first_scan || old_wp != sdkp->write_prot) { sd_printk(KERN_NOTICE, sdkp, "Write Protect is %s\n", sdkp->write_prot ? "on" : "off"); sd_printk(KERN_DEBUG, sdkp, "Mode Sense: %4ph\n", buffer); } } } /* * sd_read_cache_type - called only from sd_revalidate_disk() * called with buffer of length SD_BUF_SIZE */ static void sd_read_cache_type(struct scsi_disk *sdkp, unsigned char *buffer) { int len = 0, res; struct scsi_device *sdp = sdkp->device; int dbd; int modepage; int first_len; struct scsi_mode_data data; struct scsi_sense_hdr sshdr; int old_wce = sdkp->WCE; int old_rcd = sdkp->RCD; int old_dpofua = sdkp->DPOFUA; if (sdkp->cache_override) return; first_len = 4; if (sdp->skip_ms_page_8) { if (sdp->type == TYPE_RBC) goto defaults; else { if (sdp->skip_ms_page_3f) goto defaults; modepage = 0x3F; if (sdp->use_192_bytes_for_3f) first_len = 192; dbd = 0; } } else if (sdp->type == TYPE_RBC) { modepage = 6; dbd = 8; } else { modepage = 8; dbd = 0; } /* cautiously ask */ res = sd_do_mode_sense(sdkp, dbd, modepage, buffer, first_len, &data, &sshdr); if (res < 0) goto bad_sense; if (!data.header_length) { modepage = 6; first_len = 0; sd_first_printk(KERN_ERR, sdkp, "Missing header in MODE_SENSE response\n"); } /* that went OK, now ask for the proper length */ len = data.length; /* * We're only interested in the first three bytes, actually. * But the data cache page is defined for the first 20. */ if (len < 3) goto bad_sense; else if (len > SD_BUF_SIZE) { sd_first_printk(KERN_NOTICE, sdkp, "Truncating mode parameter data from %d to %d bytes\n", len, SD_BUF_SIZE); len = SD_BUF_SIZE; } if (modepage == 0x3F && sdp->use_192_bytes_for_3f) len = 192; /* Get the data */ if (len > first_len) res = sd_do_mode_sense(sdkp, dbd, modepage, buffer, len, &data, &sshdr); if (!res) { int offset = data.header_length + data.block_descriptor_length; while (offset < len) { u8 page_code = buffer[offset] & 0x3F; u8 spf = buffer[offset] & 0x40; if (page_code == 8 || page_code == 6) { /* We're interested only in the first 3 bytes. */ if (len - offset <= 2) { sd_first_printk(KERN_ERR, sdkp, "Incomplete mode parameter data\n"); goto defaults; } else { modepage = page_code; goto Page_found; } } else { /* Go to the next page */ if (spf && len - offset > 3) offset += 4 + (buffer[offset+2] << 8) + buffer[offset+3]; else if (!spf && len - offset > 1) offset += 2 + buffer[offset+1]; else { sd_first_printk(KERN_ERR, sdkp, "Incomplete mode parameter data\n"); goto defaults; } } } sd_first_printk(KERN_WARNING, sdkp, "No Caching mode page found\n"); goto defaults; Page_found: if (modepage == 8) { sdkp->WCE = ((buffer[offset + 2] & 0x04) != 0); sdkp->RCD = ((buffer[offset + 2] & 0x01) != 0); } else { sdkp->WCE = ((buffer[offset + 2] & 0x01) == 0); sdkp->RCD = 0; } sdkp->DPOFUA = (data.device_specific & 0x10) != 0; if (sdp->broken_fua) { sd_first_printk(KERN_NOTICE, sdkp, "Disabling FUA\n"); sdkp->DPOFUA = 0; } else if (sdkp->DPOFUA && !sdkp->device->use_10_for_rw && !sdkp->device->use_16_for_rw) { sd_first_printk(KERN_NOTICE, sdkp, "Uses READ/WRITE(6), disabling FUA\n"); sdkp->DPOFUA = 0; } /* No cache flush allowed for write protected devices */ if (sdkp->WCE && sdkp->write_prot) sdkp->WCE = 0; if (sdkp->first_scan || old_wce != sdkp->WCE || old_rcd != sdkp->RCD || old_dpofua != sdkp->DPOFUA) sd_printk(KERN_NOTICE, sdkp, "Write cache: %s, read cache: %s, %s\n", sdkp->WCE ? "enabled" : "disabled", sdkp->RCD ? "disabled" : "enabled", sdkp->DPOFUA ? "supports DPO and FUA" : "doesn't support DPO or FUA"); return; } bad_sense: if (res == -EIO && scsi_sense_valid(&sshdr) && sshdr.sense_key == ILLEGAL_REQUEST && sshdr.asc == 0x24 && sshdr.ascq == 0x0) /* Invalid field in CDB */ sd_first_printk(KERN_NOTICE, sdkp, "Cache data unavailable\n"); else sd_first_printk(KERN_ERR, sdkp, "Asking for cache data failed\n"); defaults: if (sdp->wce_default_on) { sd_first_printk(KERN_NOTICE, sdkp, "Assuming drive cache: write back\n"); sdkp->WCE = 1; } else { sd_first_printk(KERN_WARNING, sdkp, "Assuming drive cache: write through\n"); sdkp->WCE = 0; } sdkp->RCD = 0; sdkp->DPOFUA = 0; } static bool sd_is_perm_stream(struct scsi_disk *sdkp, unsigned int stream_id) { u8 cdb[16] = { SERVICE_ACTION_IN_16, SAI_GET_STREAM_STATUS }; struct { struct scsi_stream_status_header h; struct scsi_stream_status s; } buf; struct scsi_device *sdev = sdkp->device; struct scsi_sense_hdr sshdr; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, }; int res; put_unaligned_be16(stream_id, &cdb[4]); put_unaligned_be32(sizeof(buf), &cdb[10]); res = scsi_execute_cmd(sdev, cdb, REQ_OP_DRV_IN, &buf, sizeof(buf), SD_TIMEOUT, sdkp->max_retries, &exec_args); if (res < 0) return false; if (scsi_status_is_check_condition(res) && scsi_sense_valid(&sshdr)) sd_print_sense_hdr(sdkp, &sshdr); if (res) return false; if (get_unaligned_be32(&buf.h.len) < sizeof(struct scsi_stream_status)) return false; return buf.s.perm; } static void sd_read_io_hints(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdp = sdkp->device; const struct scsi_io_group_descriptor *desc, *start, *end; u16 permanent_stream_count_old; struct scsi_sense_hdr sshdr; struct scsi_mode_data data; int res; if (sdp->sdev_bflags & BLIST_SKIP_IO_HINTS) return; res = scsi_mode_sense(sdp, /*dbd=*/0x8, /*modepage=*/0x0a, /*subpage=*/0x05, buffer, SD_BUF_SIZE, SD_TIMEOUT, sdkp->max_retries, &data, &sshdr); if (res < 0) return; start = (void *)buffer + data.header_length + 16; end = (void *)buffer + ALIGN_DOWN(data.header_length + data.length, sizeof(*end)); /* * From "SBC-5 Constrained Streams with Data Lifetimes": Device severs * should assign the lowest numbered stream identifiers to permanent * streams. */ for (desc = start; desc < end; desc++) if (!desc->st_enble || !sd_is_perm_stream(sdkp, desc - start)) break; permanent_stream_count_old = sdkp->permanent_stream_count; sdkp->permanent_stream_count = desc - start; if (sdkp->rscs && sdkp->permanent_stream_count < 2) sd_printk(KERN_INFO, sdkp, "Unexpected: RSCS has been set and the permanent stream count is %u\n", sdkp->permanent_stream_count); else if (sdkp->permanent_stream_count != permanent_stream_count_old) sd_printk(KERN_INFO, sdkp, "permanent stream count = %d\n", sdkp->permanent_stream_count); } /* * The ATO bit indicates whether the DIF application tag is available * for use by the operating system. */ static void sd_read_app_tag_own(struct scsi_disk *sdkp, unsigned char *buffer) { int res, offset; struct scsi_device *sdp = sdkp->device; struct scsi_mode_data data; struct scsi_sense_hdr sshdr; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) return; if (sdkp->protection_type == 0) return; res = scsi_mode_sense(sdp, 1, 0x0a, 0, buffer, 36, SD_TIMEOUT, sdkp->max_retries, &data, &sshdr); if (res < 0 || !data.header_length || data.length < 6) { sd_first_printk(KERN_WARNING, sdkp, "getting Control mode page failed, assume no ATO\n"); if (res == -EIO && scsi_sense_valid(&sshdr)) sd_print_sense_hdr(sdkp, &sshdr); return; } offset = data.header_length + data.block_descriptor_length; if ((buffer[offset] & 0x3f) != 0x0a) { sd_first_printk(KERN_ERR, sdkp, "ATO Got wrong page\n"); return; } if ((buffer[offset + 5] & 0x80) == 0) return; sdkp->ATO = 1; return; } static unsigned int sd_discard_mode(struct scsi_disk *sdkp) { if (!sdkp->lbpme) return SD_LBP_FULL; if (!sdkp->lbpvpd) { /* LBP VPD page not provided */ if (sdkp->max_unmap_blocks) return SD_LBP_UNMAP; return SD_LBP_WS16; } /* LBP VPD page tells us what to use */ if (sdkp->lbpu && sdkp->max_unmap_blocks) return SD_LBP_UNMAP; if (sdkp->lbpws) return SD_LBP_WS16; if (sdkp->lbpws10) return SD_LBP_WS10; return SD_LBP_DISABLE; } /* * Query disk device for preferred I/O sizes. */ static void sd_read_block_limits(struct scsi_disk *sdkp, struct queue_limits *lim) { struct scsi_vpd *vpd; rcu_read_lock(); vpd = rcu_dereference(sdkp->device->vpd_pgb0); if (!vpd || vpd->len < 16) goto out; sdkp->min_xfer_blocks = get_unaligned_be16(&vpd->data[6]); sdkp->max_xfer_blocks = get_unaligned_be32(&vpd->data[8]); sdkp->opt_xfer_blocks = get_unaligned_be32(&vpd->data[12]); if (vpd->len >= 64) { unsigned int lba_count, desc_count; sdkp->max_ws_blocks = (u32)get_unaligned_be64(&vpd->data[36]); if (!sdkp->lbpme) goto config_atomic; lba_count = get_unaligned_be32(&vpd->data[20]); desc_count = get_unaligned_be32(&vpd->data[24]); if (lba_count && desc_count) sdkp->max_unmap_blocks = lba_count; sdkp->unmap_granularity = get_unaligned_be32(&vpd->data[28]); if (vpd->data[32] & 0x80) sdkp->unmap_alignment = get_unaligned_be32(&vpd->data[32]) & ~(1 << 31); config_atomic: sdkp->max_atomic = get_unaligned_be32(&vpd->data[44]); sdkp->atomic_alignment = get_unaligned_be32(&vpd->data[48]); sdkp->atomic_granularity = get_unaligned_be32(&vpd->data[52]); sdkp->max_atomic_with_boundary = get_unaligned_be32(&vpd->data[56]); sdkp->max_atomic_boundary = get_unaligned_be32(&vpd->data[60]); sd_config_atomic(sdkp, lim); } out: rcu_read_unlock(); } /* Parse the Block Limits Extension VPD page (0xb7) */ static void sd_read_block_limits_ext(struct scsi_disk *sdkp) { struct scsi_vpd *vpd; rcu_read_lock(); vpd = rcu_dereference(sdkp->device->vpd_pgb7); if (vpd && vpd->len >= 6) sdkp->rscs = vpd->data[5] & 1; rcu_read_unlock(); } /* Query block device characteristics */ static void sd_read_block_characteristics(struct scsi_disk *sdkp, struct queue_limits *lim) { struct scsi_vpd *vpd; u16 rot; rcu_read_lock(); vpd = rcu_dereference(sdkp->device->vpd_pgb1); if (!vpd || vpd->len <= 8) { rcu_read_unlock(); return; } rot = get_unaligned_be16(&vpd->data[4]); sdkp->zoned = (vpd->data[8] >> 4) & 3; rcu_read_unlock(); if (rot == 1) lim->features &= ~(BLK_FEAT_ROTATIONAL | BLK_FEAT_ADD_RANDOM); if (!sdkp->first_scan) return; if (sdkp->device->type == TYPE_ZBC) sd_printk(KERN_NOTICE, sdkp, "Host-managed zoned block device\n"); else if (sdkp->zoned == 1) sd_printk(KERN_NOTICE, sdkp, "Host-aware SMR disk used as regular disk\n"); else if (sdkp->zoned == 2) sd_printk(KERN_NOTICE, sdkp, "Drive-managed SMR disk\n"); } /** * sd_read_block_provisioning - Query provisioning VPD page * @sdkp: disk to query */ static void sd_read_block_provisioning(struct scsi_disk *sdkp) { struct scsi_vpd *vpd; if (sdkp->lbpme == 0) return; rcu_read_lock(); vpd = rcu_dereference(sdkp->device->vpd_pgb2); if (!vpd || vpd->len < 8) { rcu_read_unlock(); return; } sdkp->lbpvpd = 1; sdkp->lbpu = (vpd->data[5] >> 7) & 1; /* UNMAP */ sdkp->lbpws = (vpd->data[5] >> 6) & 1; /* WRITE SAME(16) w/ UNMAP */ sdkp->lbpws10 = (vpd->data[5] >> 5) & 1; /* WRITE SAME(10) w/ UNMAP */ rcu_read_unlock(); } static void sd_read_write_same(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdev = sdkp->device; if (sdev->host->no_write_same) { sdev->no_write_same = 1; return; } if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, INQUIRY, 0) < 0) { sdev->no_report_opcodes = 1; /* * Disable WRITE SAME if REPORT SUPPORTED OPERATION CODES is * unsupported and this is an ATA device. */ if (sdev->is_ata) sdev->no_write_same = 1; } if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, WRITE_SAME_16, 0) == 1) sdkp->ws16 = 1; if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, WRITE_SAME, 0) == 1) sdkp->ws10 = 1; } static void sd_read_security(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdev = sdkp->device; if (!sdev->security_supported) return; if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, SECURITY_PROTOCOL_IN, 0) == 1 && scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, SECURITY_PROTOCOL_OUT, 0) == 1) sdkp->security = 1; } static inline sector_t sd64_to_sectors(struct scsi_disk *sdkp, u8 *buf) { return logical_to_sectors(sdkp->device, get_unaligned_be64(buf)); } /** * sd_read_cpr - Query concurrent positioning ranges * @sdkp: disk to query */ static void sd_read_cpr(struct scsi_disk *sdkp) { struct blk_independent_access_ranges *iars = NULL; unsigned char *buffer = NULL; unsigned int nr_cpr = 0; int i, vpd_len, buf_len = SD_BUF_SIZE; u8 *desc; /* * We need to have the capacity set first for the block layer to be * able to check the ranges. */ if (sdkp->first_scan) return; if (!sdkp->capacity) goto out; /* * Concurrent Positioning Ranges VPD: there can be at most 256 ranges, * leading to a maximum page size of 64 + 256*32 bytes. */ buf_len = 64 + 256*32; buffer = kmalloc(buf_len, GFP_KERNEL); if (!buffer || scsi_get_vpd_page(sdkp->device, 0xb9, buffer, buf_len)) goto out; /* We must have at least a 64B header and one 32B range descriptor */ vpd_len = get_unaligned_be16(&buffer[2]) + 4; if (vpd_len > buf_len || vpd_len < 64 + 32 || (vpd_len & 31)) { sd_printk(KERN_ERR, sdkp, "Invalid Concurrent Positioning Ranges VPD page\n"); goto out; } nr_cpr = (vpd_len - 64) / 32; if (nr_cpr == 1) { nr_cpr = 0; goto out; } iars = disk_alloc_independent_access_ranges(sdkp->disk, nr_cpr); if (!iars) { nr_cpr = 0; goto out; } desc = &buffer[64]; for (i = 0; i < nr_cpr; i++, desc += 32) { if (desc[0] != i) { sd_printk(KERN_ERR, sdkp, "Invalid Concurrent Positioning Range number\n"); nr_cpr = 0; break; } iars->ia_range[i].sector = sd64_to_sectors(sdkp, desc + 8); iars->ia_range[i].nr_sectors = sd64_to_sectors(sdkp, desc + 16); } out: disk_set_independent_access_ranges(sdkp->disk, iars); if (nr_cpr && sdkp->nr_actuators != nr_cpr) { sd_printk(KERN_NOTICE, sdkp, "%u concurrent positioning ranges\n", nr_cpr); sdkp->nr_actuators = nr_cpr; } kfree(buffer); } static bool sd_validate_min_xfer_size(struct scsi_disk *sdkp) { struct scsi_device *sdp = sdkp->device; unsigned int min_xfer_bytes = logical_to_bytes(sdp, sdkp->min_xfer_blocks); if (sdkp->min_xfer_blocks == 0) return false; if (min_xfer_bytes & (sdkp->physical_block_size - 1)) { sd_first_printk(KERN_WARNING, sdkp, "Preferred minimum I/O size %u bytes not a multiple of physical block size (%u bytes)\n", min_xfer_bytes, sdkp->physical_block_size); sdkp->min_xfer_blocks = 0; return false; } sd_first_printk(KERN_INFO, sdkp, "Preferred minimum I/O size %u bytes\n", min_xfer_bytes); return true; } /* * Determine the device's preferred I/O size for reads and writes * unless the reported value is unreasonably small, large, not a * multiple of the physical block size, or simply garbage. */ static bool sd_validate_opt_xfer_size(struct scsi_disk *sdkp, unsigned int dev_max) { struct scsi_device *sdp = sdkp->device; unsigned int opt_xfer_bytes = logical_to_bytes(sdp, sdkp->opt_xfer_blocks); unsigned int min_xfer_bytes = logical_to_bytes(sdp, sdkp->min_xfer_blocks); if (sdkp->opt_xfer_blocks == 0) return false; if (sdkp->opt_xfer_blocks > dev_max) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u logical blocks > dev_max (%u logical blocks)\n", sdkp->opt_xfer_blocks, dev_max); return false; } if (sdkp->opt_xfer_blocks > SD_DEF_XFER_BLOCKS) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u logical blocks > sd driver limit (%u logical blocks)\n", sdkp->opt_xfer_blocks, SD_DEF_XFER_BLOCKS); return false; } if (opt_xfer_bytes < PAGE_SIZE) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u bytes < PAGE_SIZE (%u bytes)\n", opt_xfer_bytes, (unsigned int)PAGE_SIZE); return false; } if (min_xfer_bytes && opt_xfer_bytes % min_xfer_bytes) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u bytes not a multiple of preferred minimum block size (%u bytes)\n", opt_xfer_bytes, min_xfer_bytes); return false; } if (opt_xfer_bytes & (sdkp->physical_block_size - 1)) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u bytes not a multiple of physical block size (%u bytes)\n", opt_xfer_bytes, sdkp->physical_block_size); return false; } sd_first_printk(KERN_INFO, sdkp, "Optimal transfer size %u bytes\n", opt_xfer_bytes); return true; } static void sd_read_block_zero(struct scsi_disk *sdkp) { struct scsi_device *sdev = sdkp->device; unsigned int buf_len = sdev->sector_size; u8 *buffer, cmd[16] = { }; buffer = kmalloc(buf_len, GFP_KERNEL); if (!buffer) return; if (sdev->use_16_for_rw) { cmd[0] = READ_16; put_unaligned_be64(0, &cmd[2]); /* Logical block address 0 */ put_unaligned_be32(1, &cmd[10]);/* Transfer 1 logical block */ } else { cmd[0] = READ_10; put_unaligned_be32(0, &cmd[2]); /* Logical block address 0 */ put_unaligned_be16(1, &cmd[7]); /* Transfer 1 logical block */ } scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN, buffer, buf_len, SD_TIMEOUT, sdkp->max_retries, NULL); kfree(buffer); } /** * sd_revalidate_disk - called the first time a new disk is seen, * performs disk spin up, read_capacity, etc. * @disk: struct gendisk we care about **/ static void sd_revalidate_disk(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; sector_t old_capacity = sdkp->capacity; struct queue_limits *lim = NULL; unsigned char *buffer = NULL; unsigned int dev_max; int err; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_revalidate_disk\n")); /* * If the device is offline, don't try and read capacity or any * of the other niceties. */ if (!scsi_device_online(sdp)) return; lim = kmalloc_obj(*lim); if (!lim) return; buffer = kmalloc(SD_BUF_SIZE, GFP_KERNEL); if (!buffer) goto out; sd_spinup_disk(sdkp); *lim = queue_limits_start_update(sdkp->disk->queue); /* * Without media there is no reason to ask; moreover, some devices * react badly if we do. */ if (sdkp->media_present) { sd_read_capacity(sdkp, lim, buffer); /* * Some USB/UAS devices return generic values for mode pages * until the media has been accessed. Trigger a READ operation * to force the device to populate mode pages. */ if (sdp->read_before_ms) sd_read_block_zero(sdkp); /* * set the default to rotational. All non-rotational devices * support the block characteristics VPD page, which will * cause this to be updated correctly and any device which * doesn't support it should be treated as rotational. */ lim->features |= (BLK_FEAT_ROTATIONAL | BLK_FEAT_ADD_RANDOM); if (scsi_device_supports_vpd(sdp)) { sd_read_block_provisioning(sdkp); sd_read_block_limits(sdkp, lim); sd_read_block_limits_ext(sdkp); sd_read_block_characteristics(sdkp, lim); sd_zbc_read_zones(sdkp, lim, buffer); } sd_config_discard(sdkp, lim, sd_discard_mode(sdkp)); sd_print_capacity(sdkp, old_capacity); sd_read_write_protect_flag(sdkp, buffer); sd_read_cache_type(sdkp, buffer); sd_read_io_hints(sdkp, buffer); sd_read_app_tag_own(sdkp, buffer); sd_read_write_same(sdkp, buffer); sd_read_security(sdkp, buffer); sd_config_protection(sdkp, lim); } /* * We now have all cache related info, determine how we deal * with flush requests. */ sd_set_flush_flag(sdkp, lim); /* Initial block count limit based on CDB TRANSFER LENGTH field size. */ dev_max = sdp->use_16_for_rw ? SD_MAX_XFER_BLOCKS : SD_DEF_XFER_BLOCKS; /* Some devices report a maximum block count for READ/WRITE requests. */ dev_max = min_not_zero(dev_max, sdkp->max_xfer_blocks); lim->max_dev_sectors = logical_to_sectors(sdp, dev_max); if (sd_validate_min_xfer_size(sdkp)) lim->io_min = logical_to_bytes(sdp, sdkp->min_xfer_blocks); else lim->io_min = 0; /* * Limit default to SCSI host optimal sector limit if set. There may be * an impact on performance for when the size of a request exceeds this * host limit. */ lim->io_opt = sdp->host->opt_sectors << SECTOR_SHIFT; if (sd_validate_opt_xfer_size(sdkp, dev_max)) { lim->io_opt = min_not_zero(lim->io_opt, logical_to_bytes(sdp, sdkp->opt_xfer_blocks)); } sdkp->first_scan = 0; set_capacity_and_notify(disk, logical_to_sectors(sdp, sdkp->capacity)); sd_config_write_same(sdkp, lim); err = queue_limits_commit_update_frozen(sdkp->disk->queue, lim); if (err) goto out; /* * Query concurrent positioning ranges after * queue_limits_commit_update() unlocked q->limits_lock to avoid * deadlock with q->sysfs_dir_lock and q->sysfs_lock. */ if (sdkp->media_present && scsi_device_supports_vpd(sdp)) sd_read_cpr(sdkp); /* * For a zoned drive, revalidating the zones can be done only once * the gendisk capacity is set. So if this fails, set back the gendisk * capacity to 0. */ if (sd_zbc_revalidate_zones(sdkp)) set_capacity_and_notify(disk, 0); out: kfree(buffer); kfree(lim); } /** * sd_unlock_native_capacity - unlock native capacity * @disk: struct gendisk to set capacity for * * Block layer calls this function if it detects that partitions * on @disk reach beyond the end of the device. If the SCSI host * implements ->unlock_native_capacity() method, it's invoked to * give it a chance to adjust the device capacity. * * CONTEXT: * Defined by block layer. Might sleep. */ static void sd_unlock_native_capacity(struct gendisk *disk) { struct scsi_device *sdev = scsi_disk(disk)->device; if (sdev->host->hostt->unlock_native_capacity) sdev->host->hostt->unlock_native_capacity(sdev); } static const struct block_device_operations sd_fops = { .owner = THIS_MODULE, .open = sd_open, .release = sd_release, .ioctl = sd_ioctl, .getgeo = sd_getgeo, .compat_ioctl = blkdev_compat_ptr_ioctl, .check_events = sd_check_events, .unlock_native_capacity = sd_unlock_native_capacity, .report_zones = sd_zbc_report_zones, .get_unique_id = sd_get_unique_id, .free_disk = scsi_disk_free_disk, .pr_ops = &sd_pr_ops, }; /** * sd_format_disk_name - format disk name * @prefix: name prefix - ie. "sd" for SCSI disks * @index: index of the disk to format name for * @buf: output buffer * @buflen: length of the output buffer * * SCSI disk names starts at sda. The 26th device is sdz and the * 27th is sdaa. The last one for two lettered suffix is sdzz * which is followed by sdaaa. * * This is basically 26 base counting with one extra 'nil' entry * at the beginning from the second digit on and can be * determined using similar method as 26 base conversion with the * index shifted -1 after each digit is computed. * * CONTEXT: * Don't care. * * RETURNS: * 0 on success, -errno on failure. */ static int sd_format_disk_name(char *prefix, int index, char *buf, int buflen) { const int base = 'z' - 'a' + 1; char *begin = buf + strlen(prefix); char *end = buf + buflen; char *p; int unit; p = end - 1; *p = '\0'; unit = base; do { if (p == begin) return -EINVAL; *--p = 'a' + (index % unit); index = (index / unit) - 1; } while (index >= 0); memmove(begin, p, end - p); memcpy(buf, prefix, strlen(prefix)); return 0; } /** * sd_probe - called during driver initialization and whenever a * new scsi device is attached to the system. It is called once * for each scsi device (not just disks) present. * @sdp: pointer to device object * * Returns 0 if successful (or not interested in this scsi device * (e.g. scanner)); 1 when there is an error. * * Note: this function is invoked from the scsi mid-level. * This function sets up the mapping between a given * <host,channel,id,lun> (found in sdp) and new device name * (e.g. /dev/sda). More precisely it is the block device major * and minor number that is chosen here. * * Assume sd_probe is not re-entrant (for time being) * Also think about sd_probe() and sd_remove() running coincidentally. **/ static int sd_probe(struct scsi_device *sdp) { struct device *dev = &sdp->sdev_gendev; struct scsi_disk *sdkp; struct gendisk *gd; int index; int error; scsi_autopm_get_device(sdp); error = -ENODEV; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC && sdp->type != TYPE_MOD && sdp->type != TYPE_RBC) goto out; if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) && sdp->type == TYPE_ZBC) { sdev_printk(KERN_WARNING, sdp, "Unsupported ZBC host-managed device.\n"); goto out; } SCSI_LOG_HLQUEUE(3, sdev_printk(KERN_INFO, sdp, "sd_probe\n")); error = -ENOMEM; sdkp = kzalloc_obj(*sdkp); if (!sdkp) goto out; gd = blk_mq_alloc_disk_for_queue(sdp->request_queue, &sd_bio_compl_lkclass); if (!gd) goto out_free; index = ida_alloc(&sd_index_ida, GFP_KERNEL); if (index < 0) { sdev_printk(KERN_WARNING, sdp, "sd_probe: memory exhausted.\n"); goto out_put; } error = sd_format_disk_name("sd", index, gd->disk_name, DISK_NAME_LEN); if (error) { sdev_printk(KERN_WARNING, sdp, "SCSI disk (sd) name length exceeded.\n"); goto out_free_index; } sdkp->device = sdp; sdkp->disk = gd; sdkp->index = index; sdkp->max_retries = SD_MAX_RETRIES; atomic_set(&sdkp->openers, 0); atomic_set(&sdkp->device->ioerr_cnt, 0); if (!sdp->request_queue->rq_timeout) { if (sdp->type != TYPE_MOD) blk_queue_rq_timeout(sdp->request_queue, SD_TIMEOUT); else blk_queue_rq_timeout(sdp->request_queue, SD_MOD_TIMEOUT); } device_initialize(&sdkp->disk_dev); sdkp->disk_dev.parent = get_device(dev); sdkp->disk_dev.class = &sd_disk_class; dev_set_name(&sdkp->disk_dev, "%s", dev_name(dev)); error = device_add(&sdkp->disk_dev); if (error) { put_device(&sdkp->disk_dev); goto out; } dev_set_drvdata(dev, sdkp); gd->major = sd_major((index & 0xf0) >> 4); gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00); gd->minors = SD_MINORS; gd->fops = &sd_fops; gd->private_data = sdkp; /* defaults, until the device tells us otherwise */ sdp->sector_size = 512; sdkp->capacity = 0; sdkp->media_present = 1; sdkp->write_prot = 0; sdkp->cache_override = 0; sdkp->WCE = 0; sdkp->RCD = 0; sdkp->ATO = 0; sdkp->first_scan = 1; sdkp->max_medium_access_timeouts = SD_MAX_MEDIUM_TIMEOUTS; sd_revalidate_disk(gd); if (sdp->removable) { gd->flags |= GENHD_FL_REMOVABLE; gd->events |= DISK_EVENT_MEDIA_CHANGE; gd->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT; } blk_pm_runtime_init(sdp->request_queue, dev); if (sdp->rpm_autosuspend) { pm_runtime_set_autosuspend_delay(dev, sdp->host->rpm_autosuspend_delay); } error = device_add_disk(dev, gd, NULL); if (error) { device_unregister(&sdkp->disk_dev); put_disk(gd); goto out; } if (sdkp->security) { sdkp->opal_dev = init_opal_dev(sdkp, &sd_sec_submit); if (sdkp->opal_dev) sd_printk(KERN_NOTICE, sdkp, "supports TCG Opal\n"); } sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n", sdp->removable ? "removable " : ""); scsi_autopm_put_device(sdp); return 0; out_free_index: ida_free(&sd_index_ida, index); out_put: put_disk(gd); out_free: kfree(sdkp); out: scsi_autopm_put_device(sdp); return error; } static int sd_start_stop_device(struct scsi_disk *sdkp, int start) { unsigned char cmd[6] = { START_STOP }; /* START_VALID */ struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { { /* Power on, reset, or bus device reset occurred */ .sense = UNIT_ATTENTION, .asc = 0x29, .ascq = 0, .result = SAM_STAT_CHECK_CONDITION, }, { /* Power on occurred */ .sense = UNIT_ATTENTION, .asc = 0x29, .ascq = 1, .result = SAM_STAT_CHECK_CONDITION, }, { /* SCSI bus reset */ .sense = UNIT_ATTENTION, .asc = 0x29, .ascq = 2, .result = SAM_STAT_CHECK_CONDITION, }, {} }; struct scsi_failures failures = { .total_allowed = 3, .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .req_flags = BLK_MQ_REQ_PM, .failures = &failures, }; struct scsi_device *sdp = sdkp->device; int res; if (start) cmd[4] |= 1; /* START */ if (sdp->start_stop_pwr_cond) cmd[4] |= start ? 1 << 4 : 3 << 4; /* Active or Standby */ if (!scsi_device_online(sdp)) return -ENODEV; res = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, NULL, 0, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (res) { sd_print_result(sdkp, "Start/Stop Unit failed", res); if (res > 0 && scsi_sense_valid(&sshdr)) { sd_print_sense_hdr(sdkp, &sshdr); /* 0x3a is medium not present */ if (sshdr.asc == 0x3a) res = 0; } } /* SCSI error codes must not go to the generic layer */ if (res) return -EIO; return 0; } /* * Send a SYNCHRONIZE CACHE instruction down to the device through * the normal SCSI command structure. Wait for the command to * complete. */ static void sd_shutdown(struct scsi_device *sdp) { struct device *dev = &sdp->sdev_gendev; struct scsi_disk *sdkp = dev_get_drvdata(dev); if (!sdkp) return; /* this can happen */ if (pm_runtime_suspended(dev)) return; if (sdkp->WCE && sdkp->media_present) { sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); sd_sync_cache(sdkp); } if ((system_state != SYSTEM_RESTART && sdkp->device->manage_system_start_stop) || (system_state == SYSTEM_POWER_OFF && sdkp->device->manage_shutdown) || (system_state == SYSTEM_RUNNING && sdkp->device->manage_runtime_start_stop) || (system_state == SYSTEM_RESTART && sdkp->device->manage_restart)) { sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n"); sd_start_stop_device(sdkp, 0); } } /** * sd_remove - called whenever a scsi disk (previously recognized by * sd_probe) is detached from the system. It is called (potentially * multiple times) during sd module unload. * @sdp: pointer to device object * * Note: this function is invoked from the scsi mid-level. * This function potentially frees up a device name (e.g. /dev/sdc) * that could be re-used by a subsequent sd_probe(). * This function is not called when the built-in sd driver is "exit-ed". **/ static void sd_remove(struct scsi_device *sdp) { struct device *dev = &sdp->sdev_gendev; struct scsi_disk *sdkp = dev_get_drvdata(dev); scsi_autopm_get_device(sdkp->device); device_del(&sdkp->disk_dev); del_gendisk(sdkp->disk); if (!sdkp->suspended) sd_shutdown(sdp); put_disk(sdkp->disk); } static inline bool sd_do_start_stop(struct scsi_device *sdev, bool runtime) { return (sdev->manage_system_start_stop && !runtime) || (sdev->manage_runtime_start_stop && runtime); } static int sd_suspend_common(struct device *dev, bool runtime) { struct scsi_disk *sdkp = dev_get_drvdata(dev); int ret = 0; if (!sdkp) /* E.g.: runtime suspend following sd_remove() */ return 0; if (sdkp->WCE && sdkp->media_present) { if (!sdkp->device->silence_suspend) sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); ret = sd_sync_cache(sdkp); /* ignore OFFLINE device */ if (ret == -ENODEV) return 0; if (ret) return ret; } if (sd_do_start_stop(sdkp->device, runtime)) { if (!sdkp->device->silence_suspend) sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n"); /* an error is not worth aborting a system sleep */ ret = sd_start_stop_device(sdkp, 0); if (!runtime) ret = 0; } if (!ret) sdkp->suspended = true; return ret; } static int sd_suspend_system(struct device *dev) { if (pm_runtime_suspended(dev)) return 0; return sd_suspend_common(dev, false); } static int sd_suspend_runtime(struct device *dev) { return sd_suspend_common(dev, true); } static int sd_resume(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); if (opal_unlock_from_suspend(sdkp->opal_dev)) { sd_printk(KERN_NOTICE, sdkp, "OPAL unlock failed\n"); return -EIO; } return 0; } static int sd_resume_common(struct device *dev, bool runtime) { struct scsi_disk *sdkp = dev_get_drvdata(dev); int ret; if (!sdkp) /* E.g.: runtime resume at the start of sd_probe() */ return 0; if (!sd_do_start_stop(sdkp->device, runtime)) { sdkp->suspended = false; return 0; } sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); ret = sd_start_stop_device(sdkp, 1); if (!ret) { sd_resume(dev); sdkp->suspended = false; } return ret; } static int sd_resume_system(struct device *dev) { if (pm_runtime_suspended(dev)) { struct scsi_disk *sdkp = dev_get_drvdata(dev); struct scsi_device *sdp = sdkp ? sdkp->device : NULL; if (sdp && sdp->force_runtime_start_on_system_start) pm_request_resume(dev); return 0; } return sd_resume_common(dev, false); } static int sd_resume_runtime(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); struct scsi_device *sdp; if (!sdkp) /* E.g.: runtime resume at the start of sd_probe() */ return 0; sdp = sdkp->device; if (sdp->ignore_media_change) { /* clear the device's sense data */ static const u8 cmd[10] = { REQUEST_SENSE }; const struct scsi_exec_args exec_args = { .req_flags = BLK_MQ_REQ_PM, }; if (scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, NULL, 0, sdp->request_queue->rq_timeout, 1, &exec_args)) sd_printk(KERN_NOTICE, sdkp, "Failed to clear sense data\n"); } return sd_resume_common(dev, true); } static const struct dev_pm_ops sd_pm_ops = { .suspend = sd_suspend_system, .resume = sd_resume_system, .poweroff = sd_suspend_system, .restore = sd_resume_system, .runtime_suspend = sd_suspend_runtime, .runtime_resume = sd_resume_runtime, }; static struct scsi_driver sd_template = { .probe = sd_probe, .remove = sd_remove, .shutdown = sd_shutdown, .gendrv = { .name = "sd", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sd_pm_ops, }, .rescan = sd_rescan, .resume = sd_resume, .init_command = sd_init_command, .uninit_command = sd_uninit_command, .done = sd_done, .eh_action = sd_eh_action, .eh_reset = sd_eh_reset, }; /** * init_sd - entry point for this driver (both when built in or when * a module). * * Note: this function registers this driver with the scsi mid-level. **/ static int __init init_sd(void) { int majors = 0, i, err; SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n")); for (i = 0; i < SD_MAJORS; i++) { if (__register_blkdev(sd_major(i), "sd", sd_default_probe)) continue; majors++; } if (!majors) return -ENODEV; err = class_register(&sd_disk_class); if (err) goto err_out; sd_page_pool = mempool_create_page_pool(SD_MEMPOOL_SIZE, 0); if (!sd_page_pool) { printk(KERN_ERR "sd: can't init discard page pool\n"); err = -ENOMEM; goto err_out_class; } err = scsi_register_driver(&sd_template); if (err) goto err_out_driver; return 0; err_out_driver: mempool_destroy(sd_page_pool); err_out_class: class_unregister(&sd_disk_class); err_out: for (i = 0; i < SD_MAJORS; i++) unregister_blkdev(sd_major(i), "sd"); return err; } /** * exit_sd - exit point for this driver (when it is a module). * * Note: this function unregisters this driver from the scsi mid-level. **/ static void __exit exit_sd(void) { int i; SCSI_LOG_HLQUEUE(3, printk("exit_sd: exiting sd driver\n")); scsi_unregister_driver(&sd_template); mempool_destroy(sd_page_pool); class_unregister(&sd_disk_class); for (i = 0; i < SD_MAJORS; i++) unregister_blkdev(sd_major(i), "sd"); } module_init(init_sd); module_exit(exit_sd); void sd_print_sense_hdr(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) { scsi_print_sense_hdr(sdkp->device, sdkp->disk ? sdkp->disk->disk_name : NULL, sshdr); } void sd_print_result(const struct scsi_disk *sdkp, const char *msg, int result) { const char *hb_string = scsi_hostbyte_string(result); if (hb_string) sd_printk(KERN_INFO, sdkp, "%s: Result: hostbyte=%s driverbyte=%s\n", msg, hb_string ? hb_string : "invalid", "DRIVER_OK"); else sd_printk(KERN_INFO, sdkp, "%s: Result: hostbyte=0x%02x driverbyte=%s\n", msg, host_byte(result), "DRIVER_OK"); }
72 72 72 17 17 17 1 2 9 12 21 15 15 10 3 9 7 7 7 16 15 3 1 14 16 15 55 55 55 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic INET6 transport hashtables * * Authors: Lotsa people, from code originally in tcp, generalised here * by Arnaldo Carvalho de Melo <acme@mandriva.com> */ #include <linux/module.h> #include <linux/random.h> #include <net/addrconf.h> #include <net/hotdata.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> #include <net/sock_reuseport.h> #include <net/tcp.h> u32 inet6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, const struct in6_addr *faddr, const __be16 fport) { u32 lhash, fhash; net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret)); net_get_random_once(&tcp_ipv6_hash_secret, sizeof(tcp_ipv6_hash_secret)); lhash = (__force u32)laddr->s6_addr32[3]; fhash = __ipv6_addr_jhash(faddr, tcp_ipv6_hash_secret); return lport + __inet6_ehashfn(lhash, 0, fhash, fport, inet6_ehash_secret + net_hash_mix(net)); } EXPORT_SYMBOL_GPL(inet6_ehashfn); /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM * * The sockhash lock must be held as a reader here. */ struct sock *__inet6_lookup_established(const struct net *net, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, const int sdif) { const __portpair ports = INET_COMBINED_PORTS(sport, hnum); const struct hlist_nulls_node *node; struct inet_ehash_bucket *head; struct inet_hashinfo *hashinfo; unsigned int hash, slot; struct sock *sk; hashinfo = net->ipv4.tcp_death_row.hashinfo; hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); slot = hash & hashinfo->ehash_mask; head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) continue; if (!inet6_match(net, sk, saddr, daddr, ports, dif, sdif)) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; if (unlikely(!inet6_match(net, sk, saddr, daddr, ports, dif, sdif))) { sock_gen_put(sk); goto begin; } goto found; } if (get_nulls_value(node) != slot) goto begin; out: sk = NULL; found: return sk; } EXPORT_SYMBOL(__inet6_lookup_established); static inline int compute_score(struct sock *sk, const struct net *net, const unsigned short hnum, const struct in6_addr *daddr, const int dif, const int sdif) { int score = -1; if (net_eq(sock_net(sk), net) && READ_ONCE(inet_sk(sk)->inet_num) == hnum && sk->sk_family == PF_INET6) { if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) return -1; if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; score = sk->sk_bound_dev_if ? 2 : 1; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } return score; } /** * inet6_lookup_reuseport() - execute reuseport logic on AF_INET6 socket if necessary. * @net: network namespace. * @sk: AF_INET6 socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. * @skb: context for a potential SK_REUSEPORT program. * @doff: header offset. * @saddr: source address. * @sport: source port. * @daddr: destination address. * @hnum: destination port in host byte order. * @ehashfn: hash function used to generate the fallback hash. * * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to * the selected sock or an error. */ struct sock *inet6_lookup_reuseport(const struct net *net, struct sock *sk, struct sk_buff *skb, int doff, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, inet6_ehashfn_t *ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { phash = INDIRECT_CALL_INET(ehashfn, udp6_ehashfn, inet6_ehashfn, net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; } EXPORT_SYMBOL_GPL(inet6_lookup_reuseport); /* called with rcu_read_lock() */ static struct sock *inet6_lhash2_lookup(const struct net *net, struct inet_listen_hashbucket *ilb2, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif) { struct sock *sk, *result = NULL; struct hlist_nulls_node *node; int score, hiscore = 0; sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, inet6_ehashfn); if (result) return result; result = sk; hiscore = score; } } return result; } struct sock *inet6_lookup_run_sk_lookup(const struct net *net, int protocol, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, inet6_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool no_reuseport; no_reuseport = bpf_sk_lookup_run_v6(net, protocol, saddr, sport, daddr, hnum, dif, &sk); if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, ehashfn); if (reuse_sk) sk = reuse_sk; return sk; } EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup); struct sock *inet6_lookup_listener(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; struct inet_hashinfo *hashinfo; struct sock *result = NULL; unsigned int hash2; /* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { result = inet6_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, saddr, sport, daddr, hnum, dif, inet6_ehashfn); if (result) goto done; } hashinfo = net->ipv4.tcp_death_row.hashinfo; hash2 = ipv6_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet6_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, daddr, hnum, dif, sdif); if (result) goto done; /* Lookup lhash2 with in6addr_any */ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet6_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, &in6addr_any, hnum, dif, sdif); done: if (IS_ERR(result)) return NULL; return result; } EXPORT_SYMBOL_GPL(inet6_lookup_listener); struct sock *inet6_lookup(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif) { struct sock *sk; bool refcounted; sk = __inet6_lookup(net, skb, doff, saddr, sport, daddr, ntohs(dport), dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; } EXPORT_SYMBOL_GPL(inet6_lookup); static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, const __u16 lport, struct inet_timewait_sock **twp, bool rcu_lookup, u32 hash) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr; const struct in6_addr *saddr = &sk->sk_v6_daddr; const int dif = sk->sk_bound_dev_if; struct net *net = sock_net(sk); const int sdif = l3mdev_master_ifindex_by_index(net, dif); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); struct inet_timewait_sock *tw = NULL; const struct hlist_nulls_node *node; struct sock *sk2; spinlock_t *lock; if (rcu_lookup) { sk_nulls_for_each(sk2, node, &head->chain) { if (sk2->sk_hash != hash || !inet6_match(net, sk2, saddr, daddr, ports, dif, sdif)) continue; if (sk2->sk_state == TCP_TIME_WAIT) break; return -EADDRNOTAVAIL; } return 0; } lock = inet_ehash_lockp(hinfo, hash); spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { if (sk2->sk_hash != hash) continue; if (likely(inet6_match(net, sk2, saddr, daddr, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); if (tcp_twsk_unique(sk, sk2, twp)) break; } goto not_unique; } } /* Must record num and sport now. Otherwise we will see * in hash table socket with a funny identity. */ inet->inet_num = lport; inet->inet_sport = htons(lport); sk->sk_hash = hash; WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { sk_nulls_del_node_init_rcu((struct sock *)tw); __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ inet_twsk_deschedule_put(tw); } return 0; not_unique: spin_unlock(lock); return -EADDRNOTAVAIL; } static u64 inet6_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); return secure_ipv6_port_ephemeral(sk->sk_v6_rcv_saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32, inet->inet_dport); } int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr; const struct in6_addr *saddr = &sk->sk_v6_daddr; const struct inet_sock *inet = inet_sk(sk); const struct net *net = sock_net(sk); u64 port_offset = 0; u32 hash_port0; if (!inet_sk(sk)->inet_num) port_offset = inet6_sk_port_offset(sk); hash_port0 = inet6_ehashfn(net, daddr, 0, saddr, inet->inet_dport); return __inet_hash_connect(death_row, sk, port_offset, hash_port0, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect);
14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 // SPDX-License-Identifier: GPL-2.0 #include <net/genetlink.h> #include <net/netns/generic.h> #include <uapi/linux/genetlink.h> #include "ila.h" static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, }; static const struct genl_ops ila_nl_ops[] = { { .cmd = ILA_CMD_ADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ila_xlat_nl_cmd_add_mapping, .flags = GENL_ADMIN_PERM, }, { .cmd = ILA_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ila_xlat_nl_cmd_del_mapping, .flags = GENL_ADMIN_PERM, }, { .cmd = ILA_CMD_FLUSH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ila_xlat_nl_cmd_flush, .flags = GENL_ADMIN_PERM, }, { .cmd = ILA_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ila_xlat_nl_cmd_get_mapping, .start = ila_xlat_nl_dump_start, .dumpit = ila_xlat_nl_dump, .done = ila_xlat_nl_dump_done, }, }; unsigned int ila_net_id; struct genl_family ila_nl_family __ro_after_init = { .hdrsize = 0, .name = ILA_GENL_NAME, .version = ILA_GENL_VERSION, .maxattr = ILA_ATTR_MAX, .policy = ila_nl_policy, .netnsok = true, .parallel_ops = true, .module = THIS_MODULE, .ops = ila_nl_ops, .n_ops = ARRAY_SIZE(ila_nl_ops), .resv_start_op = ILA_CMD_FLUSH + 1, }; static __net_init int ila_init_net(struct net *net) { int err; err = ila_xlat_init_net(net); if (err) goto ila_xlat_init_fail; return 0; ila_xlat_init_fail: return err; } static __net_exit void ila_pre_exit_net(struct net *net) { ila_xlat_pre_exit_net(net); } static __net_exit void ila_exit_net(struct net *net) { ila_xlat_exit_net(net); } static struct pernet_operations ila_net_ops = { .init = ila_init_net, .pre_exit = ila_pre_exit_net, .exit = ila_exit_net, .id = &ila_net_id, .size = sizeof(struct ila_net), }; static int __init ila_init(void) { int ret; ret = register_pernet_device(&ila_net_ops); if (ret) goto register_device_fail; ret = genl_register_family(&ila_nl_family); if (ret) goto register_family_fail; ret = ila_lwt_init(); if (ret) goto fail_lwt; return 0; fail_lwt: genl_unregister_family(&ila_nl_family); register_family_fail: unregister_pernet_device(&ila_net_ops); register_device_fail: return ret; } static void __exit ila_fini(void) { ila_lwt_fini(); genl_unregister_family(&ila_nl_family); unregister_pernet_device(&ila_net_ops); } module_init(ila_init); module_exit(ila_fini); MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IPv6: Identifier Locator Addressing (ILA)");
18 18 18 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS An implementation of the IP virtual server support for the * LINUX operating system. IPVS is now implemented as a module * over the Netfilter framework. IPVS can be used to build a * high-performance and highly available server based on a * cluster of servers. * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Peter Kese <peter.kese@ijs.si> * Julian Anastasov <ja@ssi.bg> * * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms * and others. * * Changes: * Paul `Rusty' Russell properly handle non-linear skbs * Harald Welte don't use nfcache */ #define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/ip.h> #include <linux/tcp.h> #include <linux/sctp.h> #include <linux/icmp.h> #include <linux/slab.h> #include <net/ip.h> #include <net/tcp.h> #include <net/udp.h> #include <net/icmp.h> /* for icmp_send */ #include <net/gue.h> #include <net/gre.h> #include <net/route.h> #include <net/ip6_checksum.h> #include <net/netns/generic.h> /* net_generic() */ #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #ifdef CONFIG_IP_VS_IPV6 #include <net/ipv6.h> #include <linux/netfilter_ipv6.h> #include <net/ip6_route.h> #endif #include <net/ip_vs.h> #include <linux/indirect_call_wrapper.h> EXPORT_SYMBOL(register_ip_vs_scheduler); EXPORT_SYMBOL(unregister_ip_vs_scheduler); EXPORT_SYMBOL(ip_vs_proto_name); EXPORT_SYMBOL(ip_vs_conn_new); EXPORT_SYMBOL(ip_vs_conn_in_get); EXPORT_SYMBOL(ip_vs_conn_out_get); #ifdef CONFIG_IP_VS_PROTO_TCP EXPORT_SYMBOL(ip_vs_tcp_conn_listen); #endif EXPORT_SYMBOL(ip_vs_conn_put); #ifdef CONFIG_IP_VS_DEBUG EXPORT_SYMBOL(ip_vs_get_debug_level); #endif EXPORT_SYMBOL(ip_vs_new_conn_out); #if defined(CONFIG_IP_VS_PROTO_TCP) && defined(CONFIG_IP_VS_PROTO_UDP) #define SNAT_CALL(f, ...) \ INDIRECT_CALL_2(f, tcp_snat_handler, udp_snat_handler, __VA_ARGS__) #elif defined(CONFIG_IP_VS_PROTO_TCP) #define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, tcp_snat_handler, __VA_ARGS__) #elif defined(CONFIG_IP_VS_PROTO_UDP) #define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, udp_snat_handler, __VA_ARGS__) #else #define SNAT_CALL(f, ...) f(__VA_ARGS__) #endif static unsigned int ip_vs_net_id __read_mostly; /* netns cnt used for uniqueness */ static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); /* ID used in ICMP lookups */ #define icmp_id(icmph) (((icmph)->un).echo.id) #define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) const char *ip_vs_proto_name(unsigned int proto) { static char buf[20]; switch (proto) { case IPPROTO_IP: return "IP"; case IPPROTO_UDP: return "UDP"; case IPPROTO_TCP: return "TCP"; case IPPROTO_SCTP: return "SCTP"; case IPPROTO_ICMP: return "ICMP"; #ifdef CONFIG_IP_VS_IPV6 case IPPROTO_ICMPV6: return "ICMPv6"; #endif default: sprintf(buf, "IP_%u", proto); return buf; } } void ip_vs_init_hash_table(struct list_head *table, int rows) { while (--rows >= 0) INIT_LIST_HEAD(&table[rows]); } static inline void ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { struct ip_vs_dest *dest = cp->dest; struct netns_ipvs *ipvs = cp->ipvs; if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { struct ip_vs_cpu_stats *s; struct ip_vs_service *svc; local_bh_disable(); s = this_cpu_ptr(dest->stats.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.inpkts); u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.inpkts); u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.inpkts); u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); local_bh_enable(); } } static inline void ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { struct ip_vs_dest *dest = cp->dest; struct netns_ipvs *ipvs = cp->ipvs; if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { struct ip_vs_cpu_stats *s; struct ip_vs_service *svc; local_bh_disable(); s = this_cpu_ptr(dest->stats.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.outpkts); u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.outpkts); u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.outpkts); u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); local_bh_enable(); } } static inline void ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) { struct netns_ipvs *ipvs = svc->ipvs; struct ip_vs_cpu_stats *s; local_bh_disable(); s = this_cpu_ptr(cp->dest->stats.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); local_bh_enable(); } static inline void ip_vs_set_state(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { if (likely(pd->pp->state_transition)) pd->pp->state_transition(cp, direction, skb, pd); } static inline int ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, struct sk_buff *skb, int protocol, const union nf_inet_addr *caddr, __be16 cport, const union nf_inet_addr *vaddr, __be16 vport, struct ip_vs_conn_param *p) { ip_vs_conn_fill_param(svc->ipvs, svc->af, protocol, caddr, cport, vaddr, vport, p); p->pe = rcu_dereference(svc->pe); if (p->pe && p->pe->fill_param) return p->pe->fill_param(p, skb); return 0; } /* * IPVS persistent scheduling function * It creates a connection entry according to its template if exists, * or selects a server and creates a connection entry plus a template. * Locking: we are svc user (svc->refcnt), so we hold all dests too * Protocols supported: TCP, UDP */ static struct ip_vs_conn * ip_vs_sched_persist(struct ip_vs_service *svc, struct sk_buff *skb, __be16 src_port, __be16 dst_port, int *ignored, struct ip_vs_iphdr *iph) { struct ip_vs_conn *cp = NULL; struct ip_vs_dest *dest; struct ip_vs_conn *ct; __be16 dport = 0; /* destination port to forward */ unsigned int flags; struct ip_vs_conn_param param; const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; union nf_inet_addr snet; /* source network of the client, after masking */ const union nf_inet_addr *src_addr, *dst_addr; if (likely(!ip_vs_iph_inverse(iph))) { src_addr = &iph->saddr; dst_addr = &iph->daddr; } else { src_addr = &iph->daddr; dst_addr = &iph->saddr; } /* Mask saddr with the netmask to adjust template granularity */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) ipv6_addr_prefix(&snet.in6, &src_addr->in6, (__force __u32) svc->netmask); else #endif snet.ip = src_addr->ip & svc->netmask; IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " "mnet %s\n", IP_VS_DBG_ADDR(svc->af, src_addr), ntohs(src_port), IP_VS_DBG_ADDR(svc->af, dst_addr), ntohs(dst_port), IP_VS_DBG_ADDR(svc->af, &snet)); /* * As far as we know, FTP is a very complicated network protocol, and * it uses control connection and data connections. For active FTP, * FTP server initialize data connection to the client, its source port * is often 20. For passive FTP, FTP server tells the clients the port * that it passively listens to, and the client issues the data * connection. In the tunneling or direct routing mode, the load * balancer is on the client-to-server half of connection, the port * number is unknown to the load balancer. So, a conn template like * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP * service, and a template like <caddr, 0, vaddr, vport, daddr, dport> * is created for other persistent services. */ { int protocol = iph->protocol; const union nf_inet_addr *vaddr = dst_addr; __be16 vport = 0; if (dst_port == svc->port) { /* non-FTP template: * <protocol, caddr, 0, vaddr, vport, daddr, dport> * FTP template: * <protocol, caddr, 0, vaddr, 0, daddr, 0> */ if (svc->port != FTPPORT) vport = dst_port; } else { /* Note: persistent fwmark-based services and * persistent port zero service are handled here. * fwmark template: * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> * port zero template: * <protocol,caddr,0,vaddr,0,daddr,0> */ if (svc->fwmark) { protocol = IPPROTO_IP; vaddr = &fwmark; } } /* return *ignored = -1 so NF_DROP can be used */ if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, vaddr, vport, &param) < 0) { *ignored = -1; return NULL; } } /* Check if a template already exists */ ct = ip_vs_ct_in_get(&param); if (!ct || !ip_vs_check_template(ct, NULL)) { struct ip_vs_scheduler *sched; /* * No template found or the dest of the connection * template is not available. * return *ignored=0 i.e. ICMP and NF_DROP */ sched = rcu_dereference(svc->scheduler); if (sched) { /* read svc->sched_data after svc->scheduler */ smp_rmb(); dest = sched->schedule(svc, skb, iph); } else { dest = NULL; } if (!dest) { IP_VS_DBG(1, "p-schedule: no dest found.\n"); kfree(param.pe_data); *ignored = 0; return NULL; } if (dst_port == svc->port && svc->port != FTPPORT) dport = dest->port; /* Create a template * This adds param.pe_data to the template, * and thus param.pe_data will be destroyed * when the template expires */ ct = ip_vs_conn_new(&param, dest->af, &dest->addr, dport, IP_VS_CONN_F_TEMPLATE, dest, skb->mark); if (ct == NULL) { kfree(param.pe_data); *ignored = -1; return NULL; } ct->timeout = svc->timeout; } else { /* set destination with the found template */ dest = ct->dest; kfree(param.pe_data); } dport = dst_port; if (dport == svc->port && dest->port) dport = dest->port; flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; /* * Create a new connection according to the template */ ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, src_addr, src_port, dst_addr, dst_port, &param); cp = ip_vs_conn_new(&param, dest->af, &dest->addr, dport, flags, dest, skb->mark); if (cp == NULL) { ip_vs_conn_put(ct); *ignored = -1; return NULL; } /* * Add its control */ ip_vs_control_add(cp, ct); ip_vs_conn_put(ct); ip_vs_conn_stats(cp, svc); return cp; } /* * IPVS main scheduling function * It selects a server according to the virtual service, and * creates a connection entry. * Protocols supported: TCP, UDP * * Usage of *ignored * * 1 : protocol tried to schedule (eg. on SYN), found svc but the * svc/scheduler decides that this packet should be accepted with * NF_ACCEPT because it must not be scheduled. * * 0 : scheduler can not find destination, so try bypass or * return ICMP and then NF_DROP (ip_vs_leave). * * -1 : scheduler tried to schedule but fatal error occurred, eg. * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param * failure such as missing Call-ID, ENOMEM on skb_linearize * or pe_data. In this case we should return NF_DROP without * any attempts to send ICMP with ip_vs_leave. */ struct ip_vs_conn * ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *ignored, struct ip_vs_iphdr *iph) { struct ip_vs_protocol *pp = pd->pp; struct ip_vs_conn *cp = NULL; struct ip_vs_scheduler *sched; struct ip_vs_dest *dest; __be16 _ports[2], *pptr, cport, vport; const void *caddr, *vaddr; unsigned int flags; *ignored = 1; /* * IPv6 frags, only the first hit here. */ pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); if (pptr == NULL) return NULL; if (likely(!ip_vs_iph_inverse(iph))) { cport = pptr[0]; caddr = &iph->saddr; vport = pptr[1]; vaddr = &iph->daddr; } else { cport = pptr[1]; caddr = &iph->daddr; vport = pptr[0]; vaddr = &iph->saddr; } /* * FTPDATA needs this check when using local real server. * Never schedule Active FTPDATA connections from real server. * For LVS-NAT they must be already created. For other methods * with persistence the connection is created on SYN+ACK. */ if (cport == FTPDATA) { IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, "Not scheduling FTPDATA"); return NULL; } /* * Do not schedule replies from local real server. */ if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) { iph->hdr_flags ^= IP_VS_HDR_INVERSE; cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, svc->ipvs, svc->af, skb, iph); iph->hdr_flags ^= IP_VS_HDR_INVERSE; if (cp) { IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, "Not scheduling reply for existing" " connection"); __ip_vs_conn_put(cp); return NULL; } } /* * Persistent service */ if (svc->flags & IP_VS_SVC_F_PERSISTENT) return ip_vs_sched_persist(svc, skb, cport, vport, ignored, iph); *ignored = 0; /* * Non-persistent service */ if (!svc->fwmark && vport != svc->port) { if (!svc->port) pr_err("Schedule: port zero only supported " "in persistent services, " "check your ipvs configuration\n"); return NULL; } sched = rcu_dereference(svc->scheduler); if (sched) { /* read svc->sched_data after svc->scheduler */ smp_rmb(); dest = sched->schedule(svc, skb, iph); } else { dest = NULL; } if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); return NULL; } flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; /* * Create a connection entry. */ { struct ip_vs_conn_param p; ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, caddr, cport, vaddr, vport, &p); cp = ip_vs_conn_new(&p, dest->af, &dest->addr, dest->port ? dest->port : vport, flags, dest, skb->mark); if (!cp) { *ignored = -1; return NULL; } } IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " "d:%s:%u conn->flags:%X conn->refcnt:%d\n", ip_vs_fwd_tag(cp), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), cp->flags, refcount_read(&cp->refcnt)); ip_vs_conn_stats(cp, svc); return cp; } static inline int ip_vs_addr_is_unicast(struct net *net, int af, union nf_inet_addr *addr) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) return ipv6_addr_type(&addr->in6) & IPV6_ADDR_UNICAST; #endif return (inet_addr_type(net, addr->ip) == RTN_UNICAST); } /* * Pass or drop the packet. * Called by ip_vs_in, when the virtual service is available but * no destination is available for a new connection. */ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph) { __be16 _ports[2], *pptr, dport; struct netns_ipvs *ipvs = svc->ipvs; struct net *net = ipvs->net; pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); if (!pptr) return NF_DROP; dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0]; /* if it is fwmark-based service, the cache_bypass sysctl is up and the destination is a non-local unicast, then create a cache_bypass connection entry */ if (sysctl_cache_bypass(ipvs) && svc->fwmark && !(iph->hdr_flags & (IP_VS_HDR_INVERSE | IP_VS_HDR_ICMP)) && ip_vs_addr_is_unicast(net, svc->af, &iph->daddr)) { int ret; struct ip_vs_conn *cp; unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; /* create a new connection entry */ IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); { struct ip_vs_conn_param p; ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], &p); cp = ip_vs_conn_new(&p, svc->af, &daddr, 0, IP_VS_CONN_F_BYPASS | flags, NULL, skb->mark); if (!cp) return NF_DROP; } /* statistics */ ip_vs_in_stats(cp, skb); /* set state */ ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); /* transmit the first SYN packet */ ret = cp->packet_xmit(skb, cp, pd->pp, iph); /* do not touch skb anymore */ if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) atomic_inc(&cp->control->in_pkts); else atomic_inc(&cp->in_pkts); ip_vs_conn_put(cp); return ret; } /* * When the virtual ftp service is presented, packets destined * for other services on the VIP may get here (except services * listed in the ipvs table), pass the packets, because it is * not ipvs job to decide to drop the packets. */ if (svc->port == FTPPORT && dport != FTPPORT) return NF_ACCEPT; if (unlikely(ip_vs_iph_icmp(iph))) return NF_DROP; /* * Notify the client that the destination is unreachable, and * release the socket buffer. * Since it is in IP layer, the TCP socket is not actually * created, the TCP RST packet cannot be sent, instead that * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) { if (!skb->dev) skb->dev = net->loopback_dev; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); } else #endif icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); return NF_DROP; } #ifdef CONFIG_SYSCTL static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return ipvs->sysctl_snat_reroute; } static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return ipvs->sysctl_nat_icmp_send; } #else static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; } static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; } #endif __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) { return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); } static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) { if (NF_INET_LOCAL_IN == hooknum) return IP_DEFRAG_VS_IN; if (NF_INET_FORWARD == hooknum) return IP_DEFRAG_VS_FWD; return IP_DEFRAG_VS_OUT; } static inline int ip_vs_gather_frags(struct netns_ipvs *ipvs, struct sk_buff *skb, u_int32_t user) { int err; local_bh_disable(); err = ip_defrag(ipvs->net, skb, user); local_bh_enable(); if (!err) ip_send_check(ip_hdr(skb)); return err; } static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, unsigned int hooknum) { if (!sysctl_snat_reroute(ipvs)) return 0; /* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */ if (NF_INET_LOCAL_IN == hooknum) return 0; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { struct dst_entry *dst = skb_dst(skb); if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) && ip6_route_me_harder(ipvs->net, skb->sk, skb) != 0) return 1; } else #endif if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) && ip_route_me_harder(ipvs->net, skb->sk, skb, RTN_LOCAL) != 0) return 1; return 0; } /* * Packet has been made sufficiently writable in caller * - inout: 1=in->out, 0=out->in */ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int inout) { struct iphdr *iph = ip_hdr(skb); unsigned int icmp_offset = iph->ihl*4; struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + icmp_offset); struct iphdr *ciph = (struct iphdr *)(icmph + 1); if (inout) { iph->saddr = cp->vaddr.ip; ip_send_check(iph); ciph->daddr = cp->vaddr.ip; ip_send_check(ciph); } else { iph->daddr = cp->daddr.ip; ip_send_check(iph); ciph->saddr = cp->daddr.ip; ip_send_check(ciph); } /* the TCP/UDP/SCTP port */ if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol || IPPROTO_SCTP == ciph->protocol) { __be16 *ports = (void *)ciph + ciph->ihl*4; if (inout) ports[1] = cp->vport; else ports[0] = cp->dport; } /* And finally the ICMP checksum */ icmph->checksum = 0; icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); skb->ip_summed = CHECKSUM_UNNECESSARY; if (inout) IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered outgoing ICMP"); else IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered incoming ICMP"); } #ifdef CONFIG_IP_VS_IPV6 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int inout) { struct ipv6hdr *iph = ipv6_hdr(skb); unsigned int icmp_offset = 0; unsigned int offs = 0; /* header offset*/ int protocol; struct icmp6hdr *icmph; struct ipv6hdr *ciph; unsigned short fragoffs; ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL); icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset); offs = icmp_offset + sizeof(struct icmp6hdr); ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs); protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL); if (inout) { iph->saddr = cp->vaddr.in6; ciph->daddr = cp->vaddr.in6; } else { iph->daddr = cp->daddr.in6; ciph->saddr = cp->daddr.in6; } /* the TCP/UDP/SCTP port */ if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || IPPROTO_SCTP == protocol)) { __be16 *ports = (void *)(skb_network_header(skb) + offs); IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__, ntohs(inout ? ports[1] : ports[0]), ntohs(inout ? cp->vport : cp->dport)); if (inout) ports[1] = cp->vport; else ports[0] = cp->dport; } /* And finally the ICMP checksum */ icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, skb->len - icmp_offset, IPPROTO_ICMPV6, 0); skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset; skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); skb->ip_summed = CHECKSUM_PARTIAL; if (inout) IP_VS_DBG_PKT(11, AF_INET6, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered outgoing ICMPv6"); else IP_VS_DBG_PKT(11, AF_INET6, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered incoming ICMPv6"); } #endif /* Handle relevant response ICMP messages - forward to the right * destination host. */ static int handle_response_icmp(int af, struct sk_buff *skb, union nf_inet_addr *snet, __u8 protocol, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, unsigned int offset, unsigned int ihl, unsigned int hooknum) { unsigned int verdict = NF_DROP; if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) goto after_nat; /* Ensure the checksum is correct */ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { /* Failed checksum! */ IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", IP_VS_DBG_ADDR(af, snet)); goto out; } if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || IPPROTO_SCTP == protocol) offset += 2 * sizeof(__u16); if (skb_ensure_writable(skb, offset)) goto out; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) ip_vs_nat_icmp_v6(skb, pp, cp, 1); else #endif ip_vs_nat_icmp(skb, pp, cp, 1); if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) goto out; after_nat: /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); skb->ipvs_property = 1; if (!(cp->flags & IP_VS_CONN_F_NFCT)) ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 0); verdict = NF_ACCEPT; out: __ip_vs_conn_put(cp); return verdict; } /* * Handle ICMP messages in the inside-to-outside direction (outgoing). * Find any that might be relevant, check against existing connections. * Currently handles error types - unreachable, quench, ttl exceeded. */ static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, unsigned int hooknum) { struct iphdr *iph; struct icmphdr _icmph, *ic; struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ struct ip_vs_iphdr ciph; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; unsigned int offset, ihl; union nf_inet_addr snet; *related = 1; /* reassemble IP fragments */ if (ip_is_fragment(ip_hdr(skb))) { if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } iph = ip_hdr(skb); offset = ihl = iph->ihl * 4; ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); if (ic == NULL) return NF_DROP; IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n", ic->type, ntohs(icmp_id(ic)), &iph->saddr, &iph->daddr); /* * Work through seeing if this is for us. * These checks are supposed to be in an order that means easy * things are checked first to speed up processing.... however * this means that some packets will manage to get a long way * down this stack and then be rejected, but that's life. */ if ((ic->type != ICMP_DEST_UNREACH) && (ic->type != ICMP_SOURCE_QUENCH) && (ic->type != ICMP_TIME_EXCEEDED)) { *related = 0; return NF_ACCEPT; } /* Now find the contained IP header */ offset += sizeof(_icmph); cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ pp = ip_vs_proto_get(cih->protocol); if (!pp) return NF_ACCEPT; /* Is the embedded protocol header present? */ if (unlikely(cih->frag_off & htons(IP_OFFSET) && pp->dont_defrag)) return NF_ACCEPT; IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, "Checking outgoing ICMP for"); ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph); /* The embedded headers contain source and dest in reverse order */ cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, ipvs, AF_INET, skb, &ciph); if (!cp) return NF_ACCEPT; snet.ip = iph->saddr; return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, pp, ciph.len, ihl, hooknum); } #ifdef CONFIG_IP_VS_IPV6 static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, unsigned int hooknum, struct ip_vs_iphdr *ipvsh) { struct icmp6hdr _icmph, *ic; struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; union nf_inet_addr snet; unsigned int offset; *related = 1; ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph); if (ic == NULL) return NF_DROP; /* * Work through seeing if this is for us. * These checks are supposed to be in an order that means easy * things are checked first to speed up processing.... however * this means that some packets will manage to get a long way * down this stack and then be rejected, but that's life. */ if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { *related = 0; return NF_ACCEPT; } /* Fragment header that is before ICMP header tells us that: * it's not an error message since they can't be fragmented. */ if (ipvsh->flags & IP6_FH_F_FRAG) return NF_DROP; IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n", ic->icmp6_type, ntohs(icmpv6_id(ic)), &ipvsh->saddr, &ipvsh->daddr); if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, ipvsh->len + sizeof(_icmph), true, &ciph)) return NF_ACCEPT; /* The packet looks wrong, ignore */ pp = ip_vs_proto_get(ciph.protocol); if (!pp) return NF_ACCEPT; /* The embedded headers contain source and dest in reverse order */ cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, ipvs, AF_INET6, skb, &ciph); if (!cp) return NF_ACCEPT; snet.in6 = ciph.saddr.in6; offset = ciph.len; return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, pp, offset, sizeof(struct ipv6hdr), hooknum); } #endif /* * Check if sctp chunc is ABORT chunk */ static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len) { struct sctp_chunkhdr *sch, schunk; sch = skb_header_pointer(skb, nh_len + sizeof(struct sctphdr), sizeof(schunk), &schunk); if (sch == NULL) return 0; if (sch->type == SCTP_CID_ABORT) return 1; return 0; } static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) { struct tcphdr _tcph, *th; th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); if (th == NULL) return 0; return th->rst; } static inline bool is_new_conn(const struct sk_buff *skb, struct ip_vs_iphdr *iph) { switch (iph->protocol) { case IPPROTO_TCP: { struct tcphdr _tcph, *th; th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); if (th == NULL) return false; return th->syn; } case IPPROTO_SCTP: { struct sctp_chunkhdr *sch, schunk; sch = skb_header_pointer(skb, iph->len + sizeof(struct sctphdr), sizeof(schunk), &schunk); if (sch == NULL) return false; return sch->type == SCTP_CID_INIT; } default: return false; } } static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, int conn_reuse_mode) { /* Controlled (FTP DATA or persistence)? */ if (cp->control) return false; switch (cp->protocol) { case IPPROTO_TCP: return (cp->state == IP_VS_TCP_S_TIME_WAIT) || (cp->state == IP_VS_TCP_S_CLOSE) || ((conn_reuse_mode & 2) && (cp->state == IP_VS_TCP_S_FIN_WAIT) && (cp->flags & IP_VS_CONN_F_NOOUTPUT)); case IPPROTO_SCTP: return cp->state == IP_VS_SCTP_S_CLOSED; default: return false; } } /* Generic function to create new connections for outgoing RS packets * * Pre-requisites for successful connection creation: * 1) Virtual Service is NOT fwmark based: * In fwmark-VS actual vaddr and vport are unknown to IPVS * 2) Real Server and Virtual Service were NOT configured without port: * This is to allow match of different VS to the same RS ip-addr */ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct sk_buff *skb, const struct ip_vs_iphdr *iph, __be16 dport, __be16 cport) { struct ip_vs_conn_param param; struct ip_vs_conn *ct = NULL, *cp = NULL; const union nf_inet_addr *vaddr, *daddr, *caddr; union nf_inet_addr snet; __be16 vport; unsigned int flags; vaddr = &svc->addr; vport = svc->port; daddr = &iph->saddr; caddr = &iph->daddr; /* check pre-requisites are satisfied */ if (svc->fwmark) return NULL; if (!vport || !dport) return NULL; /* for persistent service first create connection template */ if (svc->flags & IP_VS_SVC_F_PERSISTENT) { /* apply netmask the same way ingress-side does */ #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) ipv6_addr_prefix(&snet.in6, &caddr->in6, (__force __u32)svc->netmask); else #endif snet.ip = caddr->ip & svc->netmask; /* fill params and create template if not existent */ if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol, &snet, 0, vaddr, vport, &param) < 0) return NULL; ct = ip_vs_ct_in_get(&param); /* check if template exists and points to the same dest */ if (!ct || !ip_vs_check_template(ct, dest)) { ct = ip_vs_conn_new(&param, dest->af, daddr, dport, IP_VS_CONN_F_TEMPLATE, dest, 0); if (!ct) { kfree(param.pe_data); return NULL; } ct->timeout = svc->timeout; } else { kfree(param.pe_data); } } /* connection flags */ flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) && iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; /* create connection */ ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, caddr, cport, vaddr, vport, &param); cp = ip_vs_conn_new(&param, dest->af, daddr, dport, flags, dest, 0); if (!cp) { if (ct) ip_vs_conn_put(ct); return NULL; } if (ct) { ip_vs_control_add(cp, ct); ip_vs_conn_put(ct); } ip_vs_conn_stats(cp, svc); /* return connection (will be used to handle outgoing packet) */ IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u " "d:%s:%u conn->flags:%X conn->refcnt:%d\n", ip_vs_fwd_tag(cp), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), cp->flags, refcount_read(&cp->refcnt)); return cp; } /* Handle outgoing packets which are considered requests initiated by * real servers, so that subsequent responses from external client can be * routed to the right real server. * Used also for outgoing responses in OPS mode. * * Connection management is handled by persistent-engine specific callback. */ static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum, struct netns_ipvs *ipvs, int af, struct sk_buff *skb, const struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest; struct ip_vs_conn *cp = NULL; __be16 _ports[2], *pptr; if (hooknum == NF_INET_LOCAL_IN) return NULL; pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); if (!pptr) return NULL; dest = ip_vs_find_real_service(ipvs, af, iph->protocol, &iph->saddr, pptr[0]); if (dest) { struct ip_vs_service *svc; struct ip_vs_pe *pe; svc = rcu_dereference(dest->svc); if (svc) { pe = rcu_dereference(svc->pe); if (pe && pe->conn_out) cp = pe->conn_out(svc, dest, skb, iph, pptr[0], pptr[1]); } } return cp; } /* Handle response packets: rewrite addresses and send away... */ static unsigned int handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph, unsigned int hooknum) { struct ip_vs_protocol *pp = pd->pp; if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) goto after_nat; IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet"); if (skb_ensure_writable(skb, iph->len)) goto drop; /* mangle the packet */ if (pp->snat_handler && !SNAT_CALL(pp->snat_handler, skb, pp, cp, iph)) goto drop; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) ipv6_hdr(skb)->saddr = cp->vaddr.in6; else #endif { ip_hdr(skb)->saddr = cp->vaddr.ip; ip_send_check(ip_hdr(skb)); } /* * nf_iterate does not expect change in the skb->dst->dev. * It looks like it is not fatal to enable this code for hooks * where our handlers are at the end of the chain list and * when all next handlers use skb->dst->dev and not outdev. * It will definitely route properly the inout NAT traffic * when multiple paths are used. */ /* For policy routing, packets originating from this * machine itself may be routed differently to packets * passing through. We want this packet to be routed as * if it came from this machine itself. So re-compute * the routing information. */ if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) goto drop; IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT"); after_nat: ip_vs_out_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd); skb->ipvs_property = 1; if (!(cp->flags & IP_VS_CONN_F_NFCT)) ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 0); ip_vs_conn_put(cp); return NF_ACCEPT; drop: ip_vs_conn_put(cp); kfree_skb(skb); return NF_STOLEN; } /* * Check if outgoing packet belongs to the established ip_vs_conn. */ static unsigned int ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct netns_ipvs *ipvs = net_ipvs(state->net); unsigned int hooknum = state->hook; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; int af = state->pf; struct sock *sk; /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) return NF_ACCEPT; sk = skb_to_full_sk(skb); /* Bad... Do not break raw sockets */ if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && af == AF_INET)) { if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk)) return NF_ACCEPT; } if (unlikely(!skb_dst(skb))) return NF_ACCEPT; ip_vs_fill_iph_skb(af, skb, false, &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { int related; int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related, hooknum, &iph); if (related) return verdict; } } else #endif if (unlikely(iph.protocol == IPPROTO_ICMP)) { int related; int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum); if (related) return verdict; } pd = ip_vs_proto_data_get(ipvs, iph.protocol); if (unlikely(!pd)) return NF_ACCEPT; pp = pd->pp; /* reassemble IP fragments */ #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET) #endif if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) { if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; ip_vs_fill_iph_skb(AF_INET, skb, false, &iph); } /* * Check if the packet belongs to an existing entry */ cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, ipvs, af, skb, &iph); if (likely(cp)) return handle_response(af, skb, pd, cp, &iph, hooknum); /* Check for real-server-started requests */ if (atomic_read(&ipvs->conn_out_counter)) { /* Currently only for UDP: * connection oriented protocols typically use * ephemeral ports for outgoing connections, so * related incoming responses would not match any VS */ if (pp->protocol == IPPROTO_UDP) { cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph); if (likely(cp)) return handle_response(af, skb, pd, cp, &iph, hooknum); } } if (sysctl_nat_icmp_send(ipvs) && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || pp->protocol == IPPROTO_SCTP)) { __be16 _ports[2], *pptr; pptr = frag_safe_skb_hp(skb, iph.len, sizeof(_ports), _ports); if (pptr == NULL) return NF_ACCEPT; /* Not for me */ if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr, pptr[0])) { /* * Notify the real server: there is no * existing entry if it is not RST * packet or not TCP packet. */ if ((iph.protocol != IPPROTO_TCP && iph.protocol != IPPROTO_SCTP) || ((iph.protocol == IPPROTO_TCP && !is_tcp_reset(skb, iph.len)) || (iph.protocol == IPPROTO_SCTP && !is_sctp_abort(skb, iph.len)))) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (!skb->dev) skb->dev = ipvs->net->loopback_dev; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); } else #endif icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); return NF_DROP; } } } IP_VS_DBG_PKT(12, af, pp, skb, iph.off, "ip_vs_out: packet continues traversal as normal"); return NF_ACCEPT; } static unsigned int ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { struct ip_vs_protocol *pp = pd->pp; if (!iph->fragoffs) { /* No (second) fragments need to enter here, as nf_defrag_ipv6 * replayed fragment zero will already have created the cp */ /* Schedule and create new connection entry into cpp */ if (!pp->conn_schedule(ipvs, af, skb, pd, verdict, cpp, iph)) return 0; } if (unlikely(!*cpp)) { /* sorry, all this trouble for a no-hit :) */ IP_VS_DBG_PKT(12, af, pp, skb, iph->off, "ip_vs_in: packet continues traversal as normal"); /* Fragment couldn't be mapped to a conn entry */ if (iph->fragoffs) IP_VS_DBG_PKT(7, af, pp, skb, iph->off, "unhandled fragment"); *verdict = NF_ACCEPT; return 0; } return 1; } /* Check the UDP tunnel and return its header length */ static int ipvs_udp_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, unsigned int offset, __u16 af, const union nf_inet_addr *daddr, __u8 *proto) { struct udphdr _udph, *udph; struct ip_vs_dest *dest; udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (!udph) goto unk; offset += sizeof(struct udphdr); dest = ip_vs_find_tunnel(ipvs, af, daddr, udph->dest); if (!dest) goto unk; if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { struct guehdr _gueh, *gueh; gueh = skb_header_pointer(skb, offset, sizeof(_gueh), &_gueh); if (!gueh) goto unk; if (gueh->control != 0 || gueh->version != 0) goto unk; /* Later we can support also IPPROTO_IPV6 */ if (gueh->proto_ctype != IPPROTO_IPIP) goto unk; *proto = gueh->proto_ctype; return sizeof(struct udphdr) + sizeof(struct guehdr) + (gueh->hlen << 2); } unk: return 0; } /* Check the GRE tunnel and return its header length */ static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, unsigned int offset, __u16 af, const union nf_inet_addr *daddr, __u8 *proto) { struct gre_base_hdr _greh, *greh; struct ip_vs_dest *dest; greh = skb_header_pointer(skb, offset, sizeof(_greh), &_greh); if (!greh) goto unk; dest = ip_vs_find_tunnel(ipvs, af, daddr, 0); if (!dest) goto unk; if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { IP_TUNNEL_DECLARE_FLAGS(flags); __be16 type; /* Only support version 0 and C (csum) */ if ((greh->flags & ~GRE_CSUM) != 0) goto unk; type = greh->protocol; /* Later we can support also IPPROTO_IPV6 */ if (type != htons(ETH_P_IP)) goto unk; *proto = IPPROTO_IPIP; gre_flags_to_tnl_flags(flags, greh->flags); return gre_calc_hlen(flags); } unk: return 0; } /* * Handle ICMP messages in the outside-to-inside direction (incoming). * Find any that might be relevant, check against existing connections, * forward to the right destination host if relevant. * Currently handles error types - unreachable, quench, ttl exceeded. */ static int ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, unsigned int hooknum) { struct iphdr *iph; struct icmphdr _icmph, *ic; struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ struct ip_vs_iphdr ciph; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; unsigned int offset, offset2, ihl, verdict; bool tunnel, new_cp = false; union nf_inet_addr *raddr; char *outer_proto = "IPIP"; *related = 1; /* reassemble IP fragments */ if (ip_is_fragment(ip_hdr(skb))) { if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } iph = ip_hdr(skb); offset = ihl = iph->ihl * 4; ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); if (ic == NULL) return NF_DROP; IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n", ic->type, ntohs(icmp_id(ic)), &iph->saddr, &iph->daddr); /* * Work through seeing if this is for us. * These checks are supposed to be in an order that means easy * things are checked first to speed up processing.... however * this means that some packets will manage to get a long way * down this stack and then be rejected, but that's life. */ if ((ic->type != ICMP_DEST_UNREACH) && (ic->type != ICMP_SOURCE_QUENCH) && (ic->type != ICMP_TIME_EXCEEDED)) { *related = 0; return NF_ACCEPT; } /* Now find the contained IP header */ offset += sizeof(_icmph); cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ raddr = (union nf_inet_addr *)&cih->daddr; /* Special case for errors for IPIP/UDP/GRE tunnel packets */ tunnel = false; if (cih->protocol == IPPROTO_IPIP) { struct ip_vs_dest *dest; if (unlikely(cih->frag_off & htons(IP_OFFSET))) return NF_ACCEPT; /* Error for our IPIP must arrive at LOCAL_IN */ if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) return NF_ACCEPT; dest = ip_vs_find_tunnel(ipvs, AF_INET, raddr, 0); /* Only for known tunnel */ if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP) return NF_ACCEPT; offset += cih->ihl * 4; cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ tunnel = true; } else if ((cih->protocol == IPPROTO_UDP || /* Can be UDP encap */ cih->protocol == IPPROTO_GRE) && /* Can be GRE encap */ /* Error for our tunnel must arrive at LOCAL_IN */ (skb_rtable(skb)->rt_flags & RTCF_LOCAL)) { __u8 iproto; int ulen; /* Non-first fragment has no UDP/GRE header */ if (unlikely(cih->frag_off & htons(IP_OFFSET))) return NF_ACCEPT; offset2 = offset + cih->ihl * 4; if (cih->protocol == IPPROTO_UDP) { ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET, raddr, &iproto); outer_proto = "UDP"; } else { ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET, raddr, &iproto); outer_proto = "GRE"; } if (ulen > 0) { /* Skip IP and UDP/GRE tunnel headers */ offset = offset2 + ulen; /* Now we should be at the original IP header */ cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih && cih->version == 4 && cih->ihl >= 5 && iproto == IPPROTO_IPIP) tunnel = true; else return NF_ACCEPT; } } pd = ip_vs_proto_data_get(ipvs, cih->protocol); if (!pd) return NF_ACCEPT; pp = pd->pp; /* Is the embedded protocol header present? */ if (unlikely(cih->frag_off & htons(IP_OFFSET) && pp->dont_defrag)) return NF_ACCEPT; IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, "Checking incoming ICMP for"); offset2 = offset; ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !tunnel, &ciph); offset = ciph.len; /* The embedded headers contain source and dest in reverse order. * For IPIP/UDP/GRE tunnel this is error for request, not for reply. */ cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, ipvs, AF_INET, skb, &ciph); if (!cp) { int v; if (tunnel || !sysctl_schedule_icmp(ipvs)) return NF_ACCEPT; if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph)) return v; new_cp = true; } verdict = NF_DROP; /* Ensure the checksum is correct */ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { /* Failed checksum! */ IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n", &iph->saddr); goto out; } if (tunnel) { __be32 info = ic->un.gateway; __u8 type = ic->type; __u8 code = ic->code; /* Update the MTU */ if (ic->type == ICMP_DEST_UNREACH && ic->code == ICMP_FRAG_NEEDED) { struct ip_vs_dest *dest = cp->dest; u32 mtu = ntohs(ic->un.frag.mtu); __be16 frag_off = cih->frag_off; /* Strip outer IP and ICMP, go to IPIP/UDP/GRE header */ if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) goto ignore_tunnel; offset2 -= ihl + sizeof(_icmph); skb_reset_network_header(skb); IP_VS_DBG(12, "ICMP for %s %pI4->%pI4: mtu=%u\n", outer_proto, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0); /* Client uses PMTUD? */ if (!(frag_off & htons(IP_DF))) goto ignore_tunnel; /* Prefer the resulting PMTU */ if (dest) { struct ip_vs_dest_dst *dest_dst; dest_dst = rcu_dereference(dest->dest_dst); if (dest_dst) mtu = dst_mtu(dest_dst->dst_cache); } if (mtu > 68 + sizeof(struct iphdr)) mtu -= sizeof(struct iphdr); info = htonl(mtu); } /* Strip outer IP, ICMP and IPIP/UDP/GRE, go to IP header of * original request. */ if (pskb_pull(skb, offset2) == NULL) goto ignore_tunnel; skb_reset_network_header(skb); IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, type, code, ntohl(info)); icmp_send(skb, type, code, info); /* ICMP can be shorter but anyways, account it */ ip_vs_out_stats(cp, skb); ignore_tunnel: consume_skb(skb); verdict = NF_STOLEN; goto out; } /* do the statistics and put it back */ ip_vs_in_stats(cp, skb); if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol || IPPROTO_SCTP == cih->protocol) offset += 2 * sizeof(__u16); verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph); out: if (likely(!new_cp)) __ip_vs_conn_put(cp); else ip_vs_conn_put(cp); return verdict; } #ifdef CONFIG_IP_VS_IPV6 static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, unsigned int hooknum, struct ip_vs_iphdr *iph) { struct icmp6hdr _icmph, *ic; struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; unsigned int offset, verdict; bool new_cp = false; *related = 1; ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph); if (ic == NULL) return NF_DROP; /* * Work through seeing if this is for us. * These checks are supposed to be in an order that means easy * things are checked first to speed up processing.... however * this means that some packets will manage to get a long way * down this stack and then be rejected, but that's life. */ if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { *related = 0; return NF_ACCEPT; } /* Fragment header that is before ICMP header tells us that: * it's not an error message since they can't be fragmented. */ if (iph->flags & IP6_FH_F_FRAG) return NF_DROP; IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n", ic->icmp6_type, ntohs(icmpv6_id(ic)), &iph->saddr, &iph->daddr); offset = iph->len + sizeof(_icmph); if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, offset, true, &ciph)) return NF_ACCEPT; pd = ip_vs_proto_data_get(ipvs, ciph.protocol); if (!pd) return NF_ACCEPT; pp = pd->pp; /* Cannot handle fragmented embedded protocol */ if (ciph.fragoffs) return NF_ACCEPT; IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, "Checking incoming ICMPv6 for"); /* The embedded headers contain source and dest in reverse order * if not from localhost */ cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, ipvs, AF_INET6, skb, &ciph); if (!cp) { int v; if (!sysctl_schedule_icmp(ipvs)) return NF_ACCEPT; if (!ip_vs_try_to_schedule(ipvs, AF_INET6, skb, pd, &v, &cp, &ciph)) return v; new_cp = true; } /* VS/TUN, VS/DR and LOCALNODE just let it go */ if ((hooknum == NF_INET_LOCAL_OUT) && (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) { verdict = NF_ACCEPT; goto out; } /* do the statistics and put it back */ ip_vs_in_stats(cp, skb); /* Need to mangle contained IPv6 header in ICMPv6 packet */ offset = ciph.len; if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol || IPPROTO_SCTP == ciph.protocol) offset += 2 * sizeof(__u16); /* Also mangle ports */ verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum, &ciph); out: if (likely(!new_cp)) __ip_vs_conn_put(cp); else ip_vs_conn_put(cp); return verdict; } #endif /* * Check if it's for virtual services, look it up, * and send it on its way... */ static unsigned int ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct netns_ipvs *ipvs = net_ipvs(state->net); unsigned int hooknum = state->hook; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; int ret, pkts; struct sock *sk; int af = state->pf; /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) return NF_ACCEPT; /* * Big tappo: * - remote client: only PACKET_HOST * - route: used for struct net when skb->dev is unset */ if (unlikely((skb->pkt_type != PACKET_HOST && hooknum != NF_INET_LOCAL_OUT) || !skb_dst(skb))) { ip_vs_fill_iph_skb(af, skb, false, &iph); IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" " ignored in hook %u\n", skb->pkt_type, iph.protocol, IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); return NF_ACCEPT; } /* ipvs enabled in this netns ? */ if (unlikely(sysctl_backup_only(ipvs))) return NF_ACCEPT; ip_vs_fill_iph_skb(af, skb, false, &iph); /* Bad... Do not break raw sockets */ sk = skb_to_full_sk(skb); if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && af == AF_INET)) { if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk)) return NF_ACCEPT; } #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { int related; int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related, hooknum, &iph); if (related) return verdict; } } else #endif if (unlikely(iph.protocol == IPPROTO_ICMP)) { int related; int verdict = ip_vs_in_icmp(ipvs, skb, &related, hooknum); if (related) return verdict; } /* Protocol supported? */ pd = ip_vs_proto_data_get(ipvs, iph.protocol); if (unlikely(!pd)) { /* The only way we'll see this packet again is if it's * encapsulated, so mark it with ipvs_property=1 so we * skip it if we're ignoring tunneled packets */ if (sysctl_ignore_tunneled(ipvs)) skb->ipvs_property = 1; return NF_ACCEPT; } pp = pd->pp; /* * Check if the packet belongs to an existing connection entry */ cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, ipvs, af, skb, &iph); if (!iph.fragoffs && is_new_conn(skb, &iph) && cp) { int conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); bool old_ct = false, resched = false; if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && unlikely(!atomic_read(&cp->dest->weight))) { resched = true; old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); } else if (conn_reuse_mode && is_new_conn_expected(cp, conn_reuse_mode)) { old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); if (!atomic_read(&cp->n_control)) { resched = true; } else { /* Do not reschedule controlling connection * that uses conntrack while it is still * referenced by controlled connection(s). */ resched = !old_ct; } } if (resched) { if (!old_ct) cp->flags &= ~IP_VS_CONN_F_NFCT; if (!atomic_read(&cp->n_control)) ip_vs_conn_expire_now(cp); __ip_vs_conn_put(cp); if (old_ct) return NF_DROP; cp = NULL; } } /* Check the server status */ if (cp && cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ if (sysctl_expire_nodest_conn(ipvs)) { bool old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); if (!old_ct) cp->flags &= ~IP_VS_CONN_F_NFCT; ip_vs_conn_expire_now(cp); __ip_vs_conn_put(cp); if (old_ct) return NF_DROP; cp = NULL; } else { __ip_vs_conn_put(cp); return NF_DROP; } } if (unlikely(!cp)) { int v; if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph)) return v; } IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet"); ip_vs_in_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); if (cp->packet_xmit) ret = cp->packet_xmit(skb, cp, pp, &iph); /* do not touch skb anymore */ else { IP_VS_DBG_RL("warning: packet_xmit is null"); ret = NF_ACCEPT; } /* Increase its packet counter and check if it is needed * to be synchronized * * Sync connection if it is about to close to * encorage the standby servers to update the connections timeout * * For ONE_PKT let ip_vs_sync_conn() do the filter work. */ if (cp->flags & IP_VS_CONN_F_ONE_PACKET) pkts = sysctl_sync_threshold(ipvs); else pkts = atomic_inc_return(&cp->in_pkts); if (ipvs->sync_state & IP_VS_STATE_MASTER) ip_vs_sync_conn(ipvs, cp, pkts); else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) /* increment is done inside ip_vs_sync_conn too */ atomic_inc(&cp->control->in_pkts); ip_vs_conn_put(cp); return ret; } /* * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP * related packets destined for 0.0.0.0/0. * When fwmark-based virtual service is used, such as transparent * cache cluster, TCP packets can be marked and routed to ip_vs_in, * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain * and send them to ip_vs_in_icmp. */ static unsigned int ip_vs_forward_icmp(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct netns_ipvs *ipvs = net_ipvs(state->net); int r; /* ipvs enabled in this netns ? */ if (unlikely(sysctl_backup_only(ipvs))) return NF_ACCEPT; if (state->pf == NFPROTO_IPV4) { if (ip_hdr(skb)->protocol != IPPROTO_ICMP) return NF_ACCEPT; #ifdef CONFIG_IP_VS_IPV6 } else { struct ip_vs_iphdr iphdr; ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); if (iphdr.protocol != IPPROTO_ICMPV6) return NF_ACCEPT; return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); #endif } return ip_vs_in_icmp(ipvs, skb, &r, state->hook); } static const struct nf_hook_ops ip_vs_ops4[] = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 2, }, /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. */ { .hook = ip_vs_in_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 1, }, /* Before ip_vs_in, change source only for VS/NAT */ { .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 1, }, /* After mangle, schedule and forward local requests */ { .hook = ip_vs_in_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 2, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook = ip_vs_forward_icmp, .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = 99, }, /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = 100, }, }; #ifdef CONFIG_IP_VS_IPV6 static const struct nf_hook_ops ip_vs_ops6[] = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 2, }, /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. */ { .hook = ip_vs_in_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 1, }, /* Before ip_vs_in, change source only for VS/NAT */ { .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 1, }, /* After mangle, schedule and forward local requests */ { .hook = ip_vs_in_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 2, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook = ip_vs_forward_icmp, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 99, }, /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 100, }, }; #endif int ip_vs_register_hooks(struct netns_ipvs *ipvs, unsigned int af) { const struct nf_hook_ops *ops; unsigned int count; unsigned int afmask; int ret = 0; if (af == AF_INET6) { #ifdef CONFIG_IP_VS_IPV6 ops = ip_vs_ops6; count = ARRAY_SIZE(ip_vs_ops6); afmask = 2; #else return -EINVAL; #endif } else { ops = ip_vs_ops4; count = ARRAY_SIZE(ip_vs_ops4); afmask = 1; } if (!(ipvs->hooks_afmask & afmask)) { ret = nf_register_net_hooks(ipvs->net, ops, count); if (ret >= 0) ipvs->hooks_afmask |= afmask; } return ret; } void ip_vs_unregister_hooks(struct netns_ipvs *ipvs, unsigned int af) { const struct nf_hook_ops *ops; unsigned int count; unsigned int afmask; if (af == AF_INET6) { #ifdef CONFIG_IP_VS_IPV6 ops = ip_vs_ops6; count = ARRAY_SIZE(ip_vs_ops6); afmask = 2; #else return; #endif } else { ops = ip_vs_ops4; count = ARRAY_SIZE(ip_vs_ops4); afmask = 1; } if (ipvs->hooks_afmask & afmask) { nf_unregister_net_hooks(ipvs->net, ops, count); ipvs->hooks_afmask &= ~afmask; } } /* * Initialize IP Virtual Server netns mem. */ static int __net_init __ip_vs_init(struct net *net) { struct netns_ipvs *ipvs; ipvs = net_generic(net, ip_vs_net_id); if (ipvs == NULL) return -ENOMEM; /* Hold the beast until a service is registered */ WRITE_ONCE(ipvs->enable, 0); ipvs->net = net; /* Counters used for creating unique names */ ipvs->gen = atomic_read(&ipvs_netns_cnt); atomic_inc(&ipvs_netns_cnt); net->ipvs = ipvs; if (ip_vs_estimator_net_init(ipvs) < 0) goto estimator_fail; if (ip_vs_control_net_init(ipvs) < 0) goto control_fail; if (ip_vs_protocol_net_init(ipvs) < 0) goto protocol_fail; if (ip_vs_app_net_init(ipvs) < 0) goto app_fail; if (ip_vs_conn_net_init(ipvs) < 0) goto conn_fail; if (ip_vs_sync_net_init(ipvs) < 0) goto sync_fail; return 0; /* * Error handling */ sync_fail: ip_vs_conn_net_cleanup(ipvs); conn_fail: ip_vs_app_net_cleanup(ipvs); app_fail: ip_vs_protocol_net_cleanup(ipvs); protocol_fail: ip_vs_control_net_cleanup(ipvs); control_fail: ip_vs_estimator_net_cleanup(ipvs); estimator_fail: net->ipvs = NULL; return -ENOMEM; } static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list) { struct netns_ipvs *ipvs; struct net *net; ip_vs_service_nets_cleanup(net_list); /* ip_vs_flush() with locks */ list_for_each_entry(net, net_list, exit_list) { ipvs = net_ipvs(net); ip_vs_conn_net_cleanup(ipvs); ip_vs_app_net_cleanup(ipvs); ip_vs_protocol_net_cleanup(ipvs); ip_vs_control_net_cleanup(ipvs); ip_vs_estimator_net_cleanup(ipvs); IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); net->ipvs = NULL; } } static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) { struct netns_ipvs *ipvs; struct net *net; list_for_each_entry(net, net_list, exit_list) { ipvs = net_ipvs(net); ip_vs_unregister_hooks(ipvs, AF_INET); ip_vs_unregister_hooks(ipvs, AF_INET6); WRITE_ONCE(ipvs->enable, 0); /* Disable packet reception */ smp_wmb(); ip_vs_sync_net_cleanup(ipvs); } } static struct pernet_operations ipvs_core_ops = { .init = __ip_vs_init, .exit_batch = __ip_vs_cleanup_batch, .id = &ip_vs_net_id, .size = sizeof(struct netns_ipvs), }; static struct pernet_operations ipvs_core_dev_ops = { .exit_batch = __ip_vs_dev_cleanup_batch, }; /* * Initialize IP Virtual Server */ static int __init ip_vs_init(void) { int ret; ret = ip_vs_control_init(); if (ret < 0) { pr_err("can't setup control.\n"); goto exit; } ip_vs_protocol_init(); ret = ip_vs_conn_init(); if (ret < 0) { pr_err("can't setup connection table.\n"); goto cleanup_protocol; } ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ if (ret < 0) goto cleanup_conn; ret = register_pernet_device(&ipvs_core_dev_ops); if (ret < 0) goto cleanup_sub; ret = ip_vs_register_nl_ioctl(); if (ret < 0) { pr_err("can't register netlink/ioctl.\n"); goto cleanup_dev; } pr_info("ipvs loaded.\n"); return ret; cleanup_dev: unregister_pernet_device(&ipvs_core_dev_ops); cleanup_sub: unregister_pernet_subsys(&ipvs_core_ops); cleanup_conn: ip_vs_conn_cleanup(); cleanup_protocol: ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); exit: return ret; } static void __exit ip_vs_cleanup(void) { ip_vs_unregister_nl_ioctl(); unregister_pernet_device(&ipvs_core_dev_ops); unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ ip_vs_conn_cleanup(); ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); /* common rcu_barrier() used by: * - ip_vs_control_cleanup() */ rcu_barrier(); pr_info("ipvs unloaded.\n"); } module_init(ip_vs_init); module_exit(ip_vs_cleanup); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IP Virtual Server");
48 48 25 19 9 36 36 33 3 19 36 17 26 6 6 36 3 33 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2014 Nicira, Inc. */ #include <linux/etherdevice.h> #include <linux/if.h> #include <linux/if_vlan.h> #include <linux/jhash.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/rtnetlink.h> #include <linux/compat.h> #include <net/net_namespace.h> #include <linux/module.h> #include "datapath.h" #include "vport.h" #include "vport-internal_dev.h" static LIST_HEAD(vport_ops_list); /* Protected by RCU read lock for reading, ovs_mutex for writing. */ static struct hlist_head *dev_table; #define VPORT_HASH_BUCKETS 1024 /** * ovs_vport_init - initialize vport subsystem * * Called at module load time to initialize the vport subsystem. */ int ovs_vport_init(void) { dev_table = kzalloc_objs(struct hlist_head, VPORT_HASH_BUCKETS); if (!dev_table) return -ENOMEM; return 0; } /** * ovs_vport_exit - shutdown vport subsystem * * Called at module exit time to shutdown the vport subsystem. */ void ovs_vport_exit(void) { kfree(dev_table); } static struct hlist_head *hash_bucket(const struct net *net, const char *name) { unsigned int hash = jhash(name, strlen(name), (unsigned long) net); return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)]; } int __ovs_vport_ops_register(struct vport_ops *ops) { int err = -EEXIST; struct vport_ops *o; ovs_lock(); list_for_each_entry(o, &vport_ops_list, list) if (ops->type == o->type) goto errout; list_add_tail(&ops->list, &vport_ops_list); err = 0; errout: ovs_unlock(); return err; } EXPORT_SYMBOL_GPL(__ovs_vport_ops_register); void ovs_vport_ops_unregister(struct vport_ops *ops) { ovs_lock(); list_del(&ops->list); ovs_unlock(); } EXPORT_SYMBOL_GPL(ovs_vport_ops_unregister); /** * ovs_vport_locate - find a port that has already been created * * @net: network namespace * @name: name of port to find * * Must be called with ovs or RCU read lock. */ struct vport *ovs_vport_locate(const struct net *net, const char *name) { struct hlist_head *bucket = hash_bucket(net, name); struct vport *vport; hlist_for_each_entry_rcu(vport, bucket, hash_node, lockdep_ovsl_is_held()) if (!strcmp(name, ovs_vport_name(vport)) && net_eq(ovs_dp_get_net(vport->dp), net)) return vport; return NULL; } /** * ovs_vport_alloc - allocate and initialize new vport * * @priv_size: Size of private data area to allocate. * @ops: vport device ops * @parms: information about new vport. * * Allocate and initialize a new vport defined by @ops. The vport will contain * a private data area of size @priv_size that can be accessed using * vport_priv(). Some parameters of the vport will be initialized from @parms. * @vports that are no longer needed should be released with * vport_free(). */ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, const struct vport_parms *parms) { struct vport *vport; size_t alloc_size; int err; alloc_size = sizeof(struct vport); if (priv_size) { alloc_size = ALIGN(alloc_size, VPORT_ALIGN); alloc_size += priv_size; } vport = kzalloc(alloc_size, GFP_KERNEL); if (!vport) return ERR_PTR(-ENOMEM); vport->upcall_stats = netdev_alloc_pcpu_stats(struct vport_upcall_stats_percpu); if (!vport->upcall_stats) { err = -ENOMEM; goto err_kfree_vport; } vport->dp = parms->dp; vport->port_no = parms->port_no; vport->ops = ops; INIT_HLIST_NODE(&vport->dp_hash_node); if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids)) { err = -EINVAL; goto err_free_percpu; } return vport; err_free_percpu: free_percpu(vport->upcall_stats); err_kfree_vport: kfree(vport); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(ovs_vport_alloc); /** * ovs_vport_free - uninitialize and free vport * * @vport: vport to free * * Frees a vport allocated with vport_alloc() when it is no longer needed. * * The caller must ensure that an RCU grace period has passed since the last * time @vport was in a datapath. */ void ovs_vport_free(struct vport *vport) { /* vport is freed from RCU callback or error path, Therefore * it is safe to use raw dereference. */ kfree(rcu_dereference_raw(vport->upcall_portids)); free_percpu(vport->upcall_stats); kfree(vport); } EXPORT_SYMBOL_GPL(ovs_vport_free); static struct vport_ops *ovs_vport_lookup(const struct vport_parms *parms) { struct vport_ops *ops; list_for_each_entry(ops, &vport_ops_list, list) if (ops->type == parms->type) return ops; return NULL; } /** * ovs_vport_add - add vport device (for kernel callers) * * @parms: Information about new vport. * * Creates a new vport with the specified configuration (which is dependent on * device type). ovs_mutex must be held. */ struct vport *ovs_vport_add(const struct vport_parms *parms) { struct vport_ops *ops; struct vport *vport; ops = ovs_vport_lookup(parms); if (ops) { struct hlist_head *bucket; if (!try_module_get(ops->owner)) return ERR_PTR(-EAFNOSUPPORT); vport = ops->create(parms); if (IS_ERR(vport)) { module_put(ops->owner); return vport; } bucket = hash_bucket(ovs_dp_get_net(vport->dp), ovs_vport_name(vport)); hlist_add_head_rcu(&vport->hash_node, bucket); return vport; } /* Unlock to attempt module load and return -EAGAIN if load * was successful as we need to restart the port addition * workflow. */ ovs_unlock(); request_module("vport-type-%d", parms->type); ovs_lock(); if (!ovs_vport_lookup(parms)) return ERR_PTR(-EAFNOSUPPORT); else return ERR_PTR(-EAGAIN); } /** * ovs_vport_set_options - modify existing vport device (for kernel callers) * * @vport: vport to modify. * @options: New configuration. * * Modifies an existing device with the specified configuration (which is * dependent on device type). ovs_mutex must be held. */ int ovs_vport_set_options(struct vport *vport, struct nlattr *options) { if (!vport->ops->set_options) return -EOPNOTSUPP; return vport->ops->set_options(vport, options); } /** * ovs_vport_del - delete existing vport device * * @vport: vport to delete. * * Detaches @vport from its datapath and destroys it. ovs_mutex must * be held. */ void ovs_vport_del(struct vport *vport) { hlist_del_rcu(&vport->hash_node); module_put(vport->ops->owner); vport->ops->destroy(vport); } /** * ovs_vport_get_stats - retrieve device stats * * @vport: vport from which to retrieve the stats * @stats: location to store stats * * Retrieves transmit, receive, and error stats for the given device. * * Must be called with ovs_mutex or rcu_read_lock. */ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) { const struct rtnl_link_stats64 *dev_stats; struct rtnl_link_stats64 temp; dev_stats = dev_get_stats(vport->dev, &temp); stats->rx_errors = dev_stats->rx_errors; stats->tx_errors = dev_stats->tx_errors; stats->tx_dropped = dev_stats->tx_dropped; stats->rx_dropped = dev_stats->rx_dropped; stats->rx_bytes = dev_stats->rx_bytes; stats->rx_packets = dev_stats->rx_packets; stats->tx_bytes = dev_stats->tx_bytes; stats->tx_packets = dev_stats->tx_packets; } /** * ovs_vport_get_upcall_stats - retrieve upcall stats * * @vport: vport from which to retrieve the stats. * @skb: sk_buff where upcall stats should be appended. * * Retrieves upcall stats for the given device. * * Must be called with ovs_mutex or rcu_read_lock. */ int ovs_vport_get_upcall_stats(struct vport *vport, struct sk_buff *skb) { u64 tx_success = 0, tx_fail = 0; struct nlattr *nla; int i; for_each_possible_cpu(i) { const struct vport_upcall_stats_percpu *stats; u64 n_success, n_fail; unsigned int start; stats = per_cpu_ptr(vport->upcall_stats, i); do { start = u64_stats_fetch_begin(&stats->syncp); n_success = u64_stats_read(&stats->n_success); n_fail = u64_stats_read(&stats->n_fail); } while (u64_stats_fetch_retry(&stats->syncp, start)); tx_success += n_success; tx_fail += n_fail; } nla = nla_nest_start_noflag(skb, OVS_VPORT_ATTR_UPCALL_STATS); if (!nla) return -EMSGSIZE; if (nla_put_u64_64bit(skb, OVS_VPORT_UPCALL_ATTR_SUCCESS, tx_success, OVS_VPORT_ATTR_PAD)) { nla_nest_cancel(skb, nla); return -EMSGSIZE; } if (nla_put_u64_64bit(skb, OVS_VPORT_UPCALL_ATTR_FAIL, tx_fail, OVS_VPORT_ATTR_PAD)) { nla_nest_cancel(skb, nla); return -EMSGSIZE; } nla_nest_end(skb, nla); return 0; } /** * ovs_vport_get_options - retrieve device options * * @vport: vport from which to retrieve the options. * @skb: sk_buff where options should be appended. * * Retrieves the configuration of the given device, appending an * %OVS_VPORT_ATTR_OPTIONS attribute that in turn contains nested * vport-specific attributes to @skb. * * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room, or another * negative error code if a real error occurred. If an error occurs, @skb is * left unmodified. * * Must be called with ovs_mutex or rcu_read_lock. */ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) { struct nlattr *nla; int err; if (!vport->ops->get_options) return 0; nla = nla_nest_start_noflag(skb, OVS_VPORT_ATTR_OPTIONS); if (!nla) return -EMSGSIZE; err = vport->ops->get_options(vport, skb); if (err) { nla_nest_cancel(skb, nla); return err; } nla_nest_end(skb, nla); return 0; } /** * ovs_vport_set_upcall_portids - set upcall portids of @vport. * * @vport: vport to modify. * @ids: new configuration, an array of port ids. * * Sets the vport's upcall_portids to @ids. * * Returns 0 if successful, -EINVAL if @ids is zero length or cannot be parsed * as an array of U32. * * Must be called with ovs_mutex. */ int ovs_vport_set_upcall_portids(struct vport *vport, const struct nlattr *ids) { struct vport_portids *old, *vport_portids; if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) return -EINVAL; old = ovsl_dereference(vport->upcall_portids); vport_portids = kmalloc(sizeof(*vport_portids) + nla_len(ids), GFP_KERNEL); if (!vport_portids) return -ENOMEM; vport_portids->n_ids = nla_len(ids) / sizeof(u32); vport_portids->rn_ids = reciprocal_value(vport_portids->n_ids); nla_memcpy(vport_portids->ids, ids, nla_len(ids)); rcu_assign_pointer(vport->upcall_portids, vport_portids); if (old) kfree_rcu(old, rcu); return 0; } /** * ovs_vport_get_upcall_portids - get the upcall_portids of @vport. * * @vport: vport from which to retrieve the portids. * @skb: sk_buff where portids should be appended. * * Retrieves the configuration of the given vport, appending the * %OVS_VPORT_ATTR_UPCALL_PID attribute which is the array of upcall * portids to @skb. * * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room. * If an error occurs, @skb is left unmodified. Must be called with * ovs_mutex or rcu_read_lock. */ int ovs_vport_get_upcall_portids(const struct vport *vport, struct sk_buff *skb) { struct vport_portids *ids; ids = rcu_dereference_ovsl(vport->upcall_portids); if (vport->dp->user_features & OVS_DP_F_VPORT_PIDS) return nla_put(skb, OVS_VPORT_ATTR_UPCALL_PID, ids->n_ids * sizeof(u32), (void *)ids->ids); else return nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, ids->ids[0]); } /** * ovs_vport_find_upcall_portid - find the upcall portid to send upcall. * * @vport: vport from which the missed packet is received. * @skb: skb that the missed packet was received. * * Uses the skb_get_hash() to select the upcall portid to send the * upcall. * * Returns the portid of the target socket. Must be called with rcu_read_lock. */ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) { struct vport_portids *ids; u32 ids_index; u32 hash; ids = rcu_dereference(vport->upcall_portids); /* If there is only one portid, select it in the fast-path. */ if (ids->n_ids == 1) return ids->ids[0]; hash = skb_get_hash(skb); ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids); return ids->ids[ids_index]; } /** * ovs_vport_receive - pass up received packet to the datapath for processing * * @vport: vport that received the packet * @skb: skb that was received * @tun_info: tunnel (if any) that carried packet * * Must be called with rcu_read_lock. The packet cannot be shared and * skb->data should point to the Ethernet header. */ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, const struct ip_tunnel_info *tun_info) { struct sw_flow_key key; int error; OVS_CB(skb)->input_vport = vport; OVS_CB(skb)->mru = 0; OVS_CB(skb)->cutlen = 0; OVS_CB(skb)->probability = 0; OVS_CB(skb)->upcall_pid = 0; if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) { u32 mark; mark = skb->mark; skb_scrub_packet(skb, true); skb->mark = mark; tun_info = NULL; } /* Extract flow from 'skb' into 'key'. */ error = ovs_flow_key_extract(tun_info, skb, &key); if (unlikely(error)) { kfree_skb(skb); return error; } ovs_dp_process_packet(skb, &key); return 0; } static int packet_length(const struct sk_buff *skb, struct net_device *dev) { int length = skb->len - dev->hard_header_len; if (!skb_vlan_tag_present(skb) && eth_type_vlan(skb->protocol)) length -= VLAN_HLEN; /* Don't subtract for multiple VLAN tags. Most (all?) drivers allow * (ETH_LEN + VLAN_HLEN) in addition to the mtu value, but almost none * account for 802.1ad. e.g. is_skb_forwardable(). */ return length > 0 ? length : 0; } void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto) { int mtu = vport->dev->mtu; switch (vport->dev->type) { case ARPHRD_NONE: if (mac_proto == MAC_PROTO_ETHERNET) { skb_reset_network_header(skb); skb_reset_mac_len(skb); skb->protocol = htons(ETH_P_TEB); } else if (mac_proto != MAC_PROTO_NONE) { WARN_ON_ONCE(1); goto drop; } break; case ARPHRD_ETHER: if (mac_proto != MAC_PROTO_ETHERNET) goto drop; break; default: goto drop; } if (unlikely(packet_length(skb, vport->dev) > mtu && !skb_is_gso(skb))) { vport->dev->stats.tx_errors++; if (vport->dev->flags & IFF_UP) net_warn_ratelimited("%s: dropped over-mtu packet: " "%d > %d\n", vport->dev->name, packet_length(skb, vport->dev), mtu); goto drop; } skb->dev = vport->dev; skb_clear_tstamp(skb); vport->ops->send(skb); return; drop: kfree_skb(skb); }
167 287 136 231 232 5 63 126 13 18 72 38 19 18 2 18 1 4 116 2 113 116 65 84 23 56 1 8 8 27 6 21 8 3 8 7 48 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 // SPDX-License-Identifier: GPL-2.0-only /* * IPv6 library code, needed by static components when full IPv6 support is * not configured or static. */ #include <linux/export.h> #include <net/ipv6.h> /* * find out if nexthdr is a well-known extension header or a protocol */ bool ipv6_ext_hdr(u8 nexthdr) { /* * find out if nexthdr is an extension header or a protocol */ return (nexthdr == NEXTHDR_HOP) || (nexthdr == NEXTHDR_ROUTING) || (nexthdr == NEXTHDR_FRAGMENT) || (nexthdr == NEXTHDR_AUTH) || (nexthdr == NEXTHDR_NONE) || (nexthdr == NEXTHDR_DEST); } EXPORT_SYMBOL(ipv6_ext_hdr); /* * Skip any extension headers. This is used by the ICMP module. * * Note that strictly speaking this conflicts with RFC 2460 4.0: * ...The contents and semantics of each extension header determine whether * or not to proceed to the next header. Therefore, extension headers must * be processed strictly in the order they appear in the packet; a * receiver must not, for example, scan through a packet looking for a * particular kind of extension header and process that header prior to * processing all preceding ones. * * We do exactly this. This is a protocol bug. We can't decide after a * seeing an unknown discard-with-error flavour TLV option if it's a * ICMP error message or not (errors should never be send in reply to * ICMP error messages). * * But I see no other way to do this. This might need to be reexamined * when Linux implements ESP (and maybe AUTH) headers. * --AK * * This function parses (probably truncated) exthdr set "hdr". * "nexthdrp" initially points to some place, * where type of the first header can be found. * * It skips all well-known exthdrs, and returns pointer to the start * of unparsable area i.e. the first header with unknown type. * If it is not NULL *nexthdr is updated by type/protocol of this header. * * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL. * - it may return pointer pointing beyond end of packet, * if the last recognized header is truncated in the middle. * - if packet is truncated, so that all parsed headers are skipped, * it returns NULL. * - First fragment header is skipped, not-first ones * are considered as unparsable. * - Reports the offset field of the final fragment header so it is * possible to tell whether this is a first fragment, later fragment, * or not fragmented. * - ESP is unparsable for now and considered like * normal payload protocol. * - Note also special handling of AUTH header. Thanks to IPsec wizards. * * --ANK (980726) */ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, __be16 *frag_offp) { u8 nexthdr = *nexthdrp; *frag_offp = 0; while (ipv6_ext_hdr(nexthdr)) { struct ipv6_opt_hdr _hdr, *hp; int hdrlen; if (nexthdr == NEXTHDR_NONE) return -1; hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -1; if (nexthdr == NEXTHDR_FRAGMENT) { __be16 _frag_off, *fp; fp = skb_header_pointer(skb, start+offsetof(struct frag_hdr, frag_off), sizeof(_frag_off), &_frag_off); if (!fp) return -1; *frag_offp = *fp; if (ntohs(*frag_offp) & ~0x7) break; hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) hdrlen = ipv6_authlen(hp); else hdrlen = ipv6_optlen(hp); nexthdr = hp->nexthdr; start += hdrlen; } *nexthdrp = nexthdr; return start; } EXPORT_SYMBOL(ipv6_skip_exthdr); int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type) { const unsigned char *nh = skb_network_header(skb); int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); struct ipv6_opt_hdr *hdr; int len; if (offset + 2 > packet_len) goto bad; hdr = (struct ipv6_opt_hdr *)(nh + offset); len = ((hdr->hdrlen + 1) << 3); if (offset + len > packet_len) goto bad; offset += 2; len -= 2; while (len > 0) { int opttype = nh[offset]; int optlen; if (opttype == type) return offset; switch (opttype) { case IPV6_TLV_PAD1: optlen = 1; break; default: if (len < 2) goto bad; optlen = nh[offset + 1] + 2; if (optlen > len) goto bad; break; } offset += optlen; len -= optlen; } /* not_found */ bad: return -1; } EXPORT_SYMBOL_GPL(ipv6_find_tlv); /* * find the offset to specified header or the protocol number of last header * if target < 0. "last header" is transport protocol header, ESP, or * "No next header". * * Note that *offset is used as input/output parameter, and if it is not zero, * then it must be a valid offset to an inner IPv6 header. This can be used * to explore inner IPv6 header, eg. ICMPv6 error messages. * * If target header is found, its offset is set in *offset and return protocol * number. Otherwise, return -1. * * If the first fragment doesn't contain the final protocol header or * NEXTHDR_NONE it is considered invalid. * * Note that non-1st fragment is special case that "the protocol number * of last header" is "next header" field in Fragment header. In this case, * *offset is meaningless and fragment offset is stored in *fragoff if fragoff * isn't NULL. * * if flags is not NULL and it's a fragment, then the frag flag * IP6_FH_F_FRAG will be set. If it's an AH header, the * IP6_FH_F_AUTH flag is set and target < 0, then this function will * stop at the AH header. If IP6_FH_F_SKIP_RH flag was passed, then this * function will skip all those routing headers, where segements_left was 0. */ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target, unsigned short *fragoff, int *flags) { unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; bool found; if (fragoff) *fragoff = 0; if (*offset) { struct ipv6hdr _ip6, *ip6; ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6); if (!ip6 || (ip6->version != 6)) return -EBADMSG; start = *offset + sizeof(struct ipv6hdr); nexthdr = ip6->nexthdr; } do { struct ipv6_opt_hdr _hdr, *hp; unsigned int hdrlen; found = (nexthdr == target); if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) { if (target < 0 || found) break; return -ENOENT; } hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -EBADMSG; if (nexthdr == NEXTHDR_ROUTING) { struct ipv6_rt_hdr _rh, *rh; rh = skb_header_pointer(skb, start, sizeof(_rh), &_rh); if (!rh) return -EBADMSG; if (flags && (*flags & IP6_FH_F_SKIP_RH) && rh->segments_left == 0) found = false; } if (nexthdr == NEXTHDR_FRAGMENT) { unsigned short _frag_off; __be16 *fp; if (flags) /* Indicate that this is a fragment */ *flags |= IP6_FH_F_FRAG; fp = skb_header_pointer(skb, start+offsetof(struct frag_hdr, frag_off), sizeof(_frag_off), &_frag_off); if (!fp) return -EBADMSG; _frag_off = ntohs(*fp) & ~0x7; if (_frag_off) { if (target < 0 && ((!ipv6_ext_hdr(hp->nexthdr)) || hp->nexthdr == NEXTHDR_NONE)) { if (fragoff) *fragoff = _frag_off; return hp->nexthdr; } if (!found) return -ENOENT; if (fragoff) *fragoff = _frag_off; break; } hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0)) break; hdrlen = ipv6_authlen(hp); } else hdrlen = ipv6_optlen(hp); if (!found) { nexthdr = hp->nexthdr; start += hdrlen; } } while (!found); *offset = start; return nexthdr; } EXPORT_SYMBOL(ipv6_find_hdr);
13244 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/net/dsa.h - Driver for Distributed Switch Architecture switch chips * Copyright (c) 2008-2009 Marvell Semiconductor */ #ifndef __LINUX_NET_DSA_H #define __LINUX_NET_DSA_H #include <linux/if.h> #include <linux/if_ether.h> #include <linux/list.h> #include <linux/notifier.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/of.h> #include <linux/ethtool.h> #include <linux/net_tstamp.h> #include <linux/phy.h> #include <linux/platform_data/dsa.h> #include <linux/phylink.h> #include <net/devlink.h> #include <net/switchdev.h> struct dsa_8021q_context; struct tc_action; #define DSA_TAG_PROTO_NONE_VALUE 0 #define DSA_TAG_PROTO_BRCM_VALUE 1 #define DSA_TAG_PROTO_BRCM_PREPEND_VALUE 2 #define DSA_TAG_PROTO_DSA_VALUE 3 #define DSA_TAG_PROTO_EDSA_VALUE 4 #define DSA_TAG_PROTO_GSWIP_VALUE 5 #define DSA_TAG_PROTO_KSZ9477_VALUE 6 #define DSA_TAG_PROTO_KSZ9893_VALUE 7 #define DSA_TAG_PROTO_LAN9303_VALUE 8 #define DSA_TAG_PROTO_MTK_VALUE 9 #define DSA_TAG_PROTO_QCA_VALUE 10 #define DSA_TAG_PROTO_TRAILER_VALUE 11 #define DSA_TAG_PROTO_8021Q_VALUE 12 #define DSA_TAG_PROTO_SJA1105_VALUE 13 #define DSA_TAG_PROTO_KSZ8795_VALUE 14 #define DSA_TAG_PROTO_OCELOT_VALUE 15 #define DSA_TAG_PROTO_AR9331_VALUE 16 #define DSA_TAG_PROTO_RTL4_A_VALUE 17 #define DSA_TAG_PROTO_HELLCREEK_VALUE 18 #define DSA_TAG_PROTO_XRS700X_VALUE 19 #define DSA_TAG_PROTO_OCELOT_8021Q_VALUE 20 #define DSA_TAG_PROTO_SEVILLE_VALUE 21 #define DSA_TAG_PROTO_BRCM_LEGACY_VALUE 22 #define DSA_TAG_PROTO_SJA1110_VALUE 23 #define DSA_TAG_PROTO_RTL8_4_VALUE 24 #define DSA_TAG_PROTO_RTL8_4T_VALUE 25 #define DSA_TAG_PROTO_RZN1_A5PSW_VALUE 26 #define DSA_TAG_PROTO_LAN937X_VALUE 27 #define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE 28 #define DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE 29 #define DSA_TAG_PROTO_YT921X_VALUE 30 #define DSA_TAG_PROTO_MXL_GSW1XX_VALUE 31 #define DSA_TAG_PROTO_MXL862_VALUE 32 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, DSA_TAG_PROTO_BRCM = DSA_TAG_PROTO_BRCM_VALUE, DSA_TAG_PROTO_BRCM_LEGACY = DSA_TAG_PROTO_BRCM_LEGACY_VALUE, DSA_TAG_PROTO_BRCM_LEGACY_FCS = DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE, DSA_TAG_PROTO_BRCM_PREPEND = DSA_TAG_PROTO_BRCM_PREPEND_VALUE, DSA_TAG_PROTO_DSA = DSA_TAG_PROTO_DSA_VALUE, DSA_TAG_PROTO_EDSA = DSA_TAG_PROTO_EDSA_VALUE, DSA_TAG_PROTO_GSWIP = DSA_TAG_PROTO_GSWIP_VALUE, DSA_TAG_PROTO_KSZ9477 = DSA_TAG_PROTO_KSZ9477_VALUE, DSA_TAG_PROTO_KSZ9893 = DSA_TAG_PROTO_KSZ9893_VALUE, DSA_TAG_PROTO_LAN9303 = DSA_TAG_PROTO_LAN9303_VALUE, DSA_TAG_PROTO_MTK = DSA_TAG_PROTO_MTK_VALUE, DSA_TAG_PROTO_QCA = DSA_TAG_PROTO_QCA_VALUE, DSA_TAG_PROTO_TRAILER = DSA_TAG_PROTO_TRAILER_VALUE, DSA_TAG_PROTO_8021Q = DSA_TAG_PROTO_8021Q_VALUE, DSA_TAG_PROTO_SJA1105 = DSA_TAG_PROTO_SJA1105_VALUE, DSA_TAG_PROTO_KSZ8795 = DSA_TAG_PROTO_KSZ8795_VALUE, DSA_TAG_PROTO_OCELOT = DSA_TAG_PROTO_OCELOT_VALUE, DSA_TAG_PROTO_AR9331 = DSA_TAG_PROTO_AR9331_VALUE, DSA_TAG_PROTO_RTL4_A = DSA_TAG_PROTO_RTL4_A_VALUE, DSA_TAG_PROTO_HELLCREEK = DSA_TAG_PROTO_HELLCREEK_VALUE, DSA_TAG_PROTO_XRS700X = DSA_TAG_PROTO_XRS700X_VALUE, DSA_TAG_PROTO_OCELOT_8021Q = DSA_TAG_PROTO_OCELOT_8021Q_VALUE, DSA_TAG_PROTO_SEVILLE = DSA_TAG_PROTO_SEVILLE_VALUE, DSA_TAG_PROTO_SJA1110 = DSA_TAG_PROTO_SJA1110_VALUE, DSA_TAG_PROTO_RTL8_4 = DSA_TAG_PROTO_RTL8_4_VALUE, DSA_TAG_PROTO_RTL8_4T = DSA_TAG_PROTO_RTL8_4T_VALUE, DSA_TAG_PROTO_RZN1_A5PSW = DSA_TAG_PROTO_RZN1_A5PSW_VALUE, DSA_TAG_PROTO_LAN937X = DSA_TAG_PROTO_LAN937X_VALUE, DSA_TAG_PROTO_VSC73XX_8021Q = DSA_TAG_PROTO_VSC73XX_8021Q_VALUE, DSA_TAG_PROTO_YT921X = DSA_TAG_PROTO_YT921X_VALUE, DSA_TAG_PROTO_MXL_GSW1XX = DSA_TAG_PROTO_MXL_GSW1XX_VALUE, DSA_TAG_PROTO_MXL862 = DSA_TAG_PROTO_MXL862_VALUE, }; struct dsa_switch; struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev); void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto, int *offset); int (*connect)(struct dsa_switch *ds); void (*disconnect)(struct dsa_switch *ds); unsigned int needed_headroom; unsigned int needed_tailroom; const char *name; enum dsa_tag_protocol proto; /* Some tagging protocols either mangle or shift the destination MAC * address, in which case the DSA conduit would drop packets on ingress * if what it understands out of the destination MAC address is not in * its RX filter. */ bool promisc_on_conduit; }; struct dsa_lag { struct net_device *dev; unsigned int id; struct mutex fdb_lock; struct list_head fdbs; refcount_t refcount; }; struct dsa_switch_tree { struct list_head list; /* List of switch ports */ struct list_head ports; /* Notifier chain for switch-wide events */ struct raw_notifier_head nh; /* Tree identifier */ unsigned int index; /* Number of switches attached to this tree */ struct kref refcount; /* Maps offloaded LAG netdevs to a zero-based linear ID for * drivers that need it. */ struct dsa_lag **lags; /* Tagging protocol operations */ const struct dsa_device_ops *tag_ops; /* Default tagging protocol preferred by the switches in this * tree. */ enum dsa_tag_protocol default_proto; /* Has this tree been applied to the hardware? */ bool setup; /* * Configuration data for the platform device that owns * this dsa switch tree instance. */ struct dsa_platform_data *pd; /* List of DSA links composing the routing table */ struct list_head rtable; /* Length of "lags" array */ unsigned int lags_len; /* Track the largest switch index within a tree */ unsigned int last_switch; }; /* LAG IDs are one-based, the dst->lags array is zero-based */ #define dsa_lags_foreach_id(_id, _dst) \ for ((_id) = 1; (_id) <= (_dst)->lags_len; (_id)++) \ if ((_dst)->lags[(_id) - 1]) #define dsa_lag_foreach_port(_dp, _dst, _lag) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_offloads_lag((_dp), (_lag))) #define dsa_hsr_foreach_port(_dp, _ds, _hsr) \ list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds) && (_dp)->hsr_dev == (_hsr)) static inline struct dsa_lag *dsa_lag_by_id(struct dsa_switch_tree *dst, unsigned int id) { /* DSA LAG IDs are one-based, dst->lags is zero-based */ return dst->lags[id - 1]; } static inline int dsa_lag_id(struct dsa_switch_tree *dst, struct net_device *lag_dev) { unsigned int id; dsa_lags_foreach_id(id, dst) { struct dsa_lag *lag = dsa_lag_by_id(dst, id); if (lag->dev == lag_dev) return lag->id; } return -ENODEV; } /* TC matchall action types */ enum dsa_port_mall_action_type { DSA_PORT_MALL_MIRROR, DSA_PORT_MALL_POLICER, }; /* TC mirroring entry */ struct dsa_mall_mirror_tc_entry { u8 to_local_port; bool ingress; }; /* TC matchall entry */ struct dsa_mall_tc_entry { struct list_head list; unsigned long cookie; enum dsa_port_mall_action_type type; union { struct dsa_mall_mirror_tc_entry mirror; struct flow_action_police policer; }; }; struct dsa_bridge { struct net_device *dev; unsigned int num; bool tx_fwd_offload; refcount_t refcount; }; struct dsa_port { /* A CPU port is physically connected to a conduit device. A user port * exposes a network device to user-space, called 'user' here. */ union { struct net_device *conduit; struct net_device *user; }; /* Copy of the tagging protocol operations, for quicker access * in the data path. Valid only for the CPU ports. */ const struct dsa_device_ops *tag_ops; /* Copies for faster access in conduit receive hot path */ struct dsa_switch_tree *dst; struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev); struct dsa_switch *ds; unsigned int index; enum { DSA_PORT_TYPE_UNUSED = 0, DSA_PORT_TYPE_CPU, DSA_PORT_TYPE_DSA, DSA_PORT_TYPE_USER, } type; const char *name; struct dsa_port *cpu_dp; u8 mac[ETH_ALEN]; u8 stp_state; /* Warning: the following bit fields are not atomic, and updating them * can only be done from code paths where concurrency is not possible * (probe time or under rtnl_lock). */ u8 vlan_filtering:1; /* Managed by DSA on user ports and by drivers on CPU and DSA ports */ u8 learning:1; u8 lag_tx_enabled:1; /* conduit state bits, valid only on CPU ports */ u8 conduit_admin_up:1; u8 conduit_oper_up:1; /* Valid only on user ports */ u8 cpu_port_in_lag:1; u8 setup:1; struct device_node *dn; unsigned int ageing_time; struct dsa_bridge *bridge; struct devlink_port devlink_port; struct phylink *pl; struct phylink_config pl_config; netdevice_tracker conduit_tracker; struct dsa_lag *lag; struct net_device *hsr_dev; struct list_head list; /* * Original copy of the conduit netdev ethtool_ops */ const struct ethtool_ops *orig_ethtool_ops; /* List of MAC addresses that must be forwarded on this port. * These are only valid on CPU ports and DSA links. */ struct mutex addr_lists_lock; struct list_head fdbs; struct list_head mdbs; struct mutex vlans_lock; union { /* List of VLANs that CPU and DSA ports are members of. * Access to this is serialized by the sleepable @vlans_lock. */ struct list_head vlans; /* List of VLANs that user ports are members of. * Access to this is serialized by netif_addr_lock_bh(). */ struct list_head user_vlans; }; }; static inline struct dsa_port * dsa_phylink_to_port(struct phylink_config *config) { return container_of(config, struct dsa_port, pl_config); } /* TODO: ideally DSA ports would have a single dp->link_dp member, * and no dst->rtable nor this struct dsa_link would be needed, * but this would require some more complex tree walking, * so keep it stupid at the moment and list them all. */ struct dsa_link { struct dsa_port *dp; struct dsa_port *link_dp; struct list_head list; }; enum dsa_db_type { DSA_DB_PORT, DSA_DB_LAG, DSA_DB_BRIDGE, }; struct dsa_db { enum dsa_db_type type; union { const struct dsa_port *dp; struct dsa_lag lag; struct dsa_bridge bridge; }; }; struct dsa_mac_addr { unsigned char addr[ETH_ALEN]; u16 vid; refcount_t refcount; struct list_head list; struct dsa_db db; }; struct dsa_vlan { u16 vid; refcount_t refcount; struct list_head list; }; struct dsa_switch { struct device *dev; /* * Parent switch tree, and switch index. */ struct dsa_switch_tree *dst; unsigned int index; /* Warning: the following bit fields are not atomic, and updating them * can only be done from code paths where concurrency is not possible * (probe time or under rtnl_lock). */ u32 setup:1; /* Disallow bridge core from requesting different VLAN awareness * settings on ports if not hardware-supported */ u32 vlan_filtering_is_global:1; /* Keep VLAN filtering enabled on ports not offloading any upper */ u32 needs_standalone_vlan_filtering:1; /* Pass .port_vlan_add and .port_vlan_del to drivers even for bridges * that have vlan_filtering=0. All drivers should ideally set this (and * then the option would get removed), but it is unknown whether this * would break things or not. */ u32 configure_vlan_while_not_filtering:1; /* Pop the default_pvid of VLAN-unaware bridge ports from tagged frames. * DEPRECATED: Do NOT set this field in new drivers. Instead look at * the dsa_software_vlan_untag() comments. */ u32 untag_bridge_pvid:1; /* Pop the default_pvid of VLAN-aware bridge ports from tagged frames. * Useful if the switch cannot preserve the VLAN tag as seen on the * wire for user port ingress, and chooses to send all frames as * VLAN-tagged to the CPU, including those which were originally * untagged. */ u32 untag_vlan_aware_bridge_pvid:1; /* Let DSA manage the FDB entries towards the * CPU, based on the software bridge database. */ u32 assisted_learning_on_cpu_port:1; /* In case vlan_filtering_is_global is set, the VLAN awareness state * should be retrieved from here and not from the per-port settings. */ u32 vlan_filtering:1; /* For switches that only have the MRU configurable. To ensure the * configured MTU is not exceeded, normalization of MRU on all bridged * interfaces is needed. */ u32 mtu_enforcement_ingress:1; /* Drivers that isolate the FDBs of multiple bridges must set this * to true to receive the bridge as an argument in .port_fdb_{add,del} * and .port_mdb_{add,del}. Otherwise, the bridge.num will always be * passed as zero. */ u32 fdb_isolation:1; /* Drivers that have global DSCP mapping settings must set this to * true to automatically apply the settings to all ports. */ u32 dscp_prio_mapping_is_global:1; /* Listener for switch fabric events */ struct notifier_block nb; /* * Give the switch driver somewhere to hang its private data * structure. */ void *priv; void *tagger_data; /* * Configuration data for this switch. */ struct dsa_chip_data *cd; /* * The switch operations. */ const struct dsa_switch_ops *ops; /* * Allow a DSA switch driver to override the phylink MAC ops */ const struct phylink_mac_ops *phylink_mac_ops; /* * User mii_bus and devices for the individual ports. */ u32 phys_mii_mask; struct mii_bus *user_mii_bus; /* Ageing Time limits in msecs */ unsigned int ageing_time_min; unsigned int ageing_time_max; /* Storage for drivers using tag_8021q */ struct dsa_8021q_context *tag_8021q_ctx; /* devlink used to represent this switch device */ struct devlink *devlink; /* Number of switch port queues */ unsigned int num_tx_queues; /* Drivers that benefit from having an ID associated with each * offloaded LAG should set this to the maximum number of * supported IDs. DSA will then maintain a mapping of _at * least_ these many IDs, accessible to drivers via * dsa_lag_id(). */ unsigned int num_lag_ids; /* Drivers that support bridge forwarding offload or FDB isolation * should set this to the maximum number of bridges spanning the same * switch tree (or all trees, in the case of cross-tree bridging * support) that can be offloaded. */ unsigned int max_num_bridges; unsigned int num_ports; }; static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { struct dsa_switch_tree *dst = ds->dst; struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dp->ds == ds && dp->index == p) return dp; return NULL; } static inline bool dsa_port_is_dsa(struct dsa_port *port) { return port->type == DSA_PORT_TYPE_DSA; } static inline bool dsa_port_is_cpu(struct dsa_port *port) { return port->type == DSA_PORT_TYPE_CPU; } static inline bool dsa_port_is_user(struct dsa_port *dp) { return dp->type == DSA_PORT_TYPE_USER; } static inline bool dsa_port_is_unused(struct dsa_port *dp) { return dp->type == DSA_PORT_TYPE_UNUSED; } static inline bool dsa_port_conduit_is_operational(struct dsa_port *dp) { return dsa_port_is_cpu(dp) && dp->conduit_admin_up && dp->conduit_oper_up; } static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_UNUSED; } static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_CPU; } static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_DSA; } static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_USER; } #define dsa_tree_for_each_user_port(_dp, _dst) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_user((_dp))) #define dsa_tree_for_each_user_port_continue_reverse(_dp, _dst) \ list_for_each_entry_continue_reverse((_dp), &(_dst)->ports, list) \ if (dsa_port_is_user((_dp))) #define dsa_tree_for_each_cpu_port(_dp, _dst) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_cpu((_dp))) #define dsa_switch_for_each_port(_dp, _ds) \ list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_port_safe(_dp, _next, _ds) \ list_for_each_entry_safe((_dp), (_next), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_port_continue_reverse(_dp, _ds) \ list_for_each_entry_continue_reverse((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) #define dsa_switch_for_each_available_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (!dsa_port_is_unused((_dp))) #define dsa_switch_for_each_user_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_user((_dp))) #define dsa_switch_for_each_user_port_continue_reverse(_dp, _ds) \ dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \ if (dsa_port_is_user((_dp))) #define dsa_switch_for_each_cpu_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_cpu((_dp))) #define dsa_switch_for_each_cpu_port_continue_reverse(_dp, _ds) \ dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \ if (dsa_port_is_cpu((_dp))) static inline u32 dsa_user_ports(struct dsa_switch *ds) { struct dsa_port *dp; u32 mask = 0; dsa_switch_for_each_user_port(dp, ds) mask |= BIT(dp->index); return mask; } static inline u32 dsa_cpu_ports(struct dsa_switch *ds) { struct dsa_port *cpu_dp; u32 mask = 0; dsa_switch_for_each_cpu_port(cpu_dp, ds) mask |= BIT(cpu_dp->index); return mask; } /* Return the local port used to reach an arbitrary switch device */ static inline unsigned int dsa_routing_port(struct dsa_switch *ds, int device) { struct dsa_switch_tree *dst = ds->dst; struct dsa_link *dl; list_for_each_entry(dl, &dst->rtable, list) if (dl->dp->ds == ds && dl->link_dp->ds->index == device) return dl->dp->index; return ds->num_ports; } /* Return the local port used to reach an arbitrary switch port */ static inline unsigned int dsa_towards_port(struct dsa_switch *ds, int device, int port) { if (device == ds->index) return port; else return dsa_routing_port(ds, device); } /* Return the local port used to reach the dedicated CPU port */ static inline unsigned int dsa_upstream_port(struct dsa_switch *ds, int port) { const struct dsa_port *dp = dsa_to_port(ds, port); const struct dsa_port *cpu_dp = dp->cpu_dp; if (!cpu_dp) return port; return dsa_towards_port(ds, cpu_dp->ds->index, cpu_dp->index); } /* Return true if this is the local port used to reach the CPU port */ static inline bool dsa_is_upstream_port(struct dsa_switch *ds, int port) { if (dsa_is_unused_port(ds, port)) return false; return port == dsa_upstream_port(ds, port); } /* Return true if this is a DSA port leading away from the CPU */ static inline bool dsa_is_downstream_port(struct dsa_switch *ds, int port) { return dsa_is_dsa_port(ds, port) && !dsa_is_upstream_port(ds, port); } /* Return the local port used to reach the CPU port */ static inline unsigned int dsa_switch_upstream_port(struct dsa_switch *ds) { struct dsa_port *dp; dsa_switch_for_each_available_port(dp, ds) { return dsa_upstream_port(ds, dp->index); } return ds->num_ports; } /* Return true if @upstream_ds is an upstream switch of @downstream_ds, meaning * that the routing port from @downstream_ds to @upstream_ds is also the port * which @downstream_ds uses to reach its dedicated CPU. */ static inline bool dsa_switch_is_upstream_of(struct dsa_switch *upstream_ds, struct dsa_switch *downstream_ds) { int routing_port; if (upstream_ds == downstream_ds) return true; routing_port = dsa_routing_port(downstream_ds, upstream_ds->index); return dsa_is_upstream_port(downstream_ds, routing_port); } static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp) { const struct dsa_switch *ds = dp->ds; if (ds->vlan_filtering_is_global) return ds->vlan_filtering; else return dp->vlan_filtering; } static inline unsigned int dsa_port_lag_id_get(struct dsa_port *dp) { return dp->lag ? dp->lag->id : 0; } static inline struct net_device *dsa_port_lag_dev_get(struct dsa_port *dp) { return dp->lag ? dp->lag->dev : NULL; } static inline bool dsa_port_offloads_lag(struct dsa_port *dp, const struct dsa_lag *lag) { return dsa_port_lag_dev_get(dp) == lag->dev; } static inline struct net_device *dsa_port_to_conduit(const struct dsa_port *dp) { if (dp->cpu_port_in_lag) return dsa_port_lag_dev_get(dp->cpu_dp); return dp->cpu_dp->conduit; } static inline struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp) { if (!dp->bridge) return NULL; if (dp->lag) return dp->lag->dev; else if (dp->hsr_dev) return dp->hsr_dev; return dp->user; } static inline struct net_device * dsa_port_bridge_dev_get(const struct dsa_port *dp) { return dp->bridge ? dp->bridge->dev : NULL; } static inline unsigned int dsa_port_bridge_num_get(struct dsa_port *dp) { return dp->bridge ? dp->bridge->num : 0; } static inline bool dsa_port_bridge_same(const struct dsa_port *a, const struct dsa_port *b) { struct net_device *br_a = dsa_port_bridge_dev_get(a); struct net_device *br_b = dsa_port_bridge_dev_get(b); /* Standalone ports are not in the same bridge with one another */ return (!br_a || !br_b) ? false : (br_a == br_b); } static inline bool dsa_port_offloads_bridge_port(struct dsa_port *dp, const struct net_device *dev) { return dsa_port_to_bridge_port(dp) == dev; } static inline bool dsa_port_offloads_bridge_dev(struct dsa_port *dp, const struct net_device *bridge_dev) { /* DSA ports connected to a bridge, and event was emitted * for the bridge. */ return dsa_port_bridge_dev_get(dp) == bridge_dev; } static inline bool dsa_port_offloads_bridge(struct dsa_port *dp, const struct dsa_bridge *bridge) { return dsa_port_bridge_dev_get(dp) == bridge->dev; } /* Returns true if any port of this tree offloads the given net_device */ static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst, const struct net_device *dev) { struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dsa_port_offloads_bridge_port(dp, dev)) return true; return false; } /* Returns true if any port of this tree offloads the given bridge */ static inline bool dsa_tree_offloads_bridge_dev(struct dsa_switch_tree *dst, const struct net_device *bridge_dev) { struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dsa_port_offloads_bridge_dev(dp, bridge_dev)) return true; return false; } static inline bool dsa_port_tree_same(const struct dsa_port *a, const struct dsa_port *b) { return a->ds->dst == b->ds->dst; } typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { /* * Tagging protocol helpers called for the CPU ports and DSA links. * @get_tag_protocol retrieves the initial tagging protocol and is * mandatory. Switches which can operate using multiple tagging * protocols should implement @change_tag_protocol and report in * @get_tag_protocol the tagger in current use. */ enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, int port, enum dsa_tag_protocol mprot); int (*change_tag_protocol)(struct dsa_switch *ds, enum dsa_tag_protocol proto); /* * Method for switch drivers to connect to the tagging protocol driver * in current use. The switch driver can provide handlers for certain * types of packets for switch management. */ int (*connect_tag_protocol)(struct dsa_switch *ds, enum dsa_tag_protocol proto); int (*port_change_conduit)(struct dsa_switch *ds, int port, struct net_device *conduit, struct netlink_ext_ack *extack); /* Optional switch-wide initialization and destruction methods */ int (*setup)(struct dsa_switch *ds); void (*teardown)(struct dsa_switch *ds); /* Per-port initialization and destruction methods. Mandatory if the * driver registers devlink port regions, optional otherwise. */ int (*port_setup)(struct dsa_switch *ds, int port); void (*port_teardown)(struct dsa_switch *ds, int port); u32 (*get_phy_flags)(struct dsa_switch *ds, int port); /* * Access to the switch's PHY registers. */ int (*phy_read)(struct dsa_switch *ds, int port, int regnum); int (*phy_write)(struct dsa_switch *ds, int port, int regnum, u16 val); /* * PHYLINK integration */ void (*phylink_get_caps)(struct dsa_switch *ds, int port, struct phylink_config *config); void (*phylink_fixed_state)(struct dsa_switch *ds, int port, struct phylink_link_state *state); /* * Port statistics counters. */ void (*get_strings)(struct dsa_switch *ds, int port, u32 stringset, uint8_t *data); void (*get_ethtool_stats)(struct dsa_switch *ds, int port, uint64_t *data); int (*get_sset_count)(struct dsa_switch *ds, int port, int sset); void (*get_ethtool_phy_stats)(struct dsa_switch *ds, int port, uint64_t *data); void (*get_eth_phy_stats)(struct dsa_switch *ds, int port, struct ethtool_eth_phy_stats *phy_stats); void (*get_eth_mac_stats)(struct dsa_switch *ds, int port, struct ethtool_eth_mac_stats *mac_stats); void (*get_eth_ctrl_stats)(struct dsa_switch *ds, int port, struct ethtool_eth_ctrl_stats *ctrl_stats); void (*get_rmon_stats)(struct dsa_switch *ds, int port, struct ethtool_rmon_stats *rmon_stats, const struct ethtool_rmon_hist_range **ranges); void (*get_ts_stats)(struct dsa_switch *ds, int port, struct ethtool_ts_stats *ts_stats); void (*get_stats64)(struct dsa_switch *ds, int port, struct rtnl_link_stats64 *s); void (*get_pause_stats)(struct dsa_switch *ds, int port, struct ethtool_pause_stats *pause_stats); void (*self_test)(struct dsa_switch *ds, int port, struct ethtool_test *etest, u64 *data); /* * ethtool Wake-on-LAN */ void (*get_wol)(struct dsa_switch *ds, int port, struct ethtool_wolinfo *w); int (*set_wol)(struct dsa_switch *ds, int port, struct ethtool_wolinfo *w); /* * ethtool timestamp info */ int (*get_ts_info)(struct dsa_switch *ds, int port, struct kernel_ethtool_ts_info *ts); /* * ethtool MAC merge layer */ int (*get_mm)(struct dsa_switch *ds, int port, struct ethtool_mm_state *state); int (*set_mm)(struct dsa_switch *ds, int port, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack *extack); void (*get_mm_stats)(struct dsa_switch *ds, int port, struct ethtool_mm_stats *stats); /* * DCB ops */ int (*port_get_default_prio)(struct dsa_switch *ds, int port); int (*port_set_default_prio)(struct dsa_switch *ds, int port, u8 prio); int (*port_get_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp); int (*port_add_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, u8 prio); int (*port_del_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, u8 prio); int (*port_set_apptrust)(struct dsa_switch *ds, int port, const u8 *sel, int nsel); int (*port_get_apptrust)(struct dsa_switch *ds, int port, u8 *sel, int *nsel); /* * Suspend and resume */ int (*suspend)(struct dsa_switch *ds); int (*resume)(struct dsa_switch *ds); /* * Port enable/disable */ int (*port_enable)(struct dsa_switch *ds, int port, struct phy_device *phy); void (*port_disable)(struct dsa_switch *ds, int port); /* * Notification for MAC address changes on user ports. Drivers can * currently only veto operations. They should not use the method to * program the hardware, since the operation is not rolled back in case * of other errors. */ int (*port_set_mac_address)(struct dsa_switch *ds, int port, const unsigned char *addr); /* * Compatibility between device trees defining multiple CPU ports and * drivers which are not OK to use by default the numerically smallest * CPU port of a switch for its local ports. This can return NULL, * meaning "don't know/don't care". */ struct dsa_port *(*preferred_default_local_cpu_port)(struct dsa_switch *ds); /* * Port's MAC EEE settings */ bool (*support_eee)(struct dsa_switch *ds, int port); int (*set_mac_eee)(struct dsa_switch *ds, int port, struct ethtool_keee *e); /* EEPROM access */ int (*get_eeprom_len)(struct dsa_switch *ds); int (*get_eeprom)(struct dsa_switch *ds, struct ethtool_eeprom *eeprom, u8 *data); int (*set_eeprom)(struct dsa_switch *ds, struct ethtool_eeprom *eeprom, u8 *data); /* * Register access. */ int (*get_regs_len)(struct dsa_switch *ds, int port); void (*get_regs)(struct dsa_switch *ds, int port, struct ethtool_regs *regs, void *p); /* * Upper device tracking. */ int (*port_prechangeupper)(struct dsa_switch *ds, int port, struct netdev_notifier_changeupper_info *info); /* * Bridge integration */ int (*set_ageing_time)(struct dsa_switch *ds, unsigned int msecs); int (*port_bridge_join)(struct dsa_switch *ds, int port, struct dsa_bridge bridge, bool *tx_fwd_offload, struct netlink_ext_ack *extack); void (*port_bridge_leave)(struct dsa_switch *ds, int port, struct dsa_bridge bridge); void (*port_stp_state_set)(struct dsa_switch *ds, int port, u8 state); int (*port_mst_state_set)(struct dsa_switch *ds, int port, const struct switchdev_mst_state *state); void (*port_fast_age)(struct dsa_switch *ds, int port); int (*port_vlan_fast_age)(struct dsa_switch *ds, int port, u16 vid); int (*port_pre_bridge_flags)(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); int (*port_bridge_flags)(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); void (*port_set_host_flood)(struct dsa_switch *ds, int port, bool uc, bool mc); /* * VLAN support */ int (*port_vlan_filtering)(struct dsa_switch *ds, int port, bool vlan_filtering, struct netlink_ext_ack *extack); int (*port_vlan_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct netlink_ext_ack *extack); int (*port_vlan_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); int (*vlan_msti_set)(struct dsa_switch *ds, struct dsa_bridge bridge, const struct switchdev_vlan_msti *msti); /* * Forwarding database */ int (*port_fdb_add)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); int (*port_fdb_del)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); int (*port_fdb_dump)(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data); int (*lag_fdb_add)(struct dsa_switch *ds, struct dsa_lag lag, const unsigned char *addr, u16 vid, struct dsa_db db); int (*lag_fdb_del)(struct dsa_switch *ds, struct dsa_lag lag, const unsigned char *addr, u16 vid, struct dsa_db db); /* * Multicast database */ int (*port_mdb_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); int (*port_mdb_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); /* * RXNFC */ int (*get_rxnfc)(struct dsa_switch *ds, int port, struct ethtool_rxnfc *nfc, u32 *rule_locs); int (*set_rxnfc)(struct dsa_switch *ds, int port, struct ethtool_rxnfc *nfc); /* * TC integration */ int (*cls_flower_add)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*cls_flower_del)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*cls_flower_stats)(struct dsa_switch *ds, int port, struct flow_cls_offload *cls, bool ingress); int (*port_mirror_add)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror, bool ingress, struct netlink_ext_ack *extack); void (*port_mirror_del)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); int (*port_policer_add)(struct dsa_switch *ds, int port, const struct flow_action_police *policer); void (*port_policer_del)(struct dsa_switch *ds, int port); int (*port_setup_tc)(struct dsa_switch *ds, int port, enum tc_setup_type type, void *type_data); /* * Cross-chip operations */ int (*crosschip_bridge_join)(struct dsa_switch *ds, int tree_index, int sw_index, int port, struct dsa_bridge bridge, struct netlink_ext_ack *extack); void (*crosschip_bridge_leave)(struct dsa_switch *ds, int tree_index, int sw_index, int port, struct dsa_bridge bridge); int (*crosschip_lag_change)(struct dsa_switch *ds, int sw_index, int port); int (*crosschip_lag_join)(struct dsa_switch *ds, int sw_index, int port, struct dsa_lag lag, struct netdev_lag_upper_info *info, struct netlink_ext_ack *extack); int (*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index, int port, struct dsa_lag lag); /* * PTP functionality */ int (*port_hwtstamp_get)(struct dsa_switch *ds, int port, struct kernel_hwtstamp_config *config); int (*port_hwtstamp_set)(struct dsa_switch *ds, int port, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); void (*port_txtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb); bool (*port_rxtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type); /* Devlink parameters, etc */ int (*devlink_param_get)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); int (*devlink_param_set)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); int (*devlink_info_get)(struct dsa_switch *ds, struct devlink_info_req *req, struct netlink_ext_ack *extack); int (*devlink_sb_pool_get)(struct dsa_switch *ds, unsigned int sb_index, u16 pool_index, struct devlink_sb_pool_info *pool_info); int (*devlink_sb_pool_set)(struct dsa_switch *ds, unsigned int sb_index, u16 pool_index, u32 size, enum devlink_sb_threshold_type threshold_type, struct netlink_ext_ack *extack); int (*devlink_sb_port_pool_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 *p_threshold); int (*devlink_sb_port_pool_set)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 threshold, struct netlink_ext_ack *extack); int (*devlink_sb_tc_pool_bind_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u16 *p_pool_index, u32 *p_threshold); int (*devlink_sb_tc_pool_bind_set)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u16 pool_index, u32 threshold, struct netlink_ext_ack *extack); int (*devlink_sb_occ_snapshot)(struct dsa_switch *ds, unsigned int sb_index); int (*devlink_sb_occ_max_clear)(struct dsa_switch *ds, unsigned int sb_index); int (*devlink_sb_occ_port_pool_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 pool_index, u32 *p_cur, u32 *p_max); int (*devlink_sb_occ_tc_port_bind_get)(struct dsa_switch *ds, int port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u32 *p_cur, u32 *p_max); /* * MTU change functionality. Switches can also adjust their MRU through * this method. By MTU, one understands the SDU (L2 payload) length. * If the switch needs to account for the DSA tag on the CPU port, this * method needs to do so privately. */ int (*port_change_mtu)(struct dsa_switch *ds, int port, int new_mtu); int (*port_max_mtu)(struct dsa_switch *ds, int port); /* * LAG integration */ int (*port_lag_change)(struct dsa_switch *ds, int port); int (*port_lag_join)(struct dsa_switch *ds, int port, struct dsa_lag lag, struct netdev_lag_upper_info *info, struct netlink_ext_ack *extack); int (*port_lag_leave)(struct dsa_switch *ds, int port, struct dsa_lag lag); /* * HSR integration */ int (*port_hsr_join)(struct dsa_switch *ds, int port, struct net_device *hsr, struct netlink_ext_ack *extack); int (*port_hsr_leave)(struct dsa_switch *ds, int port, struct net_device *hsr); /* * MRP integration */ int (*port_mrp_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_mrp *mrp); int (*port_mrp_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_mrp *mrp); int (*port_mrp_add_ring_role)(struct dsa_switch *ds, int port, const struct switchdev_obj_ring_role_mrp *mrp); int (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port, const struct switchdev_obj_ring_role_mrp *mrp); /* * tag_8021q operations */ int (*tag_8021q_vlan_add)(struct dsa_switch *ds, int port, u16 vid, u16 flags); int (*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid); /* * DSA conduit tracking operations */ void (*conduit_state_change)(struct dsa_switch *ds, const struct net_device *conduit, bool operational); }; #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes, \ dsa_devlink_param_get, dsa_devlink_param_set, NULL) int dsa_devlink_param_get(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx, struct netlink_ext_ack *extack); int dsa_devlink_param_set(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx, struct netlink_ext_ack *extack); int dsa_devlink_params_register(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); void dsa_devlink_params_unregister(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); int dsa_devlink_resource_register(struct dsa_switch *ds, const char *resource_name, u64 resource_size, u64 resource_id, u64 parent_resource_id, const struct devlink_resource_size_params *size_params); void dsa_devlink_resources_unregister(struct dsa_switch *ds); void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, u64 resource_id, devlink_resource_occ_get_t *occ_get, void *occ_get_priv); void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, u64 resource_id); struct devlink_region * dsa_devlink_region_create(struct dsa_switch *ds, const struct devlink_region_ops *ops, u32 region_max_snapshots, u64 region_size); struct devlink_region * dsa_devlink_port_region_create(struct dsa_switch *ds, int port, const struct devlink_port_region_ops *ops, u32 region_max_snapshots, u64 region_size); void dsa_devlink_region_destroy(struct devlink_region *region); struct dsa_port *dsa_port_from_netdev(struct net_device *netdev); struct dsa_devlink_priv { struct dsa_switch *ds; }; static inline struct dsa_switch *dsa_devlink_to_ds(struct devlink *dl) { struct dsa_devlink_priv *dl_priv = devlink_priv(dl); return dl_priv->ds; } static inline struct dsa_switch *dsa_devlink_port_to_ds(struct devlink_port *port) { struct devlink *dl = port->devlink; struct dsa_devlink_priv *dl_priv = devlink_priv(dl); return dl_priv->ds; } static inline int dsa_devlink_port_to_port(struct devlink_port *port) { return port->index; } bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); int dsa_port_simple_hsr_validate(struct dsa_switch *ds, int port, struct net_device *hsr, struct netlink_ext_ack *extack); int dsa_port_simple_hsr_join(struct dsa_switch *ds, int port, struct net_device *hsr, struct netlink_ext_ack *extack); int dsa_port_simple_hsr_leave(struct dsa_switch *ds, int port, struct net_device *hsr); /* Keep inline for faster access in hot path */ static inline bool netdev_uses_dsa(const struct net_device *dev) { #if IS_ENABLED(CONFIG_NET_DSA) return dev->dsa_ptr && dev->dsa_ptr->rcv; #endif return false; } /* All DSA tags that push the EtherType to the right (basically all except tail * tags, which don't break dissection) can be treated the same from the * perspective of the flow dissector. * * We need to return: * - offset: the (B - A) difference between: * A. the position of the real EtherType and * B. the current skb->data (aka ETH_HLEN bytes into the frame, aka 2 bytes * after the normal EtherType was supposed to be) * The offset in bytes is exactly equal to the tagger overhead (and half of * that, in __be16 shorts). * * - proto: the value of the real EtherType. */ static inline void dsa_tag_generic_flow_dissect(const struct sk_buff *skb, __be16 *proto, int *offset) { #if IS_ENABLED(CONFIG_NET_DSA) const struct dsa_device_ops *ops = skb->dev->dsa_ptr->tag_ops; int tag_len = ops->needed_headroom; *offset = tag_len; *proto = ((__be16 *)skb->data)[(tag_len / 2) - 1]; #endif } void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); void dsa_switch_shutdown(struct dsa_switch *ds); struct dsa_switch *dsa_switch_find(int tree_index, int sw_index); void dsa_flush_workqueue(void); #ifdef CONFIG_PM_SLEEP int dsa_switch_suspend(struct dsa_switch *ds); int dsa_switch_resume(struct dsa_switch *ds); #else static inline int dsa_switch_suspend(struct dsa_switch *ds) { return 0; } static inline int dsa_switch_resume(struct dsa_switch *ds) { return 0; } #endif /* CONFIG_PM_SLEEP */ #if IS_ENABLED(CONFIG_NET_DSA) bool dsa_user_dev_check(const struct net_device *dev); #else static inline bool dsa_user_dev_check(const struct net_device *dev) { return false; } #endif netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); bool dsa_supports_eee(struct dsa_switch *ds, int port); #endif
12 12 341 32 11 13 10 13 13 394 335 1 1 13 13 13 13 13 1 1 396 395 1 8 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 // SPDX-License-Identifier: GPL-2.0-only /* * Process number limiting controller for cgroups. * * Used to allow a cgroup hierarchy to stop any new processes from fork()ing * after a certain limit is reached. * * Since it is trivial to hit the task limit without hitting any kmemcg limits * in place, PIDs are a fundamental resource. As such, PID exhaustion must be * preventable in the scope of a cgroup hierarchy by allowing resource limiting * of the number of tasks in a cgroup. * * In order to use the `pids` controller, set the maximum number of tasks in * pids.max (this is not available in the root cgroup for obvious reasons). The * number of processes currently in the cgroup is given by pids.current. * Organisational operations are not blocked by cgroup policies, so it is * possible to have pids.current > pids.max. However, it is not possible to * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking * would cause a cgroup policy to be violated. * * To set a cgroup to have no limit, set pids.max to "max". This is the default * for all new cgroups (N.B. that PID limits are hierarchical, so the most * stringent limit in the hierarchy is followed). * * pids.current tracks all child cgroup hierarchies, so parent/pids.current is * a superset of parent/child/pids.current. * * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com> */ #include <linux/kernel.h> #include <linux/threads.h> #include <linux/atomic.h> #include <linux/cgroup.h> #include <linux/slab.h> #include <linux/sched/task.h> #define PIDS_MAX (PID_MAX_LIMIT + 1ULL) #define PIDS_MAX_STR "max" enum pidcg_event { /* Fork failed in subtree because this pids_cgroup limit was hit. */ PIDCG_MAX, /* Fork failed in this pids_cgroup because ancestor limit was hit. */ PIDCG_FORKFAIL, NR_PIDCG_EVENTS, }; struct pids_cgroup { struct cgroup_subsys_state css; /* * Use 64-bit types so that we can safely represent "max" as * %PIDS_MAX = (%PID_MAX_LIMIT + 1). */ atomic64_t counter; atomic64_t limit; int64_t watermark; /* Handles for pids.events[.local] */ struct cgroup_file events_file; struct cgroup_file events_local_file; atomic64_t events[NR_PIDCG_EVENTS]; atomic64_t events_local[NR_PIDCG_EVENTS]; }; static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) { return container_of(css, struct pids_cgroup, css); } static struct pids_cgroup *parent_pids(struct pids_cgroup *pids) { return css_pids(pids->css.parent); } static struct cgroup_subsys_state * pids_css_alloc(struct cgroup_subsys_state *parent) { struct pids_cgroup *pids; pids = kzalloc_obj(struct pids_cgroup); if (!pids) return ERR_PTR(-ENOMEM); atomic64_set(&pids->limit, PIDS_MAX); return &pids->css; } static void pids_css_free(struct cgroup_subsys_state *css) { kfree(css_pids(css)); } static void pids_update_watermark(struct pids_cgroup *p, int64_t nr_pids) { /* * This is racy, but we don't need perfectly accurate tallying of * the watermark, and this lets us avoid extra atomic overhead. */ if (nr_pids > READ_ONCE(p->watermark)) WRITE_ONCE(p->watermark, nr_pids); } /** * pids_cancel - uncharge the local pid count * @pids: the pid cgroup state * @num: the number of pids to cancel * * This function will WARN if the pid count goes under 0, because such a case is * a bug in the pids controller proper. */ static void pids_cancel(struct pids_cgroup *pids, int num) { /* * A negative count (or overflow for that matter) is invalid, * and indicates a bug in the `pids` controller proper. */ WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter)); } /** * pids_uncharge - hierarchically uncharge the pid count * @pids: the pid cgroup state * @num: the number of pids to uncharge */ static void pids_uncharge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p; for (p = pids; parent_pids(p); p = parent_pids(p)) pids_cancel(p, num); } /** * pids_charge - hierarchically charge the pid count * @pids: the pid cgroup state * @num: the number of pids to charge * * This function does *not* follow the pid limit set. It cannot fail and the new * pid count may exceed the limit. This is only used for reverting failed * attaches, where there is no other way out than violating the limit. */ static void pids_charge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p; for (p = pids; parent_pids(p); p = parent_pids(p)) { int64_t new = atomic64_add_return(num, &p->counter); pids_update_watermark(p, new); } } /** * pids_try_charge - hierarchically try to charge the pid count * @pids: the pid cgroup state * @num: the number of pids to charge * @fail: storage of pid cgroup causing the fail * * This function follows the set limit. It will fail if the charge would cause * the new value to exceed the hierarchical limit. Returns 0 if the charge * succeeded, otherwise -EAGAIN. */ static int pids_try_charge(struct pids_cgroup *pids, int num, struct pids_cgroup **fail) { struct pids_cgroup *p, *q; for (p = pids; parent_pids(p); p = parent_pids(p)) { int64_t new = atomic64_add_return(num, &p->counter); int64_t limit = atomic64_read(&p->limit); /* * Since new is capped to the maximum number of pid_t, if * p->limit is %PIDS_MAX then we know that this test will never * fail. */ if (new > limit) { *fail = p; goto revert; } /* * Not technically accurate if we go over limit somewhere up * the hierarchy, but that's tolerable for the watermark. */ pids_update_watermark(p, new); } return 0; revert: for (q = pids; q != p; q = parent_pids(q)) pids_cancel(q, num); pids_cancel(p, num); return -EAGAIN; } static int pids_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *dst_css; cgroup_taskset_for_each(task, dst_css, tset) { struct pids_cgroup *pids = css_pids(dst_css); struct cgroup_subsys_state *old_css; struct pids_cgroup *old_pids; /* * No need to pin @old_css between here and cancel_attach() * because cgroup core protects it from being freed before * the migration completes or fails. */ old_css = task_css(task, pids_cgrp_id); old_pids = css_pids(old_css); pids_charge(pids, 1); pids_uncharge(old_pids, 1); } return 0; } static void pids_cancel_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *dst_css; cgroup_taskset_for_each(task, dst_css, tset) { struct pids_cgroup *pids = css_pids(dst_css); struct cgroup_subsys_state *old_css; struct pids_cgroup *old_pids; old_css = task_css(task, pids_cgrp_id); old_pids = css_pids(old_css); pids_charge(old_pids, 1); pids_uncharge(pids, 1); } } static void pids_event(struct pids_cgroup *pids_forking, struct pids_cgroup *pids_over_limit) { struct pids_cgroup *p = pids_forking; /* Only log the first time limit is hit. */ if (atomic64_inc_return(&p->events_local[PIDCG_FORKFAIL]) == 1) { pr_info("cgroup: fork rejected by pids controller in "); pr_cont_cgroup_path(p->css.cgroup); pr_cont("\n"); } if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) || cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) { cgroup_file_notify(&p->events_local_file); return; } atomic64_inc(&pids_over_limit->events_local[PIDCG_MAX]); cgroup_file_notify(&pids_over_limit->events_local_file); for (p = pids_over_limit; parent_pids(p); p = parent_pids(p)) { atomic64_inc(&p->events[PIDCG_MAX]); cgroup_file_notify(&p->events_file); } } /* * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies * on cgroup_threadgroup_change_begin() held by the copy_process(). */ static int pids_can_fork(struct task_struct *task, struct css_set *cset) { struct pids_cgroup *pids, *pids_over_limit; int err; pids = css_pids(cset->subsys[pids_cgrp_id]); err = pids_try_charge(pids, 1, &pids_over_limit); if (err) pids_event(pids, pids_over_limit); return err; } static void pids_cancel_fork(struct task_struct *task, struct css_set *cset) { struct pids_cgroup *pids; pids = css_pids(cset->subsys[pids_cgrp_id]); pids_uncharge(pids, 1); } static void pids_release(struct task_struct *task) { struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); pids_uncharge(pids, 1); } static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup_subsys_state *css = of_css(of); struct pids_cgroup *pids = css_pids(css); int64_t limit; int err; buf = strstrip(buf); if (!strcmp(buf, PIDS_MAX_STR)) { limit = PIDS_MAX; goto set_limit; } err = kstrtoll(buf, 0, &limit); if (err) return err; if (limit < 0 || limit >= PIDS_MAX) return -EINVAL; set_limit: /* * Limit updates don't need to be mutex'd, since it isn't * critical that any racing fork()s follow the new limit. */ atomic64_set(&pids->limit, limit); return nbytes; } static int pids_max_show(struct seq_file *sf, void *v) { struct cgroup_subsys_state *css = seq_css(sf); struct pids_cgroup *pids = css_pids(css); int64_t limit = atomic64_read(&pids->limit); if (limit >= PIDS_MAX) seq_printf(sf, "%s\n", PIDS_MAX_STR); else seq_printf(sf, "%lld\n", limit); return 0; } static s64 pids_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { struct pids_cgroup *pids = css_pids(css); return atomic64_read(&pids->counter); } static s64 pids_peak_read(struct cgroup_subsys_state *css, struct cftype *cft) { struct pids_cgroup *pids = css_pids(css); return READ_ONCE(pids->watermark); } static int __pids_events_show(struct seq_file *sf, bool local) { struct pids_cgroup *pids = css_pids(seq_css(sf)); enum pidcg_event pe = PIDCG_MAX; atomic64_t *events; if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) || cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) { pe = PIDCG_FORKFAIL; local = true; } events = local ? pids->events_local : pids->events; seq_printf(sf, "max %lld\n", (s64)atomic64_read(&events[pe])); return 0; } static int pids_events_show(struct seq_file *sf, void *v) { __pids_events_show(sf, false); return 0; } static int pids_events_local_show(struct seq_file *sf, void *v) { __pids_events_show(sf, true); return 0; } static struct cftype pids_files[] = { { .name = "max", .write = pids_max_write, .seq_show = pids_max_show, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "current", .read_s64 = pids_current_read, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "peak", .flags = CFTYPE_NOT_ON_ROOT, .read_s64 = pids_peak_read, }, { .name = "events", .seq_show = pids_events_show, .file_offset = offsetof(struct pids_cgroup, events_file), .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "events.local", .seq_show = pids_events_local_show, .file_offset = offsetof(struct pids_cgroup, events_local_file), .flags = CFTYPE_NOT_ON_ROOT, }, { } /* terminate */ }; static struct cftype pids_files_legacy[] = { { .name = "max", .write = pids_max_write, .seq_show = pids_max_show, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "current", .read_s64 = pids_current_read, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "peak", .flags = CFTYPE_NOT_ON_ROOT, .read_s64 = pids_peak_read, }, { .name = "events", .seq_show = pids_events_show, .file_offset = offsetof(struct pids_cgroup, events_file), .flags = CFTYPE_NOT_ON_ROOT, }, { } /* terminate */ }; struct cgroup_subsys pids_cgrp_subsys = { .css_alloc = pids_css_alloc, .css_free = pids_css_free, .can_attach = pids_can_attach, .cancel_attach = pids_cancel_attach, .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, .release = pids_release, .legacy_cftypes = pids_files_legacy, .dfl_cftypes = pids_files, .threaded = true, };
21 21 21 9 9 9 9 9 9 9 9 9 9 30 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "peer.h" #include "device.h" #include "queueing.h" #include "timers.h" #include "peerlookup.h" #include "noise.h" #include <linux/kref.h> #include <linux/lockdep.h> #include <linux/rcupdate.h> #include <linux/list.h> static struct kmem_cache *peer_cache; static atomic64_t peer_counter = ATOMIC64_INIT(0); struct wg_peer *wg_peer_create(struct wg_device *wg, const u8 public_key[NOISE_PUBLIC_KEY_LEN], const u8 preshared_key[NOISE_SYMMETRIC_KEY_LEN]) { struct wg_peer *peer; int ret = -ENOMEM; lockdep_assert_held(&wg->device_update_lock); if (wg->num_peers >= MAX_PEERS_PER_DEVICE) return ERR_PTR(ret); peer = kmem_cache_zalloc(peer_cache, GFP_KERNEL); if (unlikely(!peer)) return ERR_PTR(ret); if (unlikely(dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))) goto err; peer->device = wg; wg_noise_handshake_init(&peer->handshake, &wg->static_identity, public_key, preshared_key, peer); peer->internal_id = atomic64_inc_return(&peer_counter); peer->serial_work_cpu = nr_cpumask_bits; wg_cookie_init(&peer->latest_cookie); wg_timers_init(peer); wg_cookie_checker_precompute_peer_keys(peer); spin_lock_init(&peer->keypairs.keypair_update_lock); INIT_WORK(&peer->transmit_handshake_work, wg_packet_handshake_send_worker); INIT_WORK(&peer->transmit_packet_work, wg_packet_tx_worker); wg_prev_queue_init(&peer->tx_queue); wg_prev_queue_init(&peer->rx_queue); rwlock_init(&peer->endpoint_lock); kref_init(&peer->refcount); skb_queue_head_init(&peer->staged_packet_queue); wg_noise_reset_last_sent_handshake(&peer->last_sent_handshake); set_bit(NAPI_STATE_NO_BUSY_POLL, &peer->napi.state); netif_napi_add(wg->dev, &peer->napi, wg_packet_rx_poll); napi_enable(&peer->napi); list_add_tail(&peer->peer_list, &wg->peer_list); INIT_LIST_HEAD(&peer->allowedips_list); wg_pubkey_hashtable_add(wg->peer_hashtable, peer); ++wg->num_peers; pr_debug("%s: Peer %llu created\n", wg->dev->name, peer->internal_id); return peer; err: kmem_cache_free(peer_cache, peer); return ERR_PTR(ret); } struct wg_peer *wg_peer_get_maybe_zero(struct wg_peer *peer) { RCU_LOCKDEP_WARN(!rcu_read_lock_bh_held(), "Taking peer reference without holding the RCU read lock"); if (unlikely(!peer || !kref_get_unless_zero(&peer->refcount))) return NULL; return peer; } static void peer_make_dead(struct wg_peer *peer) { /* Remove from configuration-time lookup structures. */ list_del_init(&peer->peer_list); wg_allowedips_remove_by_peer(&peer->device->peer_allowedips, peer, &peer->device->device_update_lock); wg_pubkey_hashtable_remove(peer->device->peer_hashtable, peer); /* Mark as dead, so that we don't allow jumping contexts after. */ WRITE_ONCE(peer->is_dead, true); /* The caller must now synchronize_net() for this to take effect. */ } static void peer_remove_after_dead(struct wg_peer *peer) { WARN_ON(!peer->is_dead); /* No more keypairs can be created for this peer, since is_dead protects * add_new_keypair, so we can now destroy existing ones. */ wg_noise_keypairs_clear(&peer->keypairs); /* Destroy all ongoing timers that were in-flight at the beginning of * this function. */ wg_timers_stop(peer); /* The transition between packet encryption/decryption queues isn't * guarded by is_dead, but each reference's life is strictly bounded by * two generations: once for parallel crypto and once for serial * ingestion, so we can simply flush twice, and be sure that we no * longer have references inside these queues. */ /* a) For encrypt/decrypt. */ flush_workqueue(peer->device->packet_crypt_wq); /* b.1) For send (but not receive, since that's napi). */ flush_workqueue(peer->device->packet_crypt_wq); /* b.2.1) For receive (but not send, since that's wq). */ napi_disable(&peer->napi); /* b.2.1) It's now safe to remove the napi struct, which must be done * here from process context. */ netif_napi_del(&peer->napi); /* Ensure any workstructs we own (like transmit_handshake_work or * clear_peer_work) no longer are in use. */ flush_workqueue(peer->device->handshake_send_wq); /* After the above flushes, a peer might still be active in a few * different contexts: 1) from xmit(), before hitting is_dead and * returning, 2) from wg_packet_consume_data(), before hitting is_dead * and returning, 3) from wg_receive_handshake_packet() after a point * where it has processed an incoming handshake packet, but where * all calls to pass it off to timers fails because of is_dead. We won't * have new references in (1) eventually, because we're removed from * allowedips; we won't have new references in (2) eventually, because * wg_index_hashtable_lookup will always return NULL, since we removed * all existing keypairs and no more can be created; we won't have new * references in (3) eventually, because we're removed from the pubkey * hash table, which allows for a maximum of one handshake response, * via the still-uncleared index hashtable entry, but not more than one, * and in wg_cookie_message_consume, the lookup eventually gets a peer * with a refcount of zero, so no new reference is taken. */ --peer->device->num_peers; wg_peer_put(peer); } /* We have a separate "remove" function make sure that all active places where * a peer is currently operating will eventually come to an end and not pass * their reference onto another context. */ void wg_peer_remove(struct wg_peer *peer) { if (unlikely(!peer)) return; lockdep_assert_held(&peer->device->device_update_lock); peer_make_dead(peer); synchronize_net(); peer_remove_after_dead(peer); } void wg_peer_remove_all(struct wg_device *wg) { struct wg_peer *peer, *temp; LIST_HEAD(dead_peers); lockdep_assert_held(&wg->device_update_lock); /* Avoid having to traverse individually for each one. */ wg_allowedips_free(&wg->peer_allowedips, &wg->device_update_lock); list_for_each_entry_safe(peer, temp, &wg->peer_list, peer_list) { peer_make_dead(peer); list_add_tail(&peer->peer_list, &dead_peers); } synchronize_net(); list_for_each_entry_safe(peer, temp, &dead_peers, peer_list) peer_remove_after_dead(peer); } static void rcu_release(struct rcu_head *rcu) { struct wg_peer *peer = container_of(rcu, struct wg_peer, rcu); dst_cache_destroy(&peer->endpoint_cache); WARN_ON(wg_prev_queue_peek(&peer->tx_queue) || wg_prev_queue_peek(&peer->rx_queue)); /* The final zeroing takes care of clearing any remaining handshake key * material and other potentially sensitive information. */ memzero_explicit(peer, sizeof(*peer)); kmem_cache_free(peer_cache, peer); } static void kref_release(struct kref *refcount) { struct wg_peer *peer = container_of(refcount, struct wg_peer, refcount); pr_debug("%s: Peer %llu (%pISpfsc) destroyed\n", peer->device->dev->name, peer->internal_id, &peer->endpoint.addr); /* Remove ourself from dynamic runtime lookup structures, now that the * last reference is gone. */ wg_index_hashtable_remove(peer->device->index_hashtable, &peer->handshake.entry); /* Remove any lingering packets that didn't have a chance to be * transmitted. */ wg_packet_purge_staged_packets(peer); /* Free the memory used. */ call_rcu(&peer->rcu, rcu_release); } void wg_peer_put(struct wg_peer *peer) { if (unlikely(!peer)) return; kref_put(&peer->refcount, kref_release); } int __init wg_peer_init(void) { peer_cache = KMEM_CACHE(wg_peer, 0); return peer_cache ? 0 : -ENOMEM; } void wg_peer_uninit(void) { kmem_cache_destroy(peer_cache); }
2 2 1 2 2 1 2 2 2 1 2 1 2 2 2 1 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 // SPDX-License-Identifier: GPL-2.0 #include <linux/pagewalk.h> #include <linux/highmem.h> #include <linux/sched.h> #include <linux/hugetlb.h> #include <linux/mmu_context.h> #include <linux/swap.h> #include <linux/leafops.h> #include <asm/tlbflush.h> #include "internal.h" /* * We want to know the real level where a entry is located ignoring any * folding of levels which may be happening. For example if p4d is folded then * a missing entry found at level 1 (p4d) is actually at level 0 (pgd). */ static int real_depth(int depth) { if (depth == 3 && PTRS_PER_PMD == 1) depth = 2; if (depth == 2 && PTRS_PER_PUD == 1) depth = 1; if (depth == 1 && PTRS_PER_P4D == 1) depth = 0; return depth; } static int walk_pte_range_inner(pte_t *pte, unsigned long addr, unsigned long end, struct mm_walk *walk) { const struct mm_walk_ops *ops = walk->ops; int err = 0; for (;;) { if (ops->install_pte && pte_none(ptep_get(pte))) { pte_t new_pte; err = ops->install_pte(addr, addr + PAGE_SIZE, &new_pte, walk); if (err) break; set_pte_at(walk->mm, addr, pte, new_pte); /* Non-present before, so for arches that need it. */ if (!WARN_ON_ONCE(walk->no_vma)) update_mmu_cache(walk->vma, addr, pte); } else { err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); if (err) break; } if (addr >= end - PAGE_SIZE) break; addr += PAGE_SIZE; pte++; } return err; } static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { pte_t *pte; int err = 0; spinlock_t *ptl; if (walk->no_vma) { /* * pte_offset_map() might apply user-specific validation. * Indeed, on x86_64 the pmd entries set up by init_espfix_ap() * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear), * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them. */ if (walk->mm == &init_mm || addr >= TASK_SIZE) pte = pte_offset_kernel(pmd, addr); else pte = pte_offset_map(pmd, addr); if (pte) { err = walk_pte_range_inner(pte, addr, end, walk); if (walk->mm != &init_mm && addr < TASK_SIZE) pte_unmap(pte); } } else { pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (pte) { err = walk_pte_range_inner(pte, addr, end, walk); pte_unmap_unlock(pte, ptl); } } if (!pte) walk->action = ACTION_AGAIN; return err; } static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, struct mm_walk *walk) { pmd_t *pmd; unsigned long next; const struct mm_walk_ops *ops = walk->ops; bool has_handler = ops->pte_entry; bool has_install = ops->install_pte; int err = 0; int depth = real_depth(3); pmd = pmd_offset(pud, addr); do { again: next = pmd_addr_end(addr, end); if (pmd_none(*pmd)) { if (has_install) err = __pte_alloc(walk->mm, pmd); else if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) break; if (!has_install) continue; } walk->action = ACTION_SUBTREE; /* * This implies that each ->pmd_entry() handler * needs to know about pmd_trans_huge() pmds */ if (ops->pmd_entry) err = ops->pmd_entry(pmd, addr, next, walk); if (err) break; if (walk->action == ACTION_AGAIN) goto again; if (walk->action == ACTION_CONTINUE) continue; if (!has_handler) { /* No handlers for lower page tables. */ if (!has_install) continue; /* Nothing to do. */ /* * We are ONLY installing, so avoid unnecessarily * splitting a present huge page. */ if (pmd_present(*pmd) && pmd_trans_huge(*pmd)) continue; } if (walk->vma) split_huge_pmd(walk->vma, pmd, addr); else if (pmd_leaf(*pmd) || !pmd_present(*pmd)) continue; /* Nothing to do. */ err = walk_pte_range(pmd, addr, next, walk); if (err) break; if (walk->action == ACTION_AGAIN) goto again; } while (pmd++, addr = next, addr != end); return err; } static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, struct mm_walk *walk) { pud_t *pud; unsigned long next; const struct mm_walk_ops *ops = walk->ops; bool has_handler = ops->pmd_entry || ops->pte_entry; bool has_install = ops->install_pte; int err = 0; int depth = real_depth(2); pud = pud_offset(p4d, addr); do { again: next = pud_addr_end(addr, end); if (pud_none(*pud)) { if (has_install) err = __pmd_alloc(walk->mm, pud, addr); else if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) break; if (!has_install) continue; } walk->action = ACTION_SUBTREE; if (ops->pud_entry) err = ops->pud_entry(pud, addr, next, walk); if (err) break; if (walk->action == ACTION_AGAIN) goto again; if (walk->action == ACTION_CONTINUE) continue; if (!has_handler) { /* No handlers for lower page tables. */ if (!has_install) continue; /* Nothing to do. */ /* * We are ONLY installing, so avoid unnecessarily * splitting a present huge page. */ if (pud_present(*pud) && pud_trans_huge(*pud)) continue; } if (walk->vma) split_huge_pud(walk->vma, pud, addr); else if (pud_leaf(*pud) || !pud_present(*pud)) continue; /* Nothing to do. */ if (pud_none(*pud)) goto again; err = walk_pmd_range(pud, addr, next, walk); if (err) break; } while (pud++, addr = next, addr != end); return err; } static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, struct mm_walk *walk) { p4d_t *p4d; unsigned long next; const struct mm_walk_ops *ops = walk->ops; bool has_handler = ops->pud_entry || ops->pmd_entry || ops->pte_entry; bool has_install = ops->install_pte; int err = 0; int depth = real_depth(1); p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) { if (has_install) err = __pud_alloc(walk->mm, p4d, addr); else if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) break; if (!has_install) continue; } if (ops->p4d_entry) { err = ops->p4d_entry(p4d, addr, next, walk); if (err) break; } if (has_handler || has_install) err = walk_pud_range(p4d, addr, next, walk); if (err) break; } while (p4d++, addr = next, addr != end); return err; } static int walk_pgd_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { pgd_t *pgd; unsigned long next; const struct mm_walk_ops *ops = walk->ops; bool has_handler = ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry; bool has_install = ops->install_pte; int err = 0; if (walk->pgd) pgd = walk->pgd + pgd_index(addr); else pgd = pgd_offset(walk->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { if (has_install) err = __p4d_alloc(walk->mm, pgd, addr); else if (ops->pte_hole) err = ops->pte_hole(addr, next, 0, walk); if (err) break; if (!has_install) continue; } if (ops->pgd_entry) { err = ops->pgd_entry(pgd, addr, next, walk); if (err) break; } if (has_handler || has_install) err = walk_p4d_range(pgd, addr, next, walk); if (err) break; } while (pgd++, addr = next, addr != end); return err; } #ifdef CONFIG_HUGETLB_PAGE static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, unsigned long end) { unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h); return min(boundary, end); } static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; struct hstate *h = hstate_vma(vma); unsigned long next; unsigned long hmask = huge_page_mask(h); unsigned long sz = huge_page_size(h); pte_t *pte; const struct mm_walk_ops *ops = walk->ops; int err = 0; hugetlb_vma_lock_read(vma); do { next = hugetlb_entry_end(h, addr, end); pte = hugetlb_walk(vma, addr & hmask, sz); if (pte) err = ops->hugetlb_entry(pte, hmask, addr, next, walk); else if (ops->pte_hole) err = ops->pte_hole(addr, next, -1, walk); if (err) break; } while (addr = next, addr != end); hugetlb_vma_unlock_read(vma); return err; } #else /* CONFIG_HUGETLB_PAGE */ static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { return 0; } #endif /* CONFIG_HUGETLB_PAGE */ /* * Decide whether we really walk over the current vma on [@start, @end) * or skip it via the returned value. Return 0 if we do walk over the * current vma, and return 1 if we skip the vma. Negative values means * error, where we abort the current walk. */ static int walk_page_test(unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; const struct mm_walk_ops *ops = walk->ops; if (ops->test_walk) return ops->test_walk(start, end, walk); /* * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP * range, so we don't walk over it as we do for normal vmas. However, * Some callers are interested in handling hole range and they don't * want to just ignore any single address range. Such users certainly * define their ->pte_hole() callbacks, so let's delegate them to handle * vma(VM_PFNMAP). */ if (vma->vm_flags & VM_PFNMAP) { int err = 1; if (ops->pte_hole) err = ops->pte_hole(start, end, -1, walk); return err ? err : 1; } return 0; } static int __walk_page_range(unsigned long start, unsigned long end, struct mm_walk *walk) { int err = 0; struct vm_area_struct *vma = walk->vma; const struct mm_walk_ops *ops = walk->ops; bool is_hugetlb = is_vm_hugetlb_page(vma); /* We do not support hugetlb PTE installation. */ if (ops->install_pte && is_hugetlb) return -EINVAL; if (ops->pre_vma) { err = ops->pre_vma(start, end, walk); if (err) return err; } if (is_hugetlb) { if (ops->hugetlb_entry) err = walk_hugetlb_range(start, end, walk); } else err = walk_pgd_range(start, end, walk); if (ops->post_vma) ops->post_vma(walk); return err; } static inline void process_mm_walk_lock(struct mm_struct *mm, enum page_walk_lock walk_lock) { if (walk_lock == PGWALK_RDLOCK) mmap_assert_locked(mm); else if (walk_lock != PGWALK_VMA_RDLOCK_VERIFY) mmap_assert_write_locked(mm); } static inline void process_vma_walk_lock(struct vm_area_struct *vma, enum page_walk_lock walk_lock) { #ifdef CONFIG_PER_VMA_LOCK switch (walk_lock) { case PGWALK_WRLOCK: vma_start_write(vma); break; case PGWALK_WRLOCK_VERIFY: vma_assert_write_locked(vma); break; case PGWALK_VMA_RDLOCK_VERIFY: vma_assert_locked(vma); break; case PGWALK_RDLOCK: /* PGWALK_RDLOCK is handled by process_mm_walk_lock */ break; } #endif } /* * See the comment for walk_page_range(), this performs the heavy lifting of the * operation, only sets no restrictions on how the walk proceeds. * * We usually restrict the ability to install PTEs, but this functionality is * available to internal memory management code and provided in mm/internal.h. */ int walk_page_range_mm_unsafe(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) { int err = 0; unsigned long next; struct vm_area_struct *vma; struct mm_walk walk = { .ops = ops, .mm = mm, .private = private, }; if (start >= end) return -EINVAL; if (!walk.mm) return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); vma = find_vma(walk.mm, start); do { if (!vma) { /* after the last vma */ walk.vma = NULL; next = end; if (ops->pte_hole) err = ops->pte_hole(start, next, -1, &walk); } else if (start < vma->vm_start) { /* outside vma */ walk.vma = NULL; next = min(end, vma->vm_start); if (ops->pte_hole) err = ops->pte_hole(start, next, -1, &walk); } else { /* inside vma */ process_vma_walk_lock(vma, ops->walk_lock); walk.vma = vma; next = min(end, vma->vm_end); vma = find_vma(mm, vma->vm_end); err = walk_page_test(start, next, &walk); if (err > 0) { /* * positive return values are purely for * controlling the pagewalk, so should never * be passed to the callers. */ err = 0; continue; } if (err < 0) break; err = __walk_page_range(start, next, &walk); } if (err) break; } while (start = next, start < end); return err; } /* * Determine if the walk operations specified are permitted to be used for a * page table walk. * * This check is performed on all functions which are parameterised by walk * operations and exposed in include/linux/pagewalk.h. * * Internal memory management code can use *_unsafe() functions to be able to * use all page walking operations. */ static bool check_ops_safe(const struct mm_walk_ops *ops) { /* * The installation of PTEs is solely under the control of memory * management logic and subject to many subtle locking, security and * cache considerations so we cannot permit other users to do so, and * certainly not for exported symbols. */ if (ops->install_pte) return false; return true; } /** * walk_page_range - walk page table with caller specific callbacks * @mm: mm_struct representing the target process of page table walk * @start: start address of the virtual address range * @end: end address of the virtual address range * @ops: operation to call during the walk * @private: private data for callbacks' usage * * Recursively walk the page table tree of the process represented by @mm * within the virtual address range [@start, @end). During walking, we can do * some caller-specific works for each entry, by setting up pmd_entry(), * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these * callbacks, the associated entries/pages are just ignored. * The return values of these callbacks are commonly defined like below: * * - 0 : succeeded to handle the current entry, and if you don't reach the * end address yet, continue to walk. * - >0 : succeeded to handle the current entry, and return to the caller * with caller specific value. * - <0 : failed to handle the current entry, and return to the caller * with error code. * * Before starting to walk page table, some callers want to check whether * they really want to walk over the current vma, typically by checking * its vm_flags. walk_page_test() and @ops->test_walk() are used for this * purpose. * * If operations need to be staged before and committed after a vma is walked, * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(), * since it is intended to handle commit-type operations, can't return any * errors. * * struct mm_walk keeps current values of some common data like vma and pmd, * which are useful for the access from callbacks. If you want to pass some * caller-specific data to callbacks, @private should be helpful. * * Locking: * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock, * because these function traverse vma list and/or access to vma's data. */ int walk_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) { if (!check_ops_safe(ops)) return -EINVAL; return walk_page_range_mm_unsafe(mm, start, end, ops, private); } /** * walk_kernel_page_table_range - walk a range of kernel pagetables. * @start: start address of the virtual address range * @end: end address of the virtual address range * @ops: operation to call during the walk * @pgd: pgd to walk if different from mm->pgd * @private: private data for callbacks' usage * * Similar to walk_page_range() but can walk any page tables even if they are * not backed by VMAs. Because 'unusual' entries may be walked this function * will also not lock the PTEs for the pte_entry() callback. This is useful for * walking kernel pages tables or page tables for firmware. * * Note: Be careful to walk the kernel pages tables, the caller may be need to * take other effective approaches (mmap lock may be insufficient) to prevent * the intermediate kernel page tables belonging to the specified address range * from being freed (e.g. memory hot-remove). */ int walk_kernel_page_table_range(unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private) { /* * Kernel intermediate page tables are usually not freed, so the mmap * read lock is sufficient. But there are some exceptions. * E.g. memory hot-remove. In which case, the mmap lock is insufficient * to prevent the intermediate kernel pages tables belonging to the * specified address range from being freed. The caller should take * other actions to prevent this race. */ mmap_assert_locked(&init_mm); return walk_kernel_page_table_range_lockless(start, end, ops, pgd, private); } /* * Use this function to walk the kernel page tables locklessly. It should be * guaranteed that the caller has exclusive access over the range they are * operating on - that there should be no concurrent access, for example, * changing permissions for vmalloc objects. */ int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private) { struct mm_walk walk = { .ops = ops, .mm = &init_mm, .pgd = pgd, .private = private, .no_vma = true }; if (start >= end) return -EINVAL; if (!check_ops_safe(ops)) return -EINVAL; return walk_pgd_range(start, end, &walk); } /** * walk_page_range_debug - walk a range of pagetables not backed by a vma * @mm: mm_struct representing the target process of page table walk * @start: start address of the virtual address range * @end: end address of the virtual address range * @ops: operation to call during the walk * @pgd: pgd to walk if different from mm->pgd * @private: private data for callbacks' usage * * Similar to walk_page_range() but can walk any page tables even if they are * not backed by VMAs. Because 'unusual' entries may be walked this function * will also not lock the PTEs for the pte_entry() callback. * * This is for debugging purposes ONLY. */ int walk_page_range_debug(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private) { struct mm_walk walk = { .ops = ops, .mm = mm, .pgd = pgd, .private = private, .no_vma = true }; /* For convenience, we allow traversal of kernel mappings. */ if (mm == &init_mm) return walk_kernel_page_table_range(start, end, ops, pgd, private); if (start >= end || !walk.mm) return -EINVAL; if (!check_ops_safe(ops)) return -EINVAL; /* * The mmap lock protects the page walker from changes to the page * tables during the walk. However a read lock is insufficient to * protect those areas which don't have a VMA as munmap() detaches * the VMAs before downgrading to a read lock and actually tearing * down PTEs/page tables. In which case, the mmap write lock should * be held. */ mmap_assert_write_locked(mm); return walk_pgd_range(start, end, &walk); } int walk_page_range_vma_unsafe(struct vm_area_struct *vma, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) { struct mm_walk walk = { .ops = ops, .mm = vma->vm_mm, .vma = vma, .private = private, }; if (start >= end || !walk.mm) return -EINVAL; if (start < vma->vm_start || end > vma->vm_end) return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); process_vma_walk_lock(vma, ops->walk_lock); return __walk_page_range(start, end, &walk); } int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) { if (!check_ops_safe(ops)) return -EINVAL; return walk_page_range_vma_unsafe(vma, start, end, ops, private); } int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private) { struct mm_walk walk = { .ops = ops, .mm = vma->vm_mm, .vma = vma, .private = private, }; if (!walk.mm) return -EINVAL; if (!check_ops_safe(ops)) return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); process_vma_walk_lock(vma, ops->walk_lock); return __walk_page_range(vma->vm_start, vma->vm_end, &walk); } /** * walk_page_mapping - walk all memory areas mapped into a struct address_space. * @mapping: Pointer to the struct address_space * @first_index: First page offset in the address_space * @nr: Number of incremental page offsets to cover * @ops: operation to call during the walk * @private: private data for callbacks' usage * * This function walks all memory areas mapped into a struct address_space. * The walk is limited to only the given page-size index range, but if * the index boundaries cross a huge page-table entry, that entry will be * included. * * Also see walk_page_range() for additional information. * * Locking: * This function can't require that the struct mm_struct::mmap_lock is held, * since @mapping may be mapped by multiple processes. Instead * @mapping->i_mmap_rwsem must be held. This might have implications in the * callbacks, and it's up tho the caller to ensure that the * struct mm_struct::mmap_lock is not needed. * * Also this means that a caller can't rely on the struct * vm_area_struct::vm_flags to be constant across a call, * except for immutable flags. Callers requiring this shouldn't use * this function. * * Return: 0 on success, negative error code on failure, positive number on * caller defined premature termination. */ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, pgoff_t nr, const struct mm_walk_ops *ops, void *private) { struct mm_walk walk = { .ops = ops, .private = private, }; struct vm_area_struct *vma; pgoff_t vba, vea, cba, cea; unsigned long start_addr, end_addr; int err = 0; if (!check_ops_safe(ops)) return -EINVAL; lockdep_assert_held(&mapping->i_mmap_rwsem); vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index, first_index + nr - 1) { /* Clip to the vma */ vba = vma->vm_pgoff; vea = vba + vma_pages(vma); cba = first_index; cba = max(cba, vba); cea = first_index + nr; cea = min(cea, vea); start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start; end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start; if (start_addr >= end_addr) continue; walk.vma = vma; walk.mm = vma->vm_mm; err = walk_page_test(vma->vm_start, vma->vm_end, &walk); if (err > 0) { err = 0; break; } else if (err < 0) break; err = __walk_page_range(start_addr, end_addr, &walk); if (err) break; } return err; } /** * folio_walk_start - walk the page tables to a folio * @fw: filled with information on success. * @vma: the VMA. * @addr: the virtual address to use for the page table walk. * @flags: flags modifying which folios to walk to. * * Walk the page tables using @addr in a given @vma to a mapped folio and * return the folio, making sure that the page table entry referenced by * @addr cannot change until folio_walk_end() was called. * * As default, this function returns only folios that are not special (e.g., not * the zeropage) and never returns folios that are supposed to be ignored by the * VM as documented by vm_normal_page(). If requested, zeropages will be * returned as well. * * As default, this function only considers present page table entries. * If requested, it will also consider migration entries. * * If this function returns NULL it might either indicate "there is nothing" or * "there is nothing suitable". * * On success, @fw is filled and the function returns the folio while the PTL * is still held and folio_walk_end() must be called to clean up, * releasing any held locks. The returned folio must *not* be used after the * call to folio_walk_end(), unless a short-term folio reference is taken before * that call. * * @fw->page will correspond to the page that is effectively referenced by * @addr. However, for migration entries and shared zeropages @fw->page is * set to NULL. Note that large folios might be mapped by multiple page table * entries, and this function will always only lookup a single entry as * specified by @addr, which might or might not cover more than a single page of * the returned folio. * * This function must *not* be used as a naive replacement for * get_user_pages() / pin_user_pages(), especially not to perform DMA or * to carelessly modify page content. This function may *only* be used to grab * short-term folio references, never to grab long-term folio references. * * Using the page table entry pointers in @fw for reading or modifying the * entry should be avoided where possible: however, there might be valid * use cases. * * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care. * For example, PMD page table sharing might require prior unsharing. Also, * logical hugetlb entries might span multiple physical page table entries, * which *must* be modified in a single operation (set_huge_pte_at(), * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might * not correspond to the first physical entry of a logical hugetlb entry. * * The mmap lock must be held in read mode. * * Return: folio pointer on success, otherwise NULL. */ struct folio *folio_walk_start(struct folio_walk *fw, struct vm_area_struct *vma, unsigned long addr, folio_walk_flags_t flags) { unsigned long entry_size; bool expose_page = true; struct page *page; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; spinlock_t *ptl; pgd_t *pgdp; p4d_t *p4dp; mmap_assert_locked(vma->vm_mm); vma_pgtable_walk_begin(vma); if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end)) goto not_found; pgdp = pgd_offset(vma->vm_mm, addr); if (pgd_none_or_clear_bad(pgdp)) goto not_found; p4dp = p4d_offset(pgdp, addr); if (p4d_none_or_clear_bad(p4dp)) goto not_found; pudp = pud_offset(p4dp, addr); pud = pudp_get(pudp); if (pud_none(pud)) goto not_found; if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && (!pud_present(pud) || pud_leaf(pud))) { ptl = pud_lock(vma->vm_mm, pudp); pud = pudp_get(pudp); entry_size = PUD_SIZE; fw->level = FW_LEVEL_PUD; fw->pudp = pudp; fw->pud = pud; if (pud_none(pud)) { spin_unlock(ptl); goto not_found; } else if (pud_present(pud) && !pud_leaf(pud)) { spin_unlock(ptl); goto pmd_table; } else if (pud_present(pud)) { page = vm_normal_page_pud(vma, addr, pud); if (page) goto found; } /* * TODO: FW_MIGRATION support for PUD migration entries * once there are relevant users. */ spin_unlock(ptl); goto not_found; } pmd_table: VM_WARN_ON_ONCE(!pud_present(pud) || pud_leaf(pud)); pmdp = pmd_offset(pudp, addr); pmd = pmdp_get_lockless(pmdp); if (pmd_none(pmd)) goto not_found; if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && (!pmd_present(pmd) || pmd_leaf(pmd))) { ptl = pmd_lock(vma->vm_mm, pmdp); pmd = pmdp_get(pmdp); entry_size = PMD_SIZE; fw->level = FW_LEVEL_PMD; fw->pmdp = pmdp; fw->pmd = pmd; if (pmd_none(pmd)) { spin_unlock(ptl); goto not_found; } else if (pmd_present(pmd) && !pmd_leaf(pmd)) { spin_unlock(ptl); goto pte_table; } else if (pmd_present(pmd)) { page = vm_normal_page_pmd(vma, addr, pmd); if (page) { goto found; } else if ((flags & FW_ZEROPAGE) && is_huge_zero_pmd(pmd)) { page = pfn_to_page(pmd_pfn(pmd)); expose_page = false; goto found; } } else if ((flags & FW_MIGRATION) && pmd_is_migration_entry(pmd)) { const softleaf_t entry = softleaf_from_pmd(pmd); page = softleaf_to_page(entry); expose_page = false; goto found; } spin_unlock(ptl); goto not_found; } pte_table: VM_WARN_ON_ONCE(!pmd_present(pmd) || pmd_leaf(pmd)); ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl); if (!ptep) goto not_found; pte = ptep_get(ptep); entry_size = PAGE_SIZE; fw->level = FW_LEVEL_PTE; fw->ptep = ptep; fw->pte = pte; if (pte_present(pte)) { page = vm_normal_page(vma, addr, pte); if (page) goto found; if ((flags & FW_ZEROPAGE) && is_zero_pfn(pte_pfn(pte))) { page = pfn_to_page(pte_pfn(pte)); expose_page = false; goto found; } } else if (!pte_none(pte)) { const softleaf_t entry = softleaf_from_pte(pte); if ((flags & FW_MIGRATION) && softleaf_is_migration(entry)) { page = softleaf_to_page(entry); expose_page = false; goto found; } } pte_unmap_unlock(ptep, ptl); not_found: vma_pgtable_walk_end(vma); return NULL; found: if (expose_page) /* Note: Offset from the mapped page, not the folio start. */ fw->page = page + ((addr & (entry_size - 1)) >> PAGE_SHIFT); else fw->page = NULL; fw->ptl = ptl; return page_folio(page); }
1 1 1 1 1 4 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 // SPDX-License-Identifier: GPL-2.0-or-later /* * sch_plug.c Queue traffic until an explicit release command * * There are two ways to use this qdisc: * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating * sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands. * * 2. For network output buffering (a.k.a output commit) functionality. * Output commit property is commonly used by applications using checkpoint * based fault-tolerance to ensure that the checkpoint from which a system * is being restored is consistent w.r.t outside world. * * Consider for e.g. Remus - a Virtual Machine checkpointing system, * wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated * asynchronously to the backup host, while the VM continues executing the * next epoch speculatively. * * The following is a typical sequence of output buffer operations: * 1.At epoch i, start_buffer(i) * 2. At end of epoch i (i.e. after 50ms): * 2.1 Stop VM and take checkpoint(i). * 2.2 start_buffer(i+1) and Resume VM * 3. While speculatively executing epoch(i+1), asynchronously replicate * checkpoint(i) to backup host. * 4. When checkpoint_ack(i) is received from backup, release_buffer(i) * Thus, this Qdisc would receive the following sequence of commands: * TCQ_PLUG_BUFFER (epoch i) * .. TCQ_PLUG_BUFFER (epoch i+1) * ....TCQ_PLUG_RELEASE_ONE (epoch i) * ......TCQ_PLUG_BUFFER (epoch i+2) * ........ */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> /* * State of the queue, when used for network output buffering: * * plug(i+1) plug(i) head * ------------------+--------------------+----------------> * | | * | | * pkts_current_epoch| pkts_last_epoch |pkts_to_release * ----------------->|<--------+--------->|+---------------> * v v * */ struct plug_sched_data { /* If true, the dequeue function releases all packets * from head to end of the queue. The queue turns into * a pass-through queue for newly arriving packets. */ bool unplug_indefinite; bool throttled; /* Queue Limit in bytes */ u32 limit; /* Number of packets (output) from the current speculatively * executing epoch. */ u32 pkts_current_epoch; /* Number of packets corresponding to the recently finished * epoch. These will be released when we receive a * TCQ_PLUG_RELEASE_ONE command. This command is typically * issued after committing a checkpoint at the target. */ u32 pkts_last_epoch; /* * Number of packets from the head of the queue, that can * be released (committed checkpoint). */ u32 pkts_to_release; }; static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct plug_sched_data *q = qdisc_priv(sch); if (likely(sch->qstats.backlog + skb->len <= q->limit)) { if (!q->unplug_indefinite) q->pkts_current_epoch++; return qdisc_enqueue_tail(skb, sch); } return qdisc_drop(skb, sch, to_free); } static struct sk_buff *plug_dequeue(struct Qdisc *sch) { struct plug_sched_data *q = qdisc_priv(sch); if (q->throttled) return NULL; if (!q->unplug_indefinite) { if (!q->pkts_to_release) { /* No more packets to dequeue. Block the queue * and wait for the next release command. */ q->throttled = true; return NULL; } q->pkts_to_release--; } return qdisc_dequeue_head(sch); } static int plug_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct plug_sched_data *q = qdisc_priv(sch); q->pkts_current_epoch = 0; q->pkts_last_epoch = 0; q->pkts_to_release = 0; q->unplug_indefinite = false; if (opt == NULL) { q->limit = qdisc_dev(sch)->tx_queue_len * psched_mtu(qdisc_dev(sch)); } else { struct tc_plug_qopt *ctl = nla_data(opt); if (nla_len(opt) < sizeof(*ctl)) return -EINVAL; q->limit = ctl->limit; } q->throttled = true; return 0; } /* Receives 4 types of messages: * TCQ_PLUG_BUFFER: Inset a plug into the queue and * buffer any incoming packets * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head * to beginning of the next plug. * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. * Stop buffering packets until the next TCQ_PLUG_BUFFER * command is received (just act as a pass-thru queue). * TCQ_PLUG_LIMIT: Increase/decrease queue size */ static int plug_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct plug_sched_data *q = qdisc_priv(sch); struct tc_plug_qopt *msg; msg = nla_data(opt); if (nla_len(opt) < sizeof(*msg)) return -EINVAL; switch (msg->action) { case TCQ_PLUG_BUFFER: /* Save size of the current buffer */ q->pkts_last_epoch = q->pkts_current_epoch; q->pkts_current_epoch = 0; if (q->unplug_indefinite) q->throttled = true; q->unplug_indefinite = false; break; case TCQ_PLUG_RELEASE_ONE: /* Add packets from the last complete buffer to the * packets to be released set. */ q->pkts_to_release += q->pkts_last_epoch; q->pkts_last_epoch = 0; q->throttled = false; netif_schedule_queue(sch->dev_queue); break; case TCQ_PLUG_RELEASE_INDEFINITE: q->unplug_indefinite = true; q->pkts_to_release = 0; q->pkts_last_epoch = 0; q->pkts_current_epoch = 0; q->throttled = false; netif_schedule_queue(sch->dev_queue); break; case TCQ_PLUG_LIMIT: /* Limit is supplied in bytes */ q->limit = msg->limit; break; default: return -EINVAL; } return 0; } static struct Qdisc_ops plug_qdisc_ops __read_mostly = { .id = "plug", .priv_size = sizeof(struct plug_sched_data), .enqueue = plug_enqueue, .dequeue = plug_dequeue, .peek = qdisc_peek_dequeued, .init = plug_init, .change = plug_change, .reset = qdisc_reset_queue, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("plug"); static int __init plug_module_init(void) { return register_qdisc(&plug_qdisc_ops); } static void __exit plug_module_exit(void) { unregister_qdisc(&plug_qdisc_ops); } module_init(plug_module_init) module_exit(plug_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Qdisc to plug and unplug traffic via netlink control");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_IO_H #define _ASM_X86_IO_H /* * This file contains the definitions for the x86 IO instructions * inb/inw/inl/outb/outw/outl and the "string versions" of the same * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" * versions of the single-IO instructions (inb_p/inw_p/..). * * This file is not meant to be obfuscating: it's just complicated * to (a) handle it all in a way that makes gcc able to optimize it * as well as possible and (b) trying to avoid writing the same thing * over and over again with slight variations and possibly making a * mistake somewhere. */ /* * Thanks to James van Artsdalen for a better timing-fix than * the two short jumps: using outb's to a nonexistent port seems * to guarantee better timings even on fast machines. * * On the other hand, I'd like to be sure of a non-existent port: * I feel a bit unsafe about using 0x80 (should be safe, though) * * Linus */ /* * Bit simplified and optimized by Jan Hubicka * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. * * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, * isa_read[wl] and isa_write[wl] fixed * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #include <linux/string.h> #include <linux/compiler.h> #include <linux/cc_platform.h> #include <asm/page.h> #include <asm/early_ioremap.h> #include <asm/pgtable_types.h> #include <asm/shared/io.h> #include <asm/special_insns.h> #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ { type ret; asm volatile("mov" size " %1,%0":reg (ret) \ :"m" (*(volatile type __force *)addr) barrier); return ret; } #define build_mmio_write(name, size, type, reg, barrier) \ static inline void name(type val, volatile void __iomem *addr) \ { asm volatile("mov" size " %0,%1": :reg (val), \ "m" (*(volatile type __force *)addr) barrier); } build_mmio_read(readb, "b", unsigned char, "=q", :"memory") build_mmio_read(readw, "w", unsigned short, "=r", :"memory") build_mmio_read(readl, "l", unsigned int, "=r", :"memory") build_mmio_read(__readb, "b", unsigned char, "=q", ) build_mmio_read(__readw, "w", unsigned short, "=r", ) build_mmio_read(__readl, "l", unsigned int, "=r", ) build_mmio_write(writeb, "b", unsigned char, "q", :"memory") build_mmio_write(writew, "w", unsigned short, "r", :"memory") build_mmio_write(writel, "l", unsigned int, "r", :"memory") build_mmio_write(__writeb, "b", unsigned char, "q", ) build_mmio_write(__writew, "w", unsigned short, "r", ) build_mmio_write(__writel, "l", unsigned int, "r", ) #define readb readb #define readw readw #define readl readl #define readb_relaxed(a) __readb(a) #define readw_relaxed(a) __readw(a) #define readl_relaxed(a) __readl(a) #define __raw_readb __readb #define __raw_readw __readw #define __raw_readl __readl #define writeb writeb #define writew writew #define writel writel #define writeb_relaxed(v, a) __writeb(v, a) #define writew_relaxed(v, a) __writew(v, a) #define writel_relaxed(v, a) __writel(v, a) #define __raw_writeb __writeb #define __raw_writew __writew #define __raw_writel __writel #ifdef CONFIG_X86_64 build_mmio_read(readq, "q", u64, "=r", :"memory") build_mmio_read(__readq, "q", u64, "=r", ) build_mmio_write(writeq, "q", u64, "r", :"memory") build_mmio_write(__writeq, "q", u64, "r", ) #define readq_relaxed(a) __readq(a) #define writeq_relaxed(v, a) __writeq(v, a) #define __raw_readq __readq #define __raw_writeq __writeq /* Let people know that we have them */ #define readq readq #define writeq writeq #endif #define ARCH_HAS_VALID_PHYS_ADDR_RANGE extern int valid_phys_addr_range(phys_addr_t addr, size_t size); extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size); /** * virt_to_phys - map virtual addresses to physical * @address: address to remap * * The returned physical address is the physical (CPU) mapping for * the memory address given. It is only valid to use this function on * addresses directly mapped or allocated via kmalloc. * * This function does not give bus mappings for DMA transfers. In * almost all conceivable cases a device driver should not be using * this function */ static inline phys_addr_t virt_to_phys(volatile void *address) { return __pa(address); } #define virt_to_phys virt_to_phys /** * phys_to_virt - map physical address to virtual * @address: address to remap * * The returned virtual address is a current CPU mapping for * the memory address given. It is only valid to use this function on * addresses that have a kernel mapping * * This function does not handle bus mappings for DMA transfers. In * almost all conceivable cases a device driver should not be using * this function */ static inline void *phys_to_virt(phys_addr_t address) { return __va(address); } #define phys_to_virt phys_to_virt /* * ISA I/O bus memory addresses are 1:1 with the physical address. * However, we truncate the address to unsigned int to avoid undesirable * promotions in legacy drivers. */ static inline unsigned int isa_virt_to_bus(volatile void *address) { return (unsigned int)virt_to_phys(address); } #define isa_bus_to_virt phys_to_virt /* * The default ioremap() behavior is non-cached; if you need something * else, you probably want one of the following. */ extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); #define ioremap_uc ioremap_uc extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); #define ioremap_cache ioremap_cache extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, pgprot_t prot); #define ioremap_prot ioremap_prot extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size); #define ioremap_encrypted ioremap_encrypted void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags); #define arch_memremap_wb arch_memremap_wb /** * ioremap - map bus memory into CPU space * @offset: bus address of the memory * @size: size of the resource to map * * ioremap performs a platform specific sequence of operations to * make bus memory CPU accessible via the readb/readw/readl/writeb/ * writew/writel functions and the other mmio helpers. The returned * address is not guaranteed to be usable directly as a virtual * address. * * If the area you are trying to map is a PCI BAR you should have a * look at pci_iomap(). */ void __iomem *ioremap(resource_size_t offset, unsigned long size); #define ioremap ioremap extern void iounmap(volatile void __iomem *addr); #define iounmap iounmap #ifdef __KERNEL__ void memcpy_fromio(void *, const volatile void __iomem *, size_t); void memcpy_toio(volatile void __iomem *, const void *, size_t); void memset_io(volatile void __iomem *, int, size_t); #define memcpy_fromio memcpy_fromio #define memcpy_toio memcpy_toio #define memset_io memset_io #ifdef CONFIG_X86_64 /* * Commit 0f07496144c2 ("[PATCH] Add faster __iowrite32_copy routine for * x86_64") says that circa 2006 rep movsl is noticeably faster than a copy * loop. */ static inline void __iowrite32_copy(void __iomem *to, const void *from, size_t count) { asm volatile("rep movsl" : "=&c"(count), "=&D"(to), "=&S"(from) : "0"(count), "1"(to), "2"(from) : "memory"); } #define __iowrite32_copy __iowrite32_copy #endif /* * ISA space is 'always mapped' on a typical x86 system, no need to * explicitly ioremap() it. The fact that the ISA IO space is mapped * to PAGE_OFFSET is pure coincidence - it does not mean ISA values * are physical addresses. The following constant pointer can be * used as the IO-area pointer (it can be iounmapped as well, so the * analogy with PCI is quite large): */ #define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) #endif /* __KERNEL__ */ extern void native_io_delay(void); extern int io_delay_type; extern void io_delay_init(void); #if defined(CONFIG_PARAVIRT) #include <asm/paravirt.h> #else static inline void slow_down_io(void) { native_io_delay(); #ifdef REALLY_SLOW_IO native_io_delay(); native_io_delay(); native_io_delay(); #endif } #endif #define BUILDIO(bwl, type) \ static inline void out##bwl##_p(type value, u16 port) \ { \ out##bwl(value, port); \ slow_down_io(); \ } \ \ static inline type in##bwl##_p(u16 port) \ { \ type value = in##bwl(port); \ slow_down_io(); \ return value; \ } \ \ static inline void outs##bwl(u16 port, const void *addr, unsigned long count) \ { \ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \ type *value = (type *)addr; \ while (count) { \ out##bwl(*value, port); \ value++; \ count--; \ } \ } else { \ asm volatile("rep outs" #bwl \ : "+S"(addr), "+c"(count) \ : "d"(port) : "memory"); \ } \ } \ \ static inline void ins##bwl(u16 port, void *addr, unsigned long count) \ { \ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \ type *value = (type *)addr; \ while (count) { \ *value = in##bwl(port); \ value++; \ count--; \ } \ } else { \ asm volatile("rep ins" #bwl \ : "+D"(addr), "+c"(count) \ : "d"(port) : "memory"); \ } \ } BUILDIO(b, u8) BUILDIO(w, u16) BUILDIO(l, u32) #undef BUILDIO #define inb_p inb_p #define inw_p inw_p #define inl_p inl_p #define insb insb #define insw insw #define insl insl #define outb_p outb_p #define outw_p outw_p #define outl_p outl_p #define outsb outsb #define outsw outsw #define outsl outsl extern void *xlate_dev_mem_ptr(phys_addr_t phys); extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); #define xlate_dev_mem_ptr xlate_dev_mem_ptr #define unxlate_dev_mem_ptr unxlate_dev_mem_ptr extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, enum page_cache_mode pcm); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); #define ioremap_wc ioremap_wc extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); #define ioremap_wt ioremap_wt extern bool is_early_ioremap_ptep(pte_t *ptep); #define IO_SPACE_LIMIT 0xffff #include <asm-generic/io.h> #undef PCI_IOBASE #ifdef CONFIG_MTRR extern int __must_check arch_phys_wc_index(int handle); #define arch_phys_wc_index arch_phys_wc_index extern int __must_check arch_phys_wc_add(unsigned long base, unsigned long size); extern void arch_phys_wc_del(int handle); #define arch_phys_wc_add arch_phys_wc_add #endif #ifdef CONFIG_X86_PAT extern int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size); extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size); #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc #endif #ifdef CONFIG_AMD_MEM_ENCRYPT extern bool arch_memremap_can_ram_remap(resource_size_t offset, unsigned long size, unsigned long flags); #define arch_memremap_can_ram_remap arch_memremap_can_ram_remap extern bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size); #else static inline bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size) { return true; } #endif /** * iosubmit_cmds512 - copy data to single MMIO location, in 512-bit units * @dst: destination, in MMIO space (must be 512-bit aligned) * @src: source * @count: number of 512 bits quantities to submit * * Submit data from kernel space to MMIO space, in units of 512 bits at a * time. Order of access is not guaranteed, nor is a memory barrier * performed afterwards. * * Warning: Do not use this helper unless your driver has checked that the CPU * instruction is supported on the platform. */ static inline void iosubmit_cmds512(void __iomem *dst, const void *src, size_t count) { const u8 *from = src; const u8 *end = from + count * 64; while (from < end) { movdir64b_io(dst, from); from += 64; } } #endif /* _ASM_X86_IO_H */
720 721 561 2 2 2 308 309 294 80 283 234 9 62 386 46 550 18 18 306 8 50 20 20 13 20 13 4 4 4 4 20 20 13 5 1 3 3 20 20 8 5 7 6 20 20 20 1 20 20 13 13 232 233 52 233 77 2 2 8 155 41 151 91 178 41 9 5 27 24 79 79 62 37 35 14 6 321 321 321 320 166 320 320 319 320 320 283 302 301 302 111 112 301 320 222 223 196 27 221 223 220 137 14 147 3 144 47 45 1 2 3 3 127 2 6 142 115 118 205 6 6 1 6 6 6 187 11 112 123 206 10 198 136 99 10 1 3 3 3 3 3 3 3 195 21 288 288 289 287 191 191 233 196 25 12 203 24 6 18 6 3 16 223 208 3 3 185 23 208 4 47 1 1 1 1 189 19 12 2 40 1030 818 1012 592 1033 1025 1035 1029 1033 4 595 1035 1032 43 22 43 1 43 35 22 7 21 43 34 9 2 7 1 42 274 2 3 275 274 274 3 311 62 296 298 295 296 295 296 296 297 296 297 297 274 296 274 273 16 296 271 242 294 288 293 4 270 4 196 189 294 73 290 318 318 264 302 50 317 317 5 5 5 317 5 280 280 249 277 272 57 296 306 262 50 318 318 319 319 288 316 313 314 16 318 263 721 720 722 719 177 720 719 720 719 721 722 1 13 719 716 170 717 716 720 718 703 562 441 438 563 562 278 5 706 706 703 705 706 706 703 561 437 438 57 27 313 71 316 317 317 313 317 317 317 317 27 313 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux INET6 implementation * Forwarding Information Database * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Changes: * Yuji SEKIYA @USAGI: Support default route on router node; * remove ip6_null_entry from the top of * routing table. * Ville Nuorvala: Fixed routing subtrees. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/bpf.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { struct fib6_walker w; struct net *net; int (*func)(struct fib6_info *, void *arg); int sernum; void *arg; bool skip_notify; }; #ifdef CONFIG_IPV6_SUBTREES #define FWS_INIT FWS_S #else #define FWS_INIT FWS_L #endif static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); /* * A routing update causes an increase of the serial number on the * affected subtree. This allows for cached routes to be asynchronously * tested when modifications are made to the destination cache as a * result of redirects, path MTU changes, etc. */ static void fib6_gc_timer_cb(struct timer_list *t); #define FOR_WALKERS(net, w) \ list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh) static void fib6_walker_link(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_add(&w->lh, &net->ipv6.fib6_walkers); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static void fib6_walker_unlink(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_del(&w->lh); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static int fib6_new_sernum(struct net *net) { int new, old = atomic_read(&net->ipv6.fib6_sernum); do { new = old < INT_MAX ? old + 1 : 1; } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new)); return new; } enum { FIB6_NO_SERNUM_CHANGE = 0, }; void fib6_update_sernum(struct net *net, struct fib6_info *f6i) { struct fib6_node *fn; fn = rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net)); } /* * Auxiliary address test functions for the radix tree. * * These assume a 32bit processor (although it will work on * 64bit processors) */ /* * test bit */ #if defined(__LITTLE_ENDIAN) # define BITOP_BE32_SWIZZLE (0x1F & ~7) #else # define BITOP_BE32_SWIZZLE 0 #endif static __be32 addr_bit_set(const void *token, int fn_bit) { const __be32 *addr = token; /* * Here, * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) * is optimized version of * htonl(1 << ((~fn_bit)&0x1F)) * See include/asm-generic/bitops/le.h. */ return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & addr[fn_bit >> 5]; } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; size_t sz = sizeof(*f6i); if (with_fib6_nh) sz += sizeof(struct fib6_nh); f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; /* fib6_siblings is a union with nh_list, so this initializes both */ INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); INIT_HLIST_NODE(&f6i->gc_link); return f6i; } void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); WARN_ON(f6i->fib6_node); if (f6i->nh) nexthop_put(f6i->nh); else fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); if (fn) net->ipv6.rt6_stats->fib_nodes++; return fn; } static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); net->ipv6.rt6_stats->fib_nodes--; } static void node_free(struct net *net, struct fib6_node *fn) { kfree_rcu(fn, rcu); net->ipv6.rt6_stats->fib_nodes--; } static void fib6_free_table(struct fib6_table *table) { inetpeer_invalidate_tree(&table->tb6_peers); kfree(table); } static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; /* * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. */ spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* * No protection necessary, this is the only list mutatation * operation, tables never disappear once they exist. */ hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) { struct fib6_table *table; table = kzalloc_obj(*table, GFP_ATOMIC); if (table) { table->tb6_id = id; rcu_assign_pointer(table->tb6_root.leaf, net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); INIT_HLIST_HEAD(&table->tb6_gc_hlist); } return table; } struct fib6_table *fib6_new_table(struct net *net, u32 id) { struct fib6_table *tb, *new_tb; if (id == 0) id = RT6_TABLE_MAIN; tb = fib6_get_table(net, id); if (tb) return tb; new_tb = fib6_alloc_table(net, id); if (!new_tb) return NULL; spin_lock_bh(&net->ipv6.fib_table_hash_lock); tb = fib6_get_table(net, id); if (unlikely(tb)) { spin_unlock_bh(&net->ipv6.fib_table_hash_lock); kfree(new_tb); return tb; } fib6_link_table(net, new_tb); spin_unlock_bh(&net->ipv6.fib_table_hash_lock); return new_tb; } EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { struct hlist_head *head; struct fib6_table *tb; if (!id) id = RT6_TABLE_MAIN; head = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)]; /* See comment in fib6_link_table(). RCU is not required, * but rcu_dereference_raw() is used to avoid data-race. */ hlist_for_each_entry_rcu(tb, head, tb6_hlist, true) if (tb->tb6_id == id) return tb; return NULL; } EXPORT_SYMBOL_GPL(fib6_get_table); static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); fib6_link_table(net, net->ipv6.fib6_local_tbl); } #else struct fib6_table *fib6_new_table(struct net *net, u32 id) { return fib6_get_table(net, id); } struct fib6_table *fib6_get_table(struct net *net, u32 id) { return net->ipv6.fib6_main_tbl; } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) { struct rt6_info *rt; rt = pol_lookup_func(lookup, net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); } return &rt->dst; } /* called with rcu lock held; no reference taken on fib6_info */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, res, flags); } static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); } #endif unsigned int fib6_tables_seq_read(const struct net *net) { unsigned int h, fib_seq = 0; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { const struct hlist_head *head = &net->ipv6.fib_table_hash[h]; const struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) fib_seq += READ_ONCE(tb->fib_seq); } rcu_read_unlock(); return fib_seq; } static int call_fib6_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; return call_fib6_notifier(nb, event_type, &info.info); } static int call_fib6_multipath_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; return call_fib6_notifier(nb, event_type, &info.info); } int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_multipath_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) { struct fib6_entry_notifier_info info = { .rt = rt, .nsiblings = rt->fib6_nsiblings, }; WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); } struct fib6_dump_arg { struct net *net; struct notifier_block *nb; struct netlink_ext_ack *extack; }; static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE; unsigned int nsiblings; int err; if (!rt || rt == arg->net->ipv6.fib6_null_entry) return 0; nsiblings = READ_ONCE(rt->fib6_nsiblings); if (nsiblings) err = call_fib6_multipath_entry_notifier(arg->nb, fib_event, rt, nsiblings, arg->extack); else err = call_fib6_entry_notifier(arg->nb, fib_event, rt, arg->extack); return err; } static int fib6_node_dump(struct fib6_walker *w) { int err; err = fib6_rt_dump(w->leaf, w->args); w->leaf = NULL; return err; } static int fib6_table_dump(struct net *net, struct fib6_table *tb, struct fib6_walker *w) { int err; w->root = &tb->tb6_root; spin_lock_bh(&tb->tb6_lock); err = fib6_walk(net, w); spin_unlock_bh(&tb->tb6_lock); return err; } /* Called with rcu_read_lock() */ int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib6_dump_arg arg; struct fib6_walker *w; unsigned int h; int err = 0; w = kzalloc_obj(*w, GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_node_dump; arg.net = net; arg.nb = nb; arg.extack = extack; w->args = &arg; for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { err = fib6_table_dump(net, tb, w); if (err) goto out; } } out: kfree(w); /* The tree traversal function should never return a positive value. */ return err > 0 ? -EINVAL : err; } static int fib6_dump_node(struct fib6_walker *w) { int res; struct fib6_info *rt; for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args, w->skip_in_node); if (res >= 0) { /* Frame is full, suspend walking */ w->leaf = rt; /* We'll restart from this node, so if some routes were * already dumped, skip them next time. */ w->skip_in_node += res; return 1; } w->skip_in_node = 0; /* Multipath routes are dumped in one route with the * RTA_MULTIPATH attribute. Jump 'rt' to point to the * last sibling of this route (no need to dump the * sibling routes again) */ if (rt->fib6_nsiblings) rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); } w->leaf = NULL; return 0; } static void fib6_dump_end(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { cb->args[4] = 0; fib6_walker_unlink(net, w); } cb->args[2] = 0; kfree(w); } cb->done = (void *)cb->args[3]; cb->args[1] = 3; } static int fib6_dump_done(struct netlink_callback *cb) { fib6_dump_end(cb); return cb->done ? cb->done(cb) : 0; } static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct fib6_walker *w; int res; w = (void *)cb->args[2]; w->root = &table->tb6_root; if (cb->args[4] == 0) { w->count = 0; w->skip = 0; w->skip_in_node = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; cb->args[5] = READ_ONCE(w->root->fn_sernum); } } else { int sernum = READ_ONCE(w->root->fn_sernum); if (cb->args[5] != sernum) { /* Begin at the root if the tree changed */ cb->args[5] = sernum; w->state = FWS_INIT; w->node = w->root; w->skip = w->count; w->skip_in_node = 0; } else w->skip = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); spin_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; } } return res; } static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, .filter.dump_routes = true, .filter.rtnl_held = false, }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int e = 0, s_e; struct hlist_head *head; struct fib6_walker *w; struct fib6_table *tb; unsigned int h, s_h; int err = 0; rcu_read_lock(); if (cb->strict_check) { err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); if (err < 0) goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); if (rtm->rtm_flags & RTM_F_PREFIX) arg.filter.flags = RTM_F_PREFIX; } w = (void *)cb->args[2]; if (!w) { /* New dump: * * 1. allocate and initialize walker. */ w = kzalloc_obj(*w, GFP_ATOMIC); if (!w) { err = -ENOMEM; goto unlock; } w->func = fib6_dump_node; cb->args[2] = (long)w; /* 2. hook callback destructor. */ cb->args[3] = (long)cb->done; cb->done = fib6_dump_done; } arg.skb = skb; arg.cb = cb; arg.net = net; w->args = &arg; if (arg.filter.table_id) { tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET6) goto unlock; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); err = -ENOENT; goto unlock; } if (!cb->args[0]) { err = fib6_dump_table(tb, skb, cb); if (!err) cb->args[0] = 1; } goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; err = fib6_dump_table(tb, skb, cb); if (err != 0) goto out; next: e++; } } out: cb->args[1] = e; cb->args[0] = h; unlock: rcu_read_unlock(); if (err <= 0) fib6_dump_end(cb); return err; } void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) { if (!f6i) return; if (f6i->fib6_metrics == &dst_default_metrics) { struct dst_metrics *p = kzalloc_obj(*p, GFP_ATOMIC); if (!p) return; refcount_set(&p->refcnt, 1); f6i->fib6_metrics = p; } f6i->fib6_metrics->metrics[metric - 1] = val; } /* * Routing Table * * return the appropriate node for a routing tree "add" operation * by either creating and inserting or by returning an existing * node. */ static struct fib6_node *fib6_add_1(struct net *net, struct fib6_table *table, struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, int replace_required, struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; struct rt6key *key; int bit; __be32 dir = 0; /* insert node in tree */ fn = root; do { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { if (!allow_create) { if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } goto insert_above; } /* * Exact match ? */ if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); fib6_info_release(leaf); /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == net->ipv6.fib6_null_entry) { RCU_INIT_POINTER(fn->leaf, NULL); } return fn; } /* * We have more bits to go */ /* Try to walk down on tree. */ dir = addr_bit_set(addr, fn->fn_bit); pn = fn; fn = dir ? rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)) : rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { /* We should not create new node because * NLM_F_REPLACE was specified without NLM_F_CREATE * I assume it is safe to require NLM_F_CREATE when * REPLACE flag is used! Later we may want to remove the * check for replace_required, because according * to netlink specification, NLM_F_CREATE * MUST be specified if new route is created. * That would keep IPv6 consistent with IPv4 */ if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } /* * We walked to the bottom of tree. * Create new leaf node without children. */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); return ln; insert_above: /* * split since we don't have a common prefix anymore or * we have a less significant route. * we've to insert an intermediate node on the list * this new node will point to the one we need to create * and the current */ pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); /* find 1st bit in difference between the 2 addrs. See comment in __ipv6_addr_diff: bit may be an invalid value, but if it is >= plen, the value is ignored in any case. */ bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr)); /* * (intermediate)[in] * / \ * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { in = node_alloc(net); ln = node_alloc(net); if (!in || !ln) { if (in) node_free_immediate(net, in); if (ln) node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } /* * new intermediate node. * RTN_RTINFO will * be off since that an address that chooses one of * the branches would not match less specific routes * in the other branch */ in->fn_bit = bit; RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; fib6_info_hold(rcu_dereference_protected(in->leaf, lockdep_is_held(&table->tb6_lock))); /* update parent pointer */ if (dir) rcu_assign_pointer(pn->right, in); else rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, in); rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { rcu_assign_pointer(in->right, ln); rcu_assign_pointer(in->left, fn); } else { rcu_assign_pointer(in->left, ln); rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ /* * (new leaf node)[ln] * / \ * (old node)[fn] NULL */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (addr_bit_set(&key->addr, plen)) RCU_INIT_POINTER(ln->right, fn); else RCU_INIT_POINTER(ln->left, fn); rcu_assign_pointer(fn->parent, ln); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); } return ln; } static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, const struct fib6_info *match) { int cpu; if (!fib6_nh->rt6i_pcpu) return; rcu_read_lock(); /* release the reference to this fib entry from * all of its cached pcpu routes */ for_each_possible_cpu(cpu) { struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); /* Paired with xchg() in rt6_get_pcpu_route() */ pcpu_rt = READ_ONCE(*ppcpu_rt); /* only dropping the 'from' reference if the cached route * is using 'match'. The cached pcpu_rt->from only changes * from a fib6_info to NULL (ip6_dst_destroy); it can never * change from one fib6_info reference to another */ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); fib6_info_release(from); } } rcu_read_unlock(); } static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) { struct fib6_info *arg = _arg; __fib6_drop_pcpu_from(nh, arg); return 0; } static void fib6_drop_pcpu_from(struct fib6_info *f6i) { /* Make sure rt6_make_pcpu_route() wont add other percpu routes * while we are cleaning them here. */ f6i->fib6_destroying = 1; mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ if (f6i->nh) { rcu_read_lock(); nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, f6i); rcu_read_unlock(); } else { struct fib6_nh *fib6_nh; fib6_nh = f6i->fib6_nh; __fib6_drop_pcpu_from(fib6_nh, f6i); } } static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->fib6_table; /* Flush all cached dst in exception table */ rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt); if (rt->nh) { spin_lock(&rt->nh->lock); if (!list_empty(&rt->nh_list)) list_del_init(&rt->nh_list); spin_unlock(&rt->nh->lock); } if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes * and replace dummy references to this route with references * to still alive ones. */ while (fn) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); fib6_info_hold(new_leaf); rcu_assign_pointer(fn->leaf, new_leaf); fib6_info_release(rt); } fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); } } fib6_clean_expires(rt); fib6_remove_gc_list(rt); } /* * Insert routing information in a node. */ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack, struct list_head *purge_list) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); struct fib6_info *iter = NULL; struct fib6_info __rcu **ins; struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); bool notify_sibling_rt = false; u16 nlflags = NLM_F_EXCL; int err; if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) nlflags |= NLM_F_APPEND; ins = &fn->leaf; for (iter = leaf; iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock))) { /* * Search for duplicates */ if (iter->fib6_metric == rt->fib6_metric) { /* * Same priority level */ if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; nlflags &= ~NLM_F_EXCL; if (replace) { if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { found++; break; } fallback_ins = fallback_ins ?: ins; goto next_iter; } if (rt6_duplicate_nexthop(iter, rt)) { if (rt->fib6_nsiblings) WRITE_ONCE(rt->fib6_nsiblings, 0); if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->fib6_flags & RTF_EXPIRES)) { fib6_clean_expires(iter); fib6_remove_gc_list(iter); } else { fib6_set_expires(iter, rt->expires); fib6_add_gc_list(iter); } if (!(rt->fib6_flags & (RTF_ADDRCONF | RTF_PREFIX_RT)) && (iter->nh || !iter->fib6_nh->fib_nh_gw_family)) { iter->fib6_flags &= ~RTF_ADDRCONF; iter->fib6_flags &= ~RTF_PREFIX_RT; } if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } /* If we have the same destination and the same metric, * but not the same gateway, then the route we try to * add is sibling to this route, increment our counter * of siblings, and later we will add our route to the * list. * Only static routes (which don't have flag * RTF_EXPIRES) are used for ECMPv6. * * To avoid long list, we only had siblings if the * route have a gateway. */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) WRITE_ONCE(rt->fib6_nsiblings, rt->fib6_nsiblings + 1); } if (iter->fib6_metric > rt->fib6_metric) break; next_iter: ins = &iter->fib6_next; } if (fallback_ins && !found) { /* No matching route with same ecmp-able-ness found, replace * first matching route */ ins = fallback_ins; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); found++; } /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to others same route. */ if (rt->fib6_nsiblings) { unsigned int fib6_nsiblings; struct fib6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ sibling = leaf; notify_sibling_rt = true; while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { list_add_tail_rcu(&rt->fib6_siblings, &sibling->fib6_siblings); break; } sibling = rcu_dereference_protected(sibling->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); notify_sibling_rt = false; } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, &rt->fib6_siblings, fib6_siblings) { WRITE_ONCE(sibling->fib6_nsiblings, sibling->fib6_nsiblings + 1); BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); fib6_nsiblings++; } BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); rcu_read_lock(); rt6_multipath_rebalance(temp_sibling); rcu_read_unlock(); } /* * insert node */ if (!replace) { if (!add) pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: nlflags |= NLM_F_CREATE; /* The route should only be notified if it is the first * route in the node or if it is added as a sibling * route to the first route in the node. */ if (!info->skip_notify_kernel && (notify_sibling_rt || ins == &fn->leaf)) { enum fib_event_type fib_event; if (notify_sibling_rt) fib_event = FIB_EVENT_ENTRY_APPEND; else fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_entry_notifiers(info->nl_net, fib_event, rt, extack); if (err) { struct fib6_info *sibling, *next_sibling; /* If the route has siblings, then it first * needs to be unlinked from them. */ if (!rt->fib6_nsiblings) return err; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) WRITE_ONCE(sibling->fib6_nsiblings, sibling->fib6_nsiblings - 1); WRITE_ONCE(rt->fib6_nsiblings, 0); list_del_rcu(&rt->fib6_siblings); rcu_read_lock(); rt6_multipath_rebalance(next_sibling); rcu_read_unlock(); return err; } } rcu_assign_pointer(rt->fib6_next, iter); fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } } else { int nsiblings; if (!found) { if (add) goto add; pr_warn("NLM_F_REPLACE set, but no existing node found!\n"); return -ENOENT; } if (!info->skip_notify_kernel && ins == &fn->leaf) { err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); if (err) return err; } fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } nsiblings = iter->fib6_nsiblings; iter->fib6_node = NULL; list_add(&iter->purge_link, purge_list); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->fib6_next; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric > rt->fib6_metric) break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->fib6_next; iter->fib6_node = NULL; list_add(&iter->purge_link, purge_list); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { ins = &iter->fib6_next; } iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); } WARN_ON(nsiblings != 0); } } return 0; } static int fib6_add_rt2node_nh(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack, struct list_head *purge_list) { int err; spin_lock(&rt->nh->lock); if (rt->nh->dead) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); err = -EINVAL; } else { err = fib6_add_rt2node(fn, rt, info, extack, purge_list); if (!err) list_add(&rt->nh_list, &rt->nh->f6i_list); } spin_unlock(&rt->nh->lock); return err; } static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->fib6_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval)); } void fib6_force_start_gc(struct net *net) { if (!timer_pending(&net->ipv6.ip6_fib_timer)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval)); } static void __fib6_update_sernum_upto_root(struct fib6_info *rt, int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&rt->fib6_table->tb6_lock)); /* paired with smp_rmb() in fib6_get_cookie_safe() */ smp_wmb(); while (fn) { WRITE_ONCE(fn->fn_sernum, sernum); fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&rt->fib6_table->tb6_lock)); } } void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) { __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } /* allow ipv4 to update sernum via ipv6_stub */ void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum_upto_root(net, f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); } /* * Add routing information to the routing tree. * <destination addr>/<source addr> * with source addr info in sub-trees * Need to own table->tb6_lock */ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; LIST_HEAD(purge_list); struct fib6_node *fn; #ifdef CONFIG_IPV6_SUBTREES struct fib6_node *pn = NULL; #endif int err = -ENOMEM; int allow_create = 1; int replace_required = 0; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; if (info->nlh->nlmsg_flags & NLM_F_REPLACE) replace_required = 1; } if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); fn = fib6_add_1(info->nl_net, table, root, &rt->fib6_dst.addr, rt->fib6_dst.plen, offsetof(struct fib6_info, fib6_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; goto out; } #ifdef CONFIG_IPV6_SUBTREES pn = fn; if (rt->fib6_src.plen) { struct fib6_node *sn; if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* * Create subtree. * * fn[main tree] * | * sfn[subtree root] * \ * sn[new leaf node] */ /* Create subtree root node */ sfn = node_alloc(info->nl_net); if (!sfn) goto failure; fib6_info_hold(info->nl_net->ipv6.fib6_null_entry); rcu_assign_pointer(sfn->leaf, info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(info->nl_net, table, sfn, &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated root, and then (in failure) stale node in main tree. */ node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); goto failure; } /* Now link new subtree to main tree */ rcu_assign_pointer(sfn->parent, fn); rcu_assign_pointer(fn->subtree, sfn); } else { sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); goto failure; } } if (!rcu_access_pointer(fn->leaf)) { if (fn->fn_flags & RTN_TL_ROOT) { /* put back null_entry for root node */ rcu_assign_pointer(fn->leaf, info->nl_net->ipv6.fib6_null_entry); } else { fib6_info_hold(rt); rcu_assign_pointer(fn->leaf, rt); } } fn = sn; } #endif if (rt->nh) err = fib6_add_rt2node_nh(fn, rt, info, extack, &purge_list); else err = fib6_add_rt2node(fn, rt, info, extack, &purge_list); if (!err) { struct fib6_info *iter, *next; list_for_each_entry_safe(iter, next, &purge_list, purge_link) { list_del(&iter->purge_link); fib6_purge_rt(iter, fn, info->nl_net); fib6_info_release(iter); } __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); if (rt->fib6_flags & RTF_EXPIRES) fib6_add_gc_list(rt); fib6_start_gc(info->nl_net, rt); } out: if (err) { #ifdef CONFIG_IPV6_SUBTREES /* * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ if (pn != fn) { struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); if (pn_leaf == rt) { pn_leaf = NULL; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(rt); } if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { pn_leaf = fib6_find_prefix(info->nl_net, table, pn); if (!pn_leaf) pn_leaf = info->nl_net->ipv6.fib6_null_entry; fib6_info_hold(pn_leaf); rcu_assign_pointer(pn->leaf, pn_leaf); } } #endif goto failure; } else if (fib6_requires_src(rt)) { fib6_routes_require_src_inc(info->nl_net); } return err; failure: /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: * 1. fn is an intermediate node and we failed to add the new * route to it in both subtree creation failure and fib6_add_rt2node() * failure case. * 2. fn is the root node in the table and we fail to add the first * default route to it. */ if (fn && (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || (fn->fn_flags & RTN_TL_ROOT && !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); return err; } /* * Routing tree lookup * */ struct lookup_args { int offset; /* key offset on fib6_info */ const struct in6_addr *addr; /* search key */ }; static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root, struct lookup_args *args) { struct fib6_node *fn; __be32 dir; if (unlikely(args->offset == 0)) return NULL; /* * Descend on a tree */ fn = root; for (;;) { struct fib6_node *next; dir = addr_bit_set(args->addr, fn->fn_bit); next = dir ? rcu_dereference(fn->right) : rcu_dereference(fn->left); if (next) { fn = next; continue; } break; } while (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree || fn->fn_flags & RTN_RTINFO) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) goto backtrack; key = (struct rt6key *) ((u8 *)leaf + args->offset); if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES if (subtree) { struct fib6_node *sfn; sfn = fib6_node_lookup_1(subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; } #endif if (fn->fn_flags & RTN_RTINFO) return fn; } } backtrack: if (fn->fn_flags & RTN_ROOT) break; fn = rcu_dereference(fn->parent); } return NULL; } /* called with rcu_read_lock() held */ struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { { .offset = offsetof(struct fib6_info, fib6_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { .offset = offsetof(struct fib6_info, fib6_src), .addr = saddr, }, #endif { .offset = 0, /* sentinel */ } }; fn = fib6_node_lookup_1(root, daddr ? args : args + 1); if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; return fn; } /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) * exact_match == true means we try to find fn with exact match of * the passed in prefix addr * exact_match == false means we try to find fn with longest prefix * match of the passed in prefix addr. This is useful for finding fn * for cached route as it will be stored in the exception table under * the node with longest prefix length. */ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, int plen, int offset, bool exact_match) { struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ if (!leaf) { if (plen <= fn->fn_bit) goto out; else goto next; } key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) goto out; if (plen == fn->fn_bit) return fn; if (fn->fn_flags & RTN_RTINFO) prev = fn; next: /* * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) fn = rcu_dereference(fn->right); else fn = rcu_dereference(fn->left); } out: if (exact_match) return NULL; else return prev; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len, bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, offsetof(struct fib6_info, fib6_dst), exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); if (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree) { fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct fib6_info, fib6_src), exact_match); } } } #endif if (fn && fn->fn_flags & RTN_RTINFO) return fn; return NULL; } /* * Deletion * */ static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn) { struct fib6_node *child_left, *child_right; if (fn->fn_flags & RTN_ROOT) return net->ipv6.fib6_null_entry; while (fn) { child_left = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); child_right = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); if (child_left) return rcu_dereference_protected(child_left->leaf, lockdep_is_held(&table->tb6_lock)); if (child_right) return rcu_dereference_protected(child_right->leaf, lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } return NULL; } /* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn) { int children; int nstate; struct fib6_node *child; struct fib6_walker *w; int iter = 0; /* Set fn->leaf to null_entry for root node. */ if (fn->fn_flags & RTN_TL_ROOT) { rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); return fn; } for (;;) { struct fib6_node *fn_r = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *fn_l = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_r = rcu_dereference_protected(pn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_l = rcu_dereference_protected(pn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_fn_leaf; pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); WARN_ON(fn_leaf); children = 0; child = NULL; if (fn_r) { child = fn_r; children |= 1; } if (fn_l) { child = fn_l; children |= 2; } if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES /* Subtree root (i.e. fn) may have one child */ || (children && fn->fn_flags & RTN_ROOT) #endif ) { new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 if (!new_fn_leaf) { WARN_ON(!new_fn_leaf); new_fn_leaf = net->ipv6.fib6_null_entry; } #endif fib6_info_hold(new_fn_leaf); rcu_assign_pointer(fn->leaf, new_fn_leaf); return pn; } #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif if (pn_r == fn) rcu_assign_pointer(pn->right, child); else if (pn_l == fn) rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } #endif read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (!child) { if (w->node == fn) { pr_debug("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); w->node = pn; w->state = nstate; } } else { if (w->node == fn) { w->node = child; if (children&2) { pr_debug("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { pr_debug("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_C ? FWS_U : FWS_INIT; } } } } read_unlock(&net->ipv6.fib6_walker_lock); node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(pn_leaf); fn = pn; } } static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, struct fib6_info __rcu **rtp, struct nl_info *info) { struct fib6_info *leaf, *replace_rt = NULL; struct fib6_walker *w; struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; bool notify_del = false; /* If the deleted route is the first in the node and it is not part of * a multipath route, then we need to replace it with the next route * in the node, if exists. */ leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); if (leaf == rt && !rt->fib6_nsiblings) { if (rcu_access_pointer(rt->fib6_next)) replace_rt = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); else notify_del = true; } /* Unlink it */ *rtp = rt->fib6_next; rt->fib6_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; /* Reset round-robin state, if necessary */ if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; /* The route is deleted from a multipath route. If this * multipath route is the first route in the node, then we need * to emit a delete notification. Otherwise, we need to skip * the notification. */ if (rt->fib6_metric == leaf->fib6_metric && rt6_qualify_for_ecmp(leaf)) notify_del = true; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) WRITE_ONCE(sibling->fib6_nsiblings, sibling->fib6_nsiblings - 1); WRITE_ONCE(rt->fib6_nsiblings, 0); list_del_rcu(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { pr_debug("walker %p adjusted by delroute\n", w); w->leaf = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } read_unlock(&net->ipv6.fib6_walker_lock); /* If it was last route, call fib6_repair_tree() to: * 1. For root node, put back null_entry as how the table was created. * 2. For other nodes, expunge its radix tree node. */ if (!rcu_access_pointer(fn->leaf)) { if (!(fn->fn_flags & RTN_TL_ROOT)) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; } fn = fib6_repair_tree(net, table, fn); } fib6_purge_rt(rt, fn, net); if (!info->skip_notify_kernel) { if (notify_del) call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); else if (replace_rt) call_fib6_entry_notifiers_replace(net, replace_rt); } if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); fib6_info_release(rt); } /* Need to own table->tb6_lock */ int fib6_del(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_info __rcu **rtp; struct fib6_info __rcu **rtp_next; struct fib6_table *table; struct fib6_node *fn; if (rt == net->ipv6.fib6_null_entry) return -ENOENT; table = rt->fib6_table; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&table->tb6_lock)); if (!fn) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); /* * Walk the leaf entries looking for ourself */ for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { if (fib6_requires_src(cur)) fib6_routes_require_src_dec(info->nl_net); fib6_del_route(table, fn, rtp, info); return 0; } rtp_next = &cur->fib6_next; } return -ENOENT; } /* * Tree traversal function. * * Certainly, it is not interrupt safe. * However, it is internally reenterable wrt itself and fib6_add/fib6_del. * It means, that we can modify tree during walking * and use this function for garbage collection, clone pruning, * cleaning tree when a device goes down etc. etc. * * It guarantees that every node will be traversed, * and that it will be traversed only once. * * Callback function w->func may return: * 0 -> continue walking. * positive value -> walking is suspended (used by tree dumps, * and probably by gc, if it will be split to several slices) * negative value -> terminate walking. * * The function itself returns: * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. * * This function is called with tb6_lock held. */ static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn, *left, *right; /* w->root should always be table->tb6_root */ WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); for (;;) { fn = w->node; if (!fn) return 0; switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: if (FIB6_SUBTREE(fn)) { w->node = FIB6_SUBTREE(fn); continue; } w->state = FWS_L; fallthrough; #endif case FWS_L: left = rcu_dereference_protected(fn->left, 1); if (left) { w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; fallthrough; case FWS_R: right = rcu_dereference_protected(fn->right, 1); if (right) { w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; w->leaf = rcu_dereference_protected(fn->leaf, 1); fallthrough; case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; if (w->skip) { w->skip--; goto skip; } err = w->func(w); if (err) return err; w->count++; continue; } skip: w->state = FWS_U; fallthrough; case FWS_U: if (fn == w->root) return 0; pn = rcu_dereference_protected(fn->parent, 1); left = rcu_dereference_protected(pn->left, 1); right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); w->state = FWS_L; continue; } #endif if (left == fn) { w->state = FWS_R; continue; } if (right == fn) { w->state = FWS_C; w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 WARN_ON(1); #endif } } } static int fib6_walk(struct net *net, struct fib6_walker *w) { int res; w->state = FWS_INIT; w->node = w->root; fib6_walker_link(net, w); res = fib6_walk_continue(w); if (res <= 0) fib6_walker_unlink(net, w); return res; } static int fib6_clean_node(struct fib6_walker *w) { int res; struct fib6_info *rt; struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, .skip_notify = c->skip_notify, }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && READ_ONCE(w->node->fn_sernum) != c->sernum) WRITE_ONCE(w->node->fn_sernum, c->sernum); if (!c->func) { WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); w->leaf = NULL; return 0; } for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); if (res == -1) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", __func__, rt, rcu_access_pointer(rt->fib6_node), res); #endif continue; } return 0; } else if (res == -2) { if (WARN_ON(!rt->fib6_nsiblings)) continue; rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); continue; } WARN_ON(res != 0); } w->leaf = rt; return 0; } /* * Convenient frontend to tree walker. * * func is called on each route. * It may return -2 -> skip multipath route. * -1 -> delete this route. * 0 -> continue walking */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct fib6_info *, void *arg), int sernum, void *arg, bool skip_notify) { struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; c.w.count = 0; c.w.skip = 0; c.w.skip_in_node = 0; c.func = func; c.sernum = sernum; c.arg = arg; c.net = net; c.skip_notify = skip_notify; fib6_walk(net, &c.w); } static void __fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), int sernum, void *arg, bool skip_notify) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, sernum, arg, skip_notify); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false); } void fib6_clean_all_skip_notify(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true); } static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); __fib6_clean_all(net, NULL, new_sernum, NULL, false); } /* * Garbage collection */ static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args) { unsigned long now = jiffies; /* * check addrconf expiration here. * Routes are expired even if they are in use. */ if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { if (time_after(now, rt->expires)) { pr_debug("expiring %p\n", rt); return -1; } gc_args->more++; } /* Also age clones in the exception table. * Note, that clones are aged out * only if they are not in use now. */ rt6_age_exceptions(rt, gc_args, now); return 0; } static void fib6_gc_table(struct net *net, struct fib6_table *tb6, struct fib6_gc_args *gc_args) { struct fib6_info *rt; struct hlist_node *n; struct nl_info info = { .nl_net = net, .skip_notify = false, }; hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link) if (fib6_age(rt, gc_args) == -1) fib6_del(rt, &info); } static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_gc_table(net, table, gc_args); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; int ip6_rt_gc_interval; unsigned long now; if (force) { spin_lock_bh(&net->ipv6.fib6_gc_lock); } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) { mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); return; } ip6_rt_gc_interval = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval); gc_args.timeout = expires ? (int)expires : ip6_rt_gc_interval; gc_args.more = 0; fib6_gc_all(net, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; if (gc_args.more) mod_timer(&net->ipv6.ip6_fib_timer, round_jiffies(now + ip6_rt_gc_interval)); else timer_delete(&net->ipv6.ip6_fib_timer); spin_unlock_bh(&net->ipv6.fib6_gc_lock); } static void fib6_gc_timer_cb(struct timer_list *t) { struct net *arg = timer_container_of(arg, t, ipv6.ip6_fib_timer); fib6_run_gc(0, arg, true); } static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; int err; err = fib6_notifier_init(net); if (err) return err; /* Default to 3-tuple */ net->ipv6.sysctl.multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; spin_lock_init(&net->ipv6.fib6_gc_lock); rwlock_init(&net->ipv6.fib6_walker_lock); INIT_LIST_HEAD(&net->ipv6.fib6_walkers); timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0); net->ipv6.rt6_stats = kzalloc_obj(*net->ipv6.rt6_stats); if (!net->ipv6.rt6_stats) goto out_notifier; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv6.fib_table_hash) goto out_rt6_stats; spin_lock_init(&net->ipv6.fib_table_hash_lock); net->ipv6.fib6_main_tbl = kzalloc_obj(*net->ipv6.fib6_main_tbl); if (!net->ipv6.fib6_main_tbl) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc_obj(*net->ipv6.fib6_local_tbl); if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist); #endif fib6_tables_init(net); return 0; #ifdef CONFIG_IPV6_MULTIPLE_TABLES out_fib6_main_tbl: kfree(net->ipv6.fib6_main_tbl); #endif out_fib_table_hash: kfree(net->ipv6.fib_table_hash); out_rt6_stats: kfree(net->ipv6.rt6_stats); out_notifier: fib6_notifier_exit(net); return -ENOMEM; } static void fib6_net_exit(struct net *net) { unsigned int i; timer_delete_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; struct hlist_node *tmp; struct fib6_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { hlist_del(&tb->tb6_hlist); fib6_free_table(tb); } } kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); fib6_notifier_exit(net); } static struct pernet_operations fib6_net_ops = { .init = fib6_net_init, .exit = fib6_net_exit, }; static const struct rtnl_msg_handler fib6_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, .dumpit = inet6_dump_fib, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, }; int __init fib6_init(void) { int ret = -ENOMEM; fib6_node_kmem = KMEM_CACHE(fib6_node, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT); if (!fib6_node_kmem) goto out; ret = register_pernet_subsys(&fib6_net_ops); if (ret) goto out_kmem_cache_create; ret = rtnl_register_many(fib6_rtnl_msg_handlers); if (ret) goto out_unregister_subsys; __fib6_flush_trees = fib6_flush_trees; out: return ret; out_unregister_subsys: unregister_pernet_subsys(&fib6_net_ops); out_kmem_cache_create: kmem_cache_destroy(fib6_node_kmem); goto out; } void fib6_gc_cleanup(void) { unregister_pernet_subsys(&fib6_net_ops); kmem_cache_destroy(fib6_node_kmem); } #ifdef CONFIG_PROC_FS static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; struct fib6_nh *fib6_nh = rt->fib6_nh; unsigned int flags = rt->fib6_flags; const struct net_device *dev; if (rt->nh) fib6_nh = nexthop_fib6_nh(rt->nh); seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen); #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif if (fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } dev = fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? dev->name : ""); iter->w.leaf = NULL; return 0; } static int ipv6_route_yield(struct fib6_walker *w) { struct ipv6_route_iter *iter = w->args; if (!iter->skip) return 1; do { iter->w.leaf = rcu_dereference_protected( iter->w.leaf->fib6_next, lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) return 1; } while (iter->w.leaf); return 0; } static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, struct net *net) { memset(&iter->w, 0, sizeof(iter->w)); iter->w.func = ipv6_route_yield; iter->w.root = &iter->tbl->tb6_root; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; iter->w.args = iter; iter->sernum = READ_ONCE(iter->w.root->fn_sernum); INIT_LIST_HEAD(&iter->w.lh); fib6_walker_link(net, &iter->w); } static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, struct net *net) { unsigned int h; struct hlist_node *node; if (tbl) { h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist)); } else { h = 0; node = NULL; } while (!node && h < FIB6_TABLE_HASHSZ) { node = rcu_dereference( hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); } return hlist_entry_safe(node, struct fib6_table, tb6_hlist); } static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) { int sernum = READ_ONCE(iter->w.root->fn_sernum); if (iter->sernum != sernum) { iter->sernum = sernum; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; WARN_ON(iter->w.skip); iter->w.skip = iter->w.count; } } static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int r; struct fib6_info *n; struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; ++(*pos); if (!v) goto iter_table; n = rcu_dereference(((struct fib6_info *)v)->fib6_next); if (n) return n; iter_table: ipv6_route_check_sernum(iter); spin_lock_bh(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { return iter->w.leaf; } else if (r < 0) { fib6_walker_unlink(net, &iter->w); return NULL; } fib6_walker_unlink(net, &iter->w); iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); if (!iter->tbl) return NULL; ipv6_route_seq_setup_walk(iter, net); goto iter_table; } static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; rcu_read_lock(); iter->tbl = ipv6_route_seq_next_table(NULL, net); iter->skip = *pos; if (iter->tbl) { loff_t p = 0; ipv6_route_seq_setup_walk(iter, net); return ipv6_route_seq_next(seq, NULL, &p); } else { return NULL; } } static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) { struct fib6_walker *w = &iter->w; return w->node && !(w->state == FWS_U && w->node == w->root); } static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (ipv6_route_iter_active(iter)) fib6_walker_unlink(net, &iter->w); rcu_read_unlock(); } #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) static int ipv6_route_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) { struct bpf_iter__ipv6_route ctx; ctx.meta = meta; ctx.rt = v; return bpf_iter_run_prog(prog, &ctx); } static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct ipv6_route_iter *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; int ret; meta.seq = seq; prog = bpf_iter_get_info(&meta, false); if (!prog) return ipv6_route_native_seq_show(seq, v); ret = ipv6_route_prog_seq_show(prog, &meta, v); iter->w.leaf = NULL; return ret; } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)ipv6_route_prog_seq_show(prog, &meta, v); } ipv6_route_native_seq_stop(seq, v); } #else static int ipv6_route_seq_show(struct seq_file *seq, void *v) { return ipv6_route_native_seq_show(seq, v); } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { ipv6_route_native_seq_stop(seq, v); } #endif const struct seq_operations ipv6_route_seq_ops = { .start = ipv6_route_seq_start, .next = ipv6_route_seq_next, .stop = ipv6_route_seq_stop, .show = ipv6_route_seq_show }; #endif /* CONFIG_PROC_FS */
12 12 12 10 10 2 2 1 1 4 4 8 8 7 7 7 3 5 13 13 13 4 1 3 1 2 5 1 1 1 1 1 7 7 5 7 5 13 4 4 5 1 4 5 7 7 7 7 7 7 7 7 17 1 1 7 31 31 34 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 // SPDX-License-Identifier: GPL-2.0-or-later /* -*- linux-c -*- * INET 802.1Q VLAN * Ethernet-type device handling. * * Authors: Ben Greear <greearb@candelatech.com> * Please send support related email to: netdev@vger.kernel.org * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html * * Fixes: Mar 22 2001: Martin Bokaemper <mbokaemper@unispherenetworks.com> * - reset skb->pkt_type on incoming packets when MAC was changed * - see that changed MAC is saddr for outgoing packets * Oct 20, 2001: Ard van Breeman: * - Fix MC-list, finally. * - Flush MC-list on VLAN destroy. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/net_tstamp.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/phy.h> #include <net/arp.h> #include <net/macsec.h> #include <net/netdev_lock.h> #include "vlan.h" #include "vlanproc.h" #include <linux/if_vlan.h> #include <linux/netpoll.h> /* * Create the VLAN header for an arbitrary protocol layer * * saddr=NULL means use device source address * daddr=NULL means leave destination address (eg unresolved arp) * * This is called when the SKB is moving down the stack towards the * physical devices. */ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_hdr *vhdr; unsigned int vhdrlen = 0; u16 vlan_tci = 0; int rc; if (!(vlan->flags & VLAN_FLAG_REORDER_HDR)) { vhdr = skb_push(skb, VLAN_HLEN); vlan_tci = vlan->vlan_id; vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb->priority); vhdr->h_vlan_TCI = htons(vlan_tci); /* * Set the protocol type. For a packet of type ETH_P_802_3/2 we * put the length in here instead. */ if (type != ETH_P_802_3 && type != ETH_P_802_2) vhdr->h_vlan_encapsulated_proto = htons(type); else vhdr->h_vlan_encapsulated_proto = htons(len); skb->protocol = vlan->vlan_proto; type = ntohs(vlan->vlan_proto); vhdrlen = VLAN_HLEN; } /* Before delegating work to the lower layer, enter our MAC-address */ if (saddr == NULL) saddr = dev->dev_addr; /* Now make the underlying real hard header */ dev = vlan->real_dev; rc = dev_hard_header(skb, dev, type, daddr, saddr, len + vhdrlen); if (rc > 0) rc += vhdrlen; return rc; } static inline netdev_tx_t vlan_netpoll_send_skb(struct vlan_dev_priv *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); return NETDEV_TX_OK; #endif } static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); unsigned int len; int ret; /* Handle non-VLAN frames if they are sent to us, for example by DHCP. * * NOTE: THIS ASSUMES DIX ETHERNET, SPECIFICALLY NOT SUPPORTING * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs... */ if (vlan->flags & VLAN_FLAG_REORDER_HDR || veth->h_vlan_proto != vlan->vlan_proto) { u16 vlan_tci; vlan_tci = vlan->vlan_id; vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb->priority); __vlan_hwaccel_put_tag(skb, vlan->vlan_proto, vlan_tci); } skb->dev = vlan->real_dev; len = skb->len; if (unlikely(netpoll_tx_running(dev))) return vlan_netpoll_send_skb(vlan, skb); ret = dev_queue_xmit(skb); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct vlan_pcpu_stats *stats; stats = this_cpu_ptr(vlan->vlan_pcpu_stats); u64_stats_update_begin(&stats->syncp); u64_stats_inc(&stats->tx_packets); u64_stats_add(&stats->tx_bytes, len); u64_stats_update_end(&stats->syncp); } else { this_cpu_inc(vlan->vlan_pcpu_stats->tx_dropped); } return ret; } static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; unsigned int max_mtu = real_dev->mtu; if (netif_reduces_vlan_mtu(real_dev)) max_mtu -= VLAN_HLEN; if (max_mtu < new_mtu) return -ERANGE; WRITE_ONCE(dev->mtu, new_mtu); return 0; } void vlan_dev_set_ingress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); if (vlan->ingress_priority_map[vlan_prio & 0x7] && !skb_prio) vlan->nr_ingress_mappings--; else if (!vlan->ingress_priority_map[vlan_prio & 0x7] && skb_prio) vlan->nr_ingress_mappings++; vlan->ingress_priority_map[vlan_prio & 0x7] = skb_prio; } int vlan_dev_set_egress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_priority_tci_mapping *mp = NULL; struct vlan_priority_tci_mapping *np; u32 vlan_qos = (vlan_prio << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK; /* See if a priority mapping exists.. */ mp = vlan->egress_priority_map[skb_prio & 0xF]; while (mp) { if (mp->priority == skb_prio) { if (mp->vlan_qos && !vlan_qos) vlan->nr_egress_mappings--; else if (!mp->vlan_qos && vlan_qos) vlan->nr_egress_mappings++; mp->vlan_qos = vlan_qos; return 0; } mp = mp->next; } /* Create a new mapping then. */ mp = vlan->egress_priority_map[skb_prio & 0xF]; np = kmalloc_obj(struct vlan_priority_tci_mapping); if (!np) return -ENOBUFS; np->next = mp; np->priority = skb_prio; np->vlan_qos = vlan_qos; /* Before inserting this element in hash table, make sure all its fields * are committed to memory. * coupled with smp_rmb() in vlan_dev_get_egress_qos_mask() */ smp_wmb(); vlan->egress_priority_map[skb_prio & 0xF] = np; if (vlan_qos) vlan->nr_egress_mappings++; return 0; } /* Flags are defined in the vlan_flags enum in * include/uapi/linux/if_vlan.h file. */ int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); u32 old_flags = vlan->flags; if (mask & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP | VLAN_FLAG_BRIDGE_BINDING)) return -EINVAL; vlan->flags = (old_flags & ~mask) | (flags & mask); if (netif_running(dev) && (vlan->flags ^ old_flags) & VLAN_FLAG_GVRP) { if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_join(dev); else vlan_gvrp_request_leave(dev); } if (netif_running(dev) && (vlan->flags ^ old_flags) & VLAN_FLAG_MVRP) { if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_join(dev); else vlan_mvrp_request_leave(dev); } return 0; } void vlan_dev_get_realdev_name(const struct net_device *dev, char *result, size_t size) { strscpy_pad(result, vlan_dev_priv(dev)->real_dev->name, size); } bool vlan_dev_inherit_address(struct net_device *dev, struct net_device *real_dev) { if (dev->addr_assign_type != NET_ADDR_STOLEN) return false; eth_hw_addr_set(dev, real_dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return true; } static int vlan_dev_open(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; int err; if (!(real_dev->flags & IFF_UP) && !(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) return -ENETDOWN; if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr) && !vlan_dev_inherit_address(dev, real_dev)) { err = dev_uc_add(real_dev, dev->dev_addr); if (err < 0) goto out; } ether_addr_copy(vlan->real_dev_addr, real_dev->dev_addr); if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_join(dev); if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_join(dev); if (netif_carrier_ok(real_dev) && !(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_carrier_on(dev); return 0; out: netif_carrier_off(dev); return err; } static int vlan_dev_stop(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; dev_mc_unsync(real_dev, dev); dev_uc_unsync(real_dev, dev); if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) dev_uc_del(real_dev, dev->dev_addr); if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_carrier_off(dev); return 0; } static int vlan_dev_set_mac_address(struct net_device *dev, void *p) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; struct sockaddr *addr = p; int err; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; if (!(dev->flags & IFF_UP)) goto out; if (!ether_addr_equal(addr->sa_data, real_dev->dev_addr)) { err = dev_uc_add(real_dev, addr->sa_data); if (err < 0) return err; } if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) dev_uc_del(real_dev, dev->dev_addr); out: eth_hw_addr_set(dev, addr->sa_data); return 0; } static int vlan_hwtstamp_get(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; return generic_hwtstamp_get_lower(real_dev, cfg); } static int vlan_hwtstamp_set(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; if (!net_eq(dev_net(dev), dev_net(real_dev))) return -EOPNOTSUPP; return generic_hwtstamp_set_lower(real_dev, cfg, extack); } static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; struct ifreq ifrr; int err = -EOPNOTSUPP; strscpy_pad(ifrr.ifr_name, real_dev->name, IFNAMSIZ); ifrr.ifr_ifru = ifr->ifr_ifru; switch (cmd) { case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: err = dev_eth_ioctl(real_dev, &ifrr, cmd); break; } if (!err) ifr->ifr_ifru = ifrr.ifr_ifru; return err; } static int vlan_dev_neigh_setup(struct net_device *dev, struct neigh_parms *pa) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int err = 0; if (netif_device_present(real_dev) && ops->ndo_neigh_setup) err = ops->ndo_neigh_setup(real_dev, pa); return err; } #if IS_ENABLED(CONFIG_FCOE) static int vlan_dev_fcoe_ddp_setup(struct net_device *dev, u16 xid, struct scatterlist *sgl, unsigned int sgc) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = 0; if (ops->ndo_fcoe_ddp_setup) rc = ops->ndo_fcoe_ddp_setup(real_dev, xid, sgl, sgc); return rc; } static int vlan_dev_fcoe_ddp_done(struct net_device *dev, u16 xid) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int len = 0; if (ops->ndo_fcoe_ddp_done) len = ops->ndo_fcoe_ddp_done(real_dev, xid); return len; } static int vlan_dev_fcoe_enable(struct net_device *dev) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = -EINVAL; if (ops->ndo_fcoe_enable) rc = ops->ndo_fcoe_enable(real_dev); return rc; } static int vlan_dev_fcoe_disable(struct net_device *dev) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = -EINVAL; if (ops->ndo_fcoe_disable) rc = ops->ndo_fcoe_disable(real_dev); return rc; } static int vlan_dev_fcoe_ddp_target(struct net_device *dev, u16 xid, struct scatterlist *sgl, unsigned int sgc) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = 0; if (ops->ndo_fcoe_ddp_target) rc = ops->ndo_fcoe_ddp_target(real_dev, xid, sgl, sgc); return rc; } #endif #ifdef NETDEV_FCOE_WWNN static int vlan_dev_fcoe_get_wwn(struct net_device *dev, u64 *wwn, int type) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = -EINVAL; if (ops->ndo_fcoe_get_wwn) rc = ops->ndo_fcoe_get_wwn(real_dev, wwn, type); return rc; } #endif static void vlan_dev_change_rx_flags(struct net_device *dev, int change) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; if (change & IFF_ALLMULTI) dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1); if (change & IFF_PROMISC) dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 1 : -1); } static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) { dev_mc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); } static __be16 vlan_parse_protocol(const struct sk_buff *skb) { struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); return __vlan_get_protocol(skb, veth->h_vlan_proto, NULL); } static const struct header_ops vlan_header_ops = { .create = vlan_dev_hard_header, .parse = eth_header_parse, .parse_protocol = vlan_parse_protocol, }; static int vlan_passthru_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; if (saddr == NULL) saddr = dev->dev_addr; return dev_hard_header(skb, real_dev, type, daddr, saddr, len); } static const struct header_ops vlan_passthru_header_ops = { .create = vlan_passthru_hard_header, .parse = eth_header_parse, .parse_protocol = vlan_parse_protocol, }; static const struct device_type vlan_type = { .name = "vlan", }; static const struct net_device_ops vlan_netdev_ops; static int vlan_dev_init(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; netif_carrier_off(dev); /* IFF_BROADCAST|IFF_MULTICAST; ??? */ dev->flags = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_MASTER | IFF_SLAVE); dev->state = (real_dev->state & ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT))) | (1<<__LINK_STATE_PRESENT); if (vlan->flags & VLAN_FLAG_BRIDGE_BINDING) dev->state |= (1 << __LINK_STATE_NOCARRIER); dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC | NETIF_F_FCOE_CRC | NETIF_F_FSO; if (real_dev->vlan_features & NETIF_F_HW_MACSEC) dev->hw_features |= NETIF_F_HW_MACSEC; dev->features |= dev->hw_features; dev->lltx = true; dev->fcoe_mtu = true; netif_inherit_tso_max(dev, real_dev); if (dev->features & NETIF_F_VLAN_FEATURES) netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n"); dev->vlan_features = real_dev->vlan_features & ~(NETIF_F_FCOE_CRC | NETIF_F_FSO); dev->hw_enc_features = vlan_tnl_features(real_dev); dev->mpls_features = real_dev->mpls_features; /* ipv6 shared card related stuff */ dev->dev_id = real_dev->dev_id; if (is_zero_ether_addr(dev->dev_addr)) { eth_hw_addr_set(dev, real_dev->dev_addr); dev->addr_assign_type = NET_ADDR_STOLEN; } if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); #if IS_ENABLED(CONFIG_FCOE) dev->fcoe_ddp_xid = real_dev->fcoe_ddp_xid; #endif dev->needed_headroom = real_dev->needed_headroom; if (vlan_hw_offload_capable(real_dev->features, vlan->vlan_proto)) { dev->header_ops = &vlan_passthru_header_ops; dev->hard_header_len = real_dev->hard_header_len; } else { dev->header_ops = &vlan_header_ops; dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN; } dev->netdev_ops = &vlan_netdev_ops; SET_NETDEV_DEVTYPE(dev, &vlan_type); netdev_lockdep_set_classes(dev); vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->vlan_pcpu_stats) return -ENOMEM; /* Get vlan's reference to real_dev */ netdev_hold(real_dev, &vlan->dev_tracker, GFP_KERNEL); return 0; } /* Note: this function might be called multiple times for the same device. */ void vlan_dev_free_egress_priority(const struct net_device *dev) { struct vlan_priority_tci_mapping *pm; struct vlan_dev_priv *vlan = vlan_dev_priv(dev); int i; for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) { while ((pm = vlan->egress_priority_map[i]) != NULL) { vlan->egress_priority_map[i] = pm->next; kfree(pm); } } } static void vlan_dev_uninit(struct net_device *dev) { vlan_dev_free_egress_priority(dev); } static netdev_features_t vlan_dev_fix_features(struct net_device *dev, netdev_features_t features) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; netdev_features_t old_features = features; netdev_features_t lower_features; lower_features = netdev_intersect_features((real_dev->vlan_features | NETIF_F_RXCSUM), real_dev->features); /* Add HW_CSUM setting to preserve user ability to control * checksum offload on the vlan device. */ if (lower_features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) lower_features |= NETIF_F_HW_CSUM; features = netdev_intersect_features(features, lower_features); features |= old_features & (NETIF_F_SOFT_FEATURES | NETIF_F_GSO_SOFTWARE); return features; } static int vlan_ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); return __ethtool_get_link_ksettings(vlan->real_dev, cmd); } static void vlan_ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, vlan_fullname, sizeof(info->driver)); strscpy(info->version, vlan_version, sizeof(info->version)); strscpy(info->fw_version, "N/A", sizeof(info->fw_version)); } static int vlan_ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info) { const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); return ethtool_get_ts_info_by_layer(vlan->real_dev, info); } static void vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct vlan_pcpu_stats *p; u32 rx_errors = 0, tx_dropped = 0; int i; for_each_possible_cpu(i) { u64 rxpackets, rxbytes, rxmulticast, txpackets, txbytes; unsigned int start; p = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i); do { start = u64_stats_fetch_begin(&p->syncp); rxpackets = u64_stats_read(&p->rx_packets); rxbytes = u64_stats_read(&p->rx_bytes); rxmulticast = u64_stats_read(&p->rx_multicast); txpackets = u64_stats_read(&p->tx_packets); txbytes = u64_stats_read(&p->tx_bytes); } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += rxpackets; stats->rx_bytes += rxbytes; stats->multicast += rxmulticast; stats->tx_packets += txpackets; stats->tx_bytes += txbytes; /* rx_errors & tx_dropped are u32 */ rx_errors += READ_ONCE(p->rx_errors); tx_dropped += READ_ONCE(p->tx_dropped); } stats->rx_errors = rx_errors; stats->tx_dropped = tx_dropped; } #ifdef CONFIG_NET_POLL_CONTROLLER static void vlan_dev_poll_controller(struct net_device *dev) { return; } static int vlan_dev_netpoll_setup(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; struct netpoll *netpoll; int err = 0; netpoll = kzalloc_obj(*netpoll); err = -ENOMEM; if (!netpoll) goto out; err = __netpoll_setup(netpoll, real_dev); if (err) { kfree(netpoll); goto out; } vlan->netpoll = netpoll; out: return err; } static void vlan_dev_netpoll_cleanup(struct net_device *dev) { struct vlan_dev_priv *vlan= vlan_dev_priv(dev); struct netpoll *netpoll = vlan->netpoll; if (!netpoll) return; vlan->netpoll = NULL; __netpoll_free(netpoll); } #endif /* CONFIG_NET_POLL_CONTROLLER */ static int vlan_dev_get_iflink(const struct net_device *dev) { const struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; return READ_ONCE(real_dev->ifindex); } static int vlan_dev_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path) { struct vlan_dev_priv *vlan = vlan_dev_priv(ctx->dev); path->type = DEV_PATH_VLAN; path->encap.id = vlan->vlan_id; path->encap.proto = vlan->vlan_proto; path->dev = ctx->dev; ctx->dev = vlan->real_dev; if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan)) return -ENOSPC; ctx->vlan[ctx->num_vlans].id = vlan->vlan_id; ctx->vlan[ctx->num_vlans].proto = vlan->vlan_proto; ctx->num_vlans++; return 0; } #if IS_ENABLED(CONFIG_MACSEC) static const struct macsec_ops *vlan_get_macsec_ops(const struct macsec_context *ctx) { return vlan_dev_priv(ctx->netdev)->real_dev->macsec_ops; } static int vlan_macsec_offload(int (* const func)(struct macsec_context *), struct macsec_context *ctx) { if (unlikely(!func)) return 0; return (*func)(ctx); } static int vlan_macsec_dev_open(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_dev_open, ctx); } static int vlan_macsec_dev_stop(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_dev_stop, ctx); } static int vlan_macsec_add_secy(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_add_secy, ctx); } static int vlan_macsec_upd_secy(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_upd_secy, ctx); } static int vlan_macsec_del_secy(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_del_secy, ctx); } static int vlan_macsec_add_rxsc(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_add_rxsc, ctx); } static int vlan_macsec_upd_rxsc(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_upd_rxsc, ctx); } static int vlan_macsec_del_rxsc(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_del_rxsc, ctx); } static int vlan_macsec_add_rxsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_add_rxsa, ctx); } static int vlan_macsec_upd_rxsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_upd_rxsa, ctx); } static int vlan_macsec_del_rxsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_del_rxsa, ctx); } static int vlan_macsec_add_txsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_add_txsa, ctx); } static int vlan_macsec_upd_txsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_upd_txsa, ctx); } static int vlan_macsec_del_txsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_del_txsa, ctx); } static int vlan_macsec_get_dev_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_dev_stats, ctx); } static int vlan_macsec_get_tx_sc_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_tx_sc_stats, ctx); } static int vlan_macsec_get_tx_sa_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_tx_sa_stats, ctx); } static int vlan_macsec_get_rx_sc_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_rx_sc_stats, ctx); } static int vlan_macsec_get_rx_sa_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_rx_sa_stats, ctx); } static const struct macsec_ops macsec_offload_ops = { /* Device wide */ .mdo_dev_open = vlan_macsec_dev_open, .mdo_dev_stop = vlan_macsec_dev_stop, /* SecY */ .mdo_add_secy = vlan_macsec_add_secy, .mdo_upd_secy = vlan_macsec_upd_secy, .mdo_del_secy = vlan_macsec_del_secy, /* Security channels */ .mdo_add_rxsc = vlan_macsec_add_rxsc, .mdo_upd_rxsc = vlan_macsec_upd_rxsc, .mdo_del_rxsc = vlan_macsec_del_rxsc, /* Security associations */ .mdo_add_rxsa = vlan_macsec_add_rxsa, .mdo_upd_rxsa = vlan_macsec_upd_rxsa, .mdo_del_rxsa = vlan_macsec_del_rxsa, .mdo_add_txsa = vlan_macsec_add_txsa, .mdo_upd_txsa = vlan_macsec_upd_txsa, .mdo_del_txsa = vlan_macsec_del_txsa, /* Statistics */ .mdo_get_dev_stats = vlan_macsec_get_dev_stats, .mdo_get_tx_sc_stats = vlan_macsec_get_tx_sc_stats, .mdo_get_tx_sa_stats = vlan_macsec_get_tx_sa_stats, .mdo_get_rx_sc_stats = vlan_macsec_get_rx_sc_stats, .mdo_get_rx_sa_stats = vlan_macsec_get_rx_sa_stats, }; #endif static const struct ethtool_ops vlan_ethtool_ops = { .get_link_ksettings = vlan_ethtool_get_link_ksettings, .get_drvinfo = vlan_ethtool_get_drvinfo, .get_link = ethtool_op_get_link, .get_ts_info = vlan_ethtool_get_ts_info, }; static const struct net_device_ops vlan_netdev_ops = { .ndo_change_mtu = vlan_dev_change_mtu, .ndo_init = vlan_dev_init, .ndo_uninit = vlan_dev_uninit, .ndo_open = vlan_dev_open, .ndo_stop = vlan_dev_stop, .ndo_start_xmit = vlan_dev_hard_start_xmit, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = vlan_dev_set_mac_address, .ndo_set_rx_mode = vlan_dev_set_rx_mode, .ndo_change_rx_flags = vlan_dev_change_rx_flags, .ndo_eth_ioctl = vlan_dev_ioctl, .ndo_neigh_setup = vlan_dev_neigh_setup, .ndo_get_stats64 = vlan_dev_get_stats64, #if IS_ENABLED(CONFIG_FCOE) .ndo_fcoe_ddp_setup = vlan_dev_fcoe_ddp_setup, .ndo_fcoe_ddp_done = vlan_dev_fcoe_ddp_done, .ndo_fcoe_enable = vlan_dev_fcoe_enable, .ndo_fcoe_disable = vlan_dev_fcoe_disable, .ndo_fcoe_ddp_target = vlan_dev_fcoe_ddp_target, #endif #ifdef NETDEV_FCOE_WWNN .ndo_fcoe_get_wwn = vlan_dev_fcoe_get_wwn, #endif #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = vlan_dev_poll_controller, .ndo_netpoll_setup = vlan_dev_netpoll_setup, .ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup, #endif .ndo_fix_features = vlan_dev_fix_features, .ndo_get_iflink = vlan_dev_get_iflink, .ndo_fill_forward_path = vlan_dev_fill_forward_path, .ndo_hwtstamp_get = vlan_hwtstamp_get, .ndo_hwtstamp_set = vlan_hwtstamp_set, }; static void vlan_dev_free(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); free_percpu(vlan->vlan_pcpu_stats); vlan->vlan_pcpu_stats = NULL; /* Get rid of the vlan's reference to real_dev */ netdev_put(vlan->real_dev, &vlan->dev_tracker); } void vlan_setup(struct net_device *dev) { ether_setup(dev); dev->priv_flags |= IFF_802_1Q_VLAN | IFF_NO_QUEUE; dev->priv_flags |= IFF_UNICAST_FLT; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); dev->netdev_ops = &vlan_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = vlan_dev_free; dev->ethtool_ops = &vlan_ethtool_ops; #if IS_ENABLED(CONFIG_MACSEC) dev->macsec_ops = &macsec_offload_ops; #endif dev->min_mtu = 0; dev->max_mtu = ETH_MAX_MTU; eth_zero_addr(dev->broadcast); }
7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 // SPDX-License-Identifier: GPL-2.0-or-later /* * Virtio PCI driver - common functionality for all device versions * * This module allows virtio devices to be used over a virtual PCI device. * This can be used with QEMU based VMMs like KVM or Xen. * * Copyright IBM Corp. 2007 * Copyright Red Hat, Inc. 2014 * * Authors: * Anthony Liguori <aliguori@us.ibm.com> * Rusty Russell <rusty@rustcorp.com.au> * Michael S. Tsirkin <mst@redhat.com> */ #include "virtio_pci_common.h" static bool force_legacy = false; #if IS_ENABLED(CONFIG_VIRTIO_PCI_LEGACY) module_param(force_legacy, bool, 0444); MODULE_PARM_DESC(force_legacy, "Force legacy mode for transitional virtio 1 devices"); #endif bool vp_is_avq(struct virtio_device *vdev, unsigned int index) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) return false; return index == vp_dev->admin_vq.vq_index; } /* wait for pending irq handlers */ void vp_synchronize_vectors(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); int i; if (vp_dev->intx_enabled) synchronize_irq(vp_dev->pci_dev->irq); for (i = 0; i < vp_dev->msix_vectors; ++i) synchronize_irq(pci_irq_vector(vp_dev->pci_dev, i)); } /* the notify function used when creating a virt queue */ bool vp_notify(struct virtqueue *vq) { /* we write the queue's selector into the notification register to * signal the other end */ iowrite16(vq->index, (void __iomem *)vq->priv); return true; } /* Notify all slow path virtqueues on an interrupt. */ static void vp_vring_slow_path_interrupt(int irq, struct virtio_pci_device *vp_dev) { struct virtio_pci_vq_info *info; unsigned long flags; spin_lock_irqsave(&vp_dev->lock, flags); list_for_each_entry(info, &vp_dev->slow_virtqueues, node) vring_interrupt(irq, info->vq); spin_unlock_irqrestore(&vp_dev->lock, flags); } /* Handle a configuration change: Tell driver if it wants to know. */ static irqreturn_t vp_config_changed(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; virtio_config_changed(&vp_dev->vdev); vp_vring_slow_path_interrupt(irq, vp_dev); return IRQ_HANDLED; } /* Notify all virtqueues on an interrupt. */ static irqreturn_t vp_vring_interrupt(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; struct virtio_pci_vq_info *info; irqreturn_t ret = IRQ_NONE; unsigned long flags; spin_lock_irqsave(&vp_dev->lock, flags); list_for_each_entry(info, &vp_dev->virtqueues, node) { if (vring_interrupt(irq, info->vq) == IRQ_HANDLED) ret = IRQ_HANDLED; } spin_unlock_irqrestore(&vp_dev->lock, flags); return ret; } /* A small wrapper to also acknowledge the interrupt when it's handled. * I really need an EIO hook for the vring so I can ack the interrupt once we * know that we'll be handling the IRQ but before we invoke the callback since * the callback may notify the host which results in the host attempting to * raise an interrupt that we would then mask once we acknowledged the * interrupt. */ static irqreturn_t vp_interrupt(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; u8 isr; /* reading the ISR has the effect of also clearing it so it's very * important to save off the value. */ isr = ioread8(vp_dev->isr); /* It's definitely not us if the ISR was not high */ if (!isr) return IRQ_NONE; /* Configuration change? Tell driver if it wants to know. */ if (isr & VIRTIO_PCI_ISR_CONFIG) vp_config_changed(irq, opaque); return vp_vring_interrupt(irq, opaque); } static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, bool per_vq_vectors, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); const char *name = dev_name(&vp_dev->vdev.dev); unsigned int flags = PCI_IRQ_MSIX; unsigned int i, v; int err = -ENOMEM; vp_dev->msix_vectors = nvectors; vp_dev->msix_names = kmalloc_objs(*vp_dev->msix_names, nvectors); if (!vp_dev->msix_names) goto error; vp_dev->msix_affinity_masks = kzalloc_objs(*vp_dev->msix_affinity_masks, nvectors); if (!vp_dev->msix_affinity_masks) goto error; for (i = 0; i < nvectors; ++i) if (!alloc_cpumask_var(&vp_dev->msix_affinity_masks[i], GFP_KERNEL)) goto error; if (!per_vq_vectors) desc = NULL; if (desc) { flags |= PCI_IRQ_AFFINITY; desc->pre_vectors++; /* virtio config vector */ } err = pci_alloc_irq_vectors_affinity(vp_dev->pci_dev, nvectors, nvectors, flags, desc); if (err < 0) goto error; vp_dev->msix_enabled = 1; /* Set the vector used for configuration */ v = vp_dev->msix_used_vectors; snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, "%s-config", name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, v), vp_config_changed, 0, vp_dev->msix_names[v], vp_dev); if (err) goto error; ++vp_dev->msix_used_vectors; v = vp_dev->config_vector(vp_dev, v); /* Verify we had enough resources to assign the vector */ if (v == VIRTIO_MSI_NO_VECTOR) { err = -EBUSY; goto error; } if (!per_vq_vectors) { /* Shared vector for all VQs */ v = vp_dev->msix_used_vectors; snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, "%s-virtqueues", name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, v), vp_vring_interrupt, 0, vp_dev->msix_names[v], vp_dev); if (err) goto error; ++vp_dev->msix_used_vectors; } return 0; error: return err; } static bool vp_is_slow_path_vector(u16 msix_vec) { return msix_vec == VP_MSIX_CONFIG_VECTOR; } static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int index, void (*callback)(struct virtqueue *vq), const char *name, bool ctx, u16 msix_vec, struct virtio_pci_vq_info **p_info) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info = kmalloc_obj(*info); struct virtqueue *vq; unsigned long flags; /* fill out our structure that represents an active queue */ if (!info) return ERR_PTR(-ENOMEM); vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, ctx, msix_vec); if (IS_ERR(vq)) goto out_info; info->vq = vq; if (callback) { spin_lock_irqsave(&vp_dev->lock, flags); if (!vp_is_slow_path_vector(msix_vec)) list_add(&info->node, &vp_dev->virtqueues); else list_add(&info->node, &vp_dev->slow_virtqueues); spin_unlock_irqrestore(&vp_dev->lock, flags); } else { INIT_LIST_HEAD(&info->node); } *p_info = info; return vq; out_info: kfree(info); return vq; } static void vp_del_vq(struct virtqueue *vq, struct virtio_pci_vq_info *info) { struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); unsigned long flags; /* * If it fails during re-enable reset vq. This way we won't rejoin * info->node to the queue. Prevent unexpected irqs. */ if (!vq->reset) { spin_lock_irqsave(&vp_dev->lock, flags); list_del(&info->node); spin_unlock_irqrestore(&vp_dev->lock, flags); } vp_dev->del_vq(info); kfree(info); } /* the config->del_vqs() implementation */ void vp_del_vqs(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info; struct virtqueue *vq, *n; int i; list_for_each_entry_safe(vq, n, &vdev->vqs, list) { info = vp_is_avq(vdev, vq->index) ? vp_dev->admin_vq.info : vp_dev->vqs[vq->index]; if (vp_dev->per_vq_vectors) { int v = info->msix_vector; if (v != VIRTIO_MSI_NO_VECTOR && !vp_is_slow_path_vector(v)) { int irq = pci_irq_vector(vp_dev->pci_dev, v); irq_update_affinity_hint(irq, NULL); free_irq(irq, vq); } } vp_del_vq(vq, info); } vp_dev->per_vq_vectors = false; if (vp_dev->intx_enabled) { free_irq(vp_dev->pci_dev->irq, vp_dev); vp_dev->intx_enabled = 0; } for (i = 0; i < vp_dev->msix_used_vectors; ++i) free_irq(pci_irq_vector(vp_dev->pci_dev, i), vp_dev); if (vp_dev->msix_affinity_masks) { for (i = 0; i < vp_dev->msix_vectors; i++) free_cpumask_var(vp_dev->msix_affinity_masks[i]); } if (vp_dev->msix_enabled) { /* Disable the vector used for configuration */ vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR); pci_free_irq_vectors(vp_dev->pci_dev); vp_dev->msix_enabled = 0; } vp_dev->msix_vectors = 0; vp_dev->msix_used_vectors = 0; kfree(vp_dev->msix_names); vp_dev->msix_names = NULL; kfree(vp_dev->msix_affinity_masks); vp_dev->msix_affinity_masks = NULL; kfree(vp_dev->vqs); vp_dev->vqs = NULL; } enum vp_vq_vector_policy { VP_VQ_VECTOR_POLICY_EACH, VP_VQ_VECTOR_POLICY_SHARED_SLOW, VP_VQ_VECTOR_POLICY_SHARED, }; static struct virtqueue * vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, vq_callback_t *callback, const char *name, bool ctx, bool slow_path, int *allocated_vectors, enum vp_vq_vector_policy vector_policy, struct virtio_pci_vq_info **p_info) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq; u16 msix_vec; int err; if (!callback) msix_vec = VIRTIO_MSI_NO_VECTOR; else if (vector_policy == VP_VQ_VECTOR_POLICY_EACH || (vector_policy == VP_VQ_VECTOR_POLICY_SHARED_SLOW && !slow_path)) msix_vec = (*allocated_vectors)++; else if (vector_policy != VP_VQ_VECTOR_POLICY_EACH && slow_path) msix_vec = VP_MSIX_CONFIG_VECTOR; else msix_vec = VP_MSIX_VQ_VECTOR; vq = vp_setup_vq(vdev, queue_idx, callback, name, ctx, msix_vec, p_info); if (IS_ERR(vq)) return vq; if (vector_policy == VP_VQ_VECTOR_POLICY_SHARED || msix_vec == VIRTIO_MSI_NO_VECTOR || vp_is_slow_path_vector(msix_vec)) return vq; /* allocate per-vq irq if available and necessary */ snprintf(vp_dev->msix_names[msix_vec], sizeof(*vp_dev->msix_names), "%s-%s", dev_name(&vp_dev->vdev.dev), name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec), vring_interrupt, 0, vp_dev->msix_names[msix_vec], vq); if (err) { vp_del_vq(vq, *p_info); return ERR_PTR(err); } return vq; } static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[], enum vp_vq_vector_policy vector_policy, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; struct virtqueue_info *vqi; int i, err, nvectors, allocated_vectors, queue_idx = 0; struct virtqueue *vq; bool per_vq_vectors; u16 avq_num = 0; vp_dev->vqs = kzalloc_objs(*vp_dev->vqs, nvqs); if (!vp_dev->vqs) return -ENOMEM; if (vp_dev->avq_index) { err = vp_dev->avq_index(vdev, &avq->vq_index, &avq_num); if (err) goto error_find; } per_vq_vectors = vector_policy != VP_VQ_VECTOR_POLICY_SHARED; if (per_vq_vectors) { /* Best option: one for change interrupt, one per vq. */ nvectors = 1; for (i = 0; i < nvqs; ++i) { vqi = &vqs_info[i]; if (vqi->name && vqi->callback) ++nvectors; } if (avq_num && vector_policy == VP_VQ_VECTOR_POLICY_EACH) ++nvectors; } else { /* Second best: one for change, shared for all vqs. */ nvectors = 2; } err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors, desc); if (err) goto error_find; vp_dev->per_vq_vectors = per_vq_vectors; allocated_vectors = vp_dev->msix_used_vectors; for (i = 0; i < nvqs; ++i) { vqi = &vqs_info[i]; if (!vqi->name) { vqs[i] = NULL; continue; } vqs[i] = vp_find_one_vq_msix(vdev, queue_idx++, vqi->callback, vqi->name, vqi->ctx, false, &allocated_vectors, vector_policy, &vp_dev->vqs[i]); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto error_find; } } if (!avq_num) return 0; sprintf(avq->name, "avq.%u", avq->vq_index); vq = vp_find_one_vq_msix(vdev, avq->vq_index, vp_modern_avq_done, avq->name, false, true, &allocated_vectors, vector_policy, &vp_dev->admin_vq.info); if (IS_ERR(vq)) { err = PTR_ERR(vq); goto error_find; } return 0; error_find: vp_del_vqs(vdev); return err; } static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[]) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; int i, err, queue_idx = 0; struct virtqueue *vq; u16 avq_num = 0; vp_dev->vqs = kzalloc_objs(*vp_dev->vqs, nvqs); if (!vp_dev->vqs) return -ENOMEM; if (vp_dev->avq_index) { err = vp_dev->avq_index(vdev, &avq->vq_index, &avq_num); if (err) goto out_del_vqs; } err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vp_dev); if (err) goto out_del_vqs; vp_dev->intx_enabled = 1; vp_dev->per_vq_vectors = false; for (i = 0; i < nvqs; ++i) { struct virtqueue_info *vqi = &vqs_info[i]; if (!vqi->name) { vqs[i] = NULL; continue; } vqs[i] = vp_setup_vq(vdev, queue_idx++, vqi->callback, vqi->name, vqi->ctx, VIRTIO_MSI_NO_VECTOR, &vp_dev->vqs[i]); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto out_del_vqs; } } if (!avq_num) return 0; sprintf(avq->name, "avq.%u", avq->vq_index); vq = vp_setup_vq(vdev, queue_idx++, vp_modern_avq_done, avq->name, false, VIRTIO_MSI_NO_VECTOR, &vp_dev->admin_vq.info); if (IS_ERR(vq)) { err = PTR_ERR(vq); goto out_del_vqs; } return 0; out_del_vqs: vp_del_vqs(vdev); return err; } /* the config->find_vqs() implementation */ int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[], struct irq_affinity *desc) { int err; /* Try MSI-X with one vector per queue. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_EACH, desc); if (!err) return 0; /* Fallback: MSI-X with one shared vector for config and * slow path queues, one vector per queue for the rest. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_SHARED_SLOW, desc); if (!err) return 0; /* Fallback: MSI-X with one vector for config, one shared for queues. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_SHARED, desc); if (!err) return 0; /* Is there an interrupt? If not give up. */ if (!(to_vp_device(vdev)->pci_dev->irq)) return err; /* Finally fall back to regular interrupts. */ return vp_find_vqs_intx(vdev, nvqs, vqs, vqs_info); } const char *vp_bus_name(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); return pci_name(vp_dev->pci_dev); } /* Setup the affinity for a virtqueue: * - force the affinity for per vq vector * - OR over all affinities for shared MSI * - ignore the affinity request if we're using INTX */ int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) { struct virtio_device *vdev = vq->vdev; struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index]; struct cpumask *mask; unsigned int irq; if (!vq->callback) return -EINVAL; if (vp_dev->msix_enabled) { mask = vp_dev->msix_affinity_masks[info->msix_vector]; irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector); if (!cpu_mask) irq_update_affinity_hint(irq, NULL); else { cpumask_copy(mask, cpu_mask); irq_set_affinity_and_hint(irq, mask); } } return 0; } const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); if (!vp_dev->per_vq_vectors || vp_dev->vqs[index]->msix_vector == VIRTIO_MSI_NO_VECTOR || vp_is_slow_path_vector(vp_dev->vqs[index]->msix_vector)) return NULL; return pci_irq_get_affinity(vp_dev->pci_dev, vp_dev->vqs[index]->msix_vector); } #ifdef CONFIG_PM_SLEEP static int virtio_pci_freeze(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; ret = virtio_device_freeze(&vp_dev->vdev); if (!ret) pci_disable_device(pci_dev); return ret; } static int virtio_pci_restore(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; ret = pci_enable_device(pci_dev); if (ret) return ret; pci_set_master(pci_dev); return virtio_device_restore(&vp_dev->vdev); } static bool vp_supports_pm_no_reset(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); u16 pmcsr; if (!pci_dev->pm_cap) return false; pci_read_config_word(pci_dev, pci_dev->pm_cap + PCI_PM_CTRL, &pmcsr); if (PCI_POSSIBLE_ERROR(pmcsr)) { dev_err(dev, "Unable to query pmcsr"); return false; } return pmcsr & PCI_PM_CTRL_NO_SOFT_RESET; } static int virtio_pci_suspend(struct device *dev) { return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_freeze(dev); } static int virtio_pci_resume(struct device *dev) { return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_restore(dev); } static const struct dev_pm_ops virtio_pci_pm_ops = { .suspend = virtio_pci_suspend, .resume = virtio_pci_resume, .freeze = virtio_pci_freeze, .thaw = virtio_pci_restore, .poweroff = virtio_pci_freeze, .restore = virtio_pci_restore, }; #endif /* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */ static const struct pci_device_id virtio_pci_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) }, { 0 } }; MODULE_DEVICE_TABLE(pci, virtio_pci_id_table); static void virtio_pci_release_dev(struct device *_d) { struct virtio_device *vdev = dev_to_virtio(_d); struct virtio_pci_device *vp_dev = to_vp_device(vdev); /* As struct device is a kobject, it's not safe to * free the memory (including the reference counter itself) * until it's release callback. */ kfree(vp_dev); } static int virtio_pci_probe(struct pci_dev *pci_dev, const struct pci_device_id *id) { struct virtio_pci_device *vp_dev, *reg_dev = NULL; int rc; /* allocate our structure and fill it out */ vp_dev = kzalloc_obj(struct virtio_pci_device); if (!vp_dev) return -ENOMEM; pci_set_drvdata(pci_dev, vp_dev); vp_dev->vdev.dev.parent = &pci_dev->dev; vp_dev->vdev.dev.release = virtio_pci_release_dev; vp_dev->pci_dev = pci_dev; INIT_LIST_HEAD(&vp_dev->virtqueues); INIT_LIST_HEAD(&vp_dev->slow_virtqueues); spin_lock_init(&vp_dev->lock); /* enable the device */ rc = pci_enable_device(pci_dev); if (rc) goto err_enable_device; if (force_legacy) { rc = virtio_pci_legacy_probe(vp_dev); /* Also try modern mode if we can't map BAR0 (no IO space). */ if (rc == -ENODEV || rc == -ENOMEM) rc = virtio_pci_modern_probe(vp_dev); if (rc) goto err_probe; } else { rc = virtio_pci_modern_probe(vp_dev); if (rc == -ENODEV) rc = virtio_pci_legacy_probe(vp_dev); if (rc) goto err_probe; } pci_set_master(pci_dev); rc = register_virtio_device(&vp_dev->vdev); reg_dev = vp_dev; if (rc) goto err_register; return 0; err_register: if (vp_dev->is_legacy) virtio_pci_legacy_remove(vp_dev); else virtio_pci_modern_remove(vp_dev); err_probe: pci_disable_device(pci_dev); err_enable_device: if (reg_dev) put_device(&vp_dev->vdev.dev); else kfree(vp_dev); return rc; } static void virtio_pci_remove(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); struct device *dev = get_device(&vp_dev->vdev.dev); /* * Device is marked broken on surprise removal so that virtio upper * layers can abort any ongoing operation. */ if (!pci_device_is_present(pci_dev)) virtio_break_device(&vp_dev->vdev); pci_disable_sriov(pci_dev); unregister_virtio_device(&vp_dev->vdev); if (vp_dev->is_legacy) virtio_pci_legacy_remove(vp_dev); else virtio_pci_modern_remove(vp_dev); pci_disable_device(pci_dev); put_device(dev); } static int virtio_pci_sriov_configure(struct pci_dev *pci_dev, int num_vfs) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); struct virtio_device *vdev = &vp_dev->vdev; int ret; if (!(vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK)) return -EBUSY; if (!__virtio_test_bit(vdev, VIRTIO_F_SR_IOV)) return -EINVAL; if (pci_vfs_assigned(pci_dev)) return -EPERM; if (num_vfs == 0) { pci_disable_sriov(pci_dev); return 0; } ret = pci_enable_sriov(pci_dev, num_vfs); if (ret < 0) return ret; return num_vfs; } static void virtio_pci_reset_prepare(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret = 0; ret = virtio_device_reset_prepare(&vp_dev->vdev); if (ret) { if (ret != -EOPNOTSUPP) dev_warn(&pci_dev->dev, "Reset prepare failure: %d", ret); return; } if (pci_is_enabled(pci_dev)) pci_disable_device(pci_dev); } static void virtio_pci_reset_done(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; if (pci_is_enabled(pci_dev)) return; ret = pci_enable_device(pci_dev); if (!ret) { pci_set_master(pci_dev); ret = virtio_device_reset_done(&vp_dev->vdev); } if (ret && ret != -EOPNOTSUPP) dev_warn(&pci_dev->dev, "Reset done failure: %d", ret); } static const struct pci_error_handlers virtio_pci_err_handler = { .reset_prepare = virtio_pci_reset_prepare, .reset_done = virtio_pci_reset_done, }; static struct pci_driver virtio_pci_driver = { .name = "virtio-pci", .id_table = virtio_pci_id_table, .probe = virtio_pci_probe, .remove = virtio_pci_remove, #ifdef CONFIG_PM_SLEEP .driver.pm = &virtio_pci_pm_ops, #endif .sriov_configure = virtio_pci_sriov_configure, .err_handler = &virtio_pci_err_handler, }; struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev) { struct virtio_pci_device *pf_vp_dev; pf_vp_dev = pci_iov_get_pf_drvdata(pdev, &virtio_pci_driver); if (IS_ERR(pf_vp_dev)) return NULL; return &pf_vp_dev->vdev; } module_pci_driver(virtio_pci_driver); MODULE_AUTHOR("Anthony Liguori <aliguori@us.ibm.com>"); MODULE_DESCRIPTION("virtio-pci"); MODULE_LICENSE("GPL"); MODULE_VERSION("1");
1304 1306 1087 1169 166 33 23 193 489 585 586 42 498 374 166 282 11 11 447 620 619 620 641 620 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_NEIGHBOUR_H #define _NET_NEIGHBOUR_H #include <linux/neighbour.h> /* * Generic neighbour manipulation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * Changes: * * Harald Welte: <laforge@gnumonks.org> * - Add neighbour cache statistics like rtstat */ #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> #include <linux/bitmap.h> #include <linux/err.h> #include <linux/sysctl.h> #include <linux/workqueue.h> #include <net/rtnetlink.h> #include <net/neighbour_tables.h> /* * NUD stands for "neighbor unreachability detection" */ #define NUD_IN_TIMER (NUD_INCOMPLETE|NUD_REACHABLE|NUD_DELAY|NUD_PROBE) #define NUD_VALID (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE|NUD_PROBE|NUD_STALE|NUD_DELAY) #define NUD_CONNECTED (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE) struct neighbour; enum { NEIGH_VAR_MCAST_PROBES, NEIGH_VAR_UCAST_PROBES, NEIGH_VAR_APP_PROBES, NEIGH_VAR_MCAST_REPROBES, NEIGH_VAR_RETRANS_TIME, NEIGH_VAR_BASE_REACHABLE_TIME, NEIGH_VAR_DELAY_PROBE_TIME, NEIGH_VAR_INTERVAL_PROBE_TIME_MS, NEIGH_VAR_GC_STALETIME, NEIGH_VAR_QUEUE_LEN_BYTES, NEIGH_VAR_PROXY_QLEN, NEIGH_VAR_ANYCAST_DELAY, NEIGH_VAR_PROXY_DELAY, NEIGH_VAR_LOCKTIME, #define NEIGH_VAR_DATA_MAX (NEIGH_VAR_LOCKTIME + 1) /* Following are used as a second way to access one of the above */ NEIGH_VAR_QUEUE_LEN, /* same data as NEIGH_VAR_QUEUE_LEN_BYTES */ NEIGH_VAR_RETRANS_TIME_MS, /* same data as NEIGH_VAR_RETRANS_TIME */ NEIGH_VAR_BASE_REACHABLE_TIME_MS, /* same data as NEIGH_VAR_BASE_REACHABLE_TIME */ /* Following are used by "default" only */ NEIGH_VAR_GC_INTERVAL, NEIGH_VAR_GC_THRESH1, NEIGH_VAR_GC_THRESH2, NEIGH_VAR_GC_THRESH3, NEIGH_VAR_MAX }; struct neigh_parms { possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; struct list_head list; int (*neigh_setup)(struct neighbour *); struct neigh_table *tbl; void *sysctl_table; int dead; refcount_t refcnt; struct rcu_head rcu_head; int reachable_time; u32 qlen; int data[NEIGH_VAR_DATA_MAX]; DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX); }; static inline void neigh_var_set(struct neigh_parms *p, int index, int val) { set_bit(index, p->data_state); WRITE_ONCE(p->data[index], val); } #define __NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr]) #define NEIGH_VAR(p, attr) READ_ONCE(__NEIGH_VAR(p, attr)) #define NEIGH_VAR_PTR(p, attr) (&(__NEIGH_VAR(p, attr))) /* In ndo_neigh_setup, NEIGH_VAR_INIT should be used. * In other cases, NEIGH_VAR_SET should be used. */ #define NEIGH_VAR_INIT(p, attr, val) (__NEIGH_VAR(p, attr) = val) #define NEIGH_VAR_SET(p, attr, val) neigh_var_set(p, NEIGH_VAR_ ## attr, val) static inline void neigh_parms_data_state_setall(struct neigh_parms *p) { bitmap_fill(p->data_state, NEIGH_VAR_DATA_MAX); } static inline void neigh_parms_data_state_cleanall(struct neigh_parms *p) { bitmap_zero(p->data_state, NEIGH_VAR_DATA_MAX); } struct neigh_statistics { unsigned long allocs; /* number of allocated neighs */ unsigned long destroys; /* number of destroyed neighs */ unsigned long hash_grows; /* number of hash resizes */ unsigned long res_failed; /* number of failed resolutions */ unsigned long lookups; /* number of lookups */ unsigned long hits; /* number of hits (among lookups) */ unsigned long rcv_probes_mcast; /* number of received mcast ipv6 */ unsigned long rcv_probes_ucast; /* number of received ucast ipv6 */ unsigned long periodic_gc_runs; /* number of periodic GC runs */ unsigned long forced_gc_runs; /* number of forced GC runs */ unsigned long unres_discards; /* number of unresolved drops */ unsigned long table_fulls; /* times even gc couldn't help */ }; #define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field) struct neighbour { struct hlist_node hash; struct hlist_node dev_list; struct neigh_table *tbl; struct neigh_parms *parms; unsigned long confirmed; unsigned long updated; rwlock_t lock; refcount_t refcnt; unsigned int arp_queue_len_bytes; struct sk_buff_head arp_queue; struct timer_list timer; unsigned long used; atomic_t probes; u8 nud_state; u8 type; u8 dead; u8 protocol; u32 flags; seqlock_t ha_lock; unsigned char ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))] __aligned(8); struct hh_cache hh; int (*output)(struct neighbour *, struct sk_buff *); const struct neigh_ops *ops; struct list_head gc_list; struct list_head managed_list; struct rcu_head rcu; struct net_device *dev; netdevice_tracker dev_tracker; u8 primary_key[]; } __randomize_layout; struct neigh_ops { int family; void (*solicit)(struct neighbour *, struct sk_buff *); void (*error_report)(struct neighbour *, struct sk_buff *); int (*output)(struct neighbour *, struct sk_buff *); int (*connected_output)(struct neighbour *, struct sk_buff *); }; struct pneigh_entry { struct pneigh_entry __rcu *next; possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; union { struct list_head free_node; struct rcu_head rcu; }; u32 flags; u8 protocol; bool permanent; u32 key[]; }; /* * neighbour table manipulation */ #define NEIGH_NUM_HASH_RND 4 struct neigh_hash_table { struct hlist_head *hash_heads; unsigned int hash_shift; __u32 hash_rnd[NEIGH_NUM_HASH_RND]; struct rcu_head rcu; }; struct neigh_table { int family; unsigned int entry_size; unsigned int key_len; __be16 protocol; __u32 (*hash)(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); bool (*key_eq)(const struct neighbour *, const void *pkey); int (*constructor)(struct neighbour *); int (*pconstructor)(struct pneigh_entry *); void (*pdestructor)(struct pneigh_entry *); void (*proxy_redo)(struct sk_buff *skb); int (*is_multicast)(const void *pkey); bool (*allow_add)(const struct net_device *dev, struct netlink_ext_ack *extack); char *id; struct neigh_parms parms; struct list_head parms_list; int gc_interval; int gc_thresh1; int gc_thresh2; int gc_thresh3; unsigned long last_flush; struct delayed_work gc_work; struct delayed_work managed_work; struct timer_list proxy_timer; struct sk_buff_head proxy_queue; atomic_t entries; atomic_t gc_entries; struct list_head gc_list; struct list_head managed_list; spinlock_t lock; unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; struct mutex phash_lock; struct pneigh_entry __rcu **phash_buckets; }; static inline int neigh_parms_family(struct neigh_parms *p) { return p->tbl->family; } #define NEIGH_PRIV_ALIGN sizeof(long long) #define NEIGH_ENTRY_SIZE(size) ALIGN((size), NEIGH_PRIV_ALIGN) static inline void *neighbour_priv(const struct neighbour *n) { return (char *)n + n->tbl->entry_size; } /* flags for neigh_update() */ #define NEIGH_UPDATE_F_OVERRIDE BIT(0) #define NEIGH_UPDATE_F_WEAK_OVERRIDE BIT(1) #define NEIGH_UPDATE_F_OVERRIDE_ISROUTER BIT(2) #define NEIGH_UPDATE_F_USE BIT(3) #define NEIGH_UPDATE_F_MANAGED BIT(4) #define NEIGH_UPDATE_F_EXT_LEARNED BIT(5) #define NEIGH_UPDATE_F_ISROUTER BIT(6) #define NEIGH_UPDATE_F_ADMIN BIT(7) #define NEIGH_UPDATE_F_EXT_VALIDATED BIT(8) /* In-kernel representation for NDA_FLAGS_EXT flags: */ #define NTF_OLD_MASK 0xff #define NTF_EXT_SHIFT 8 #define NTF_EXT_MASK (NTF_EXT_MANAGED | NTF_EXT_EXT_VALIDATED) #define NTF_MANAGED (NTF_EXT_MANAGED << NTF_EXT_SHIFT) #define NTF_EXT_VALIDATED (NTF_EXT_EXT_VALIDATED << NTF_EXT_SHIFT) extern const struct nla_policy nda_policy[]; #define neigh_for_each_in_bucket(pos, head) hlist_for_each_entry(pos, head, hash) #define neigh_for_each_in_bucket_rcu(pos, head) \ hlist_for_each_entry_rcu(pos, head, hash) #define neigh_for_each_in_bucket_safe(pos, tmp, head) \ hlist_for_each_entry_safe(pos, tmp, head, hash) static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey) { return *(const u32 *)n->primary_key == *(const u32 *)pkey; } static inline bool neigh_key_eq128(const struct neighbour *n, const void *pkey) { const u32 *n32 = (const u32 *)n->primary_key; const u32 *p32 = pkey; return ((n32[0] ^ p32[0]) | (n32[1] ^ p32[1]) | (n32[2] ^ p32[2]) | (n32[3] ^ p32[3])) == 0; } static inline struct neighbour *___neigh_lookup_noref( struct neigh_table *tbl, bool (*key_eq)(const struct neighbour *n, const void *pkey), __u32 (*hash)(const void *pkey, const struct net_device *dev, __u32 *hash_rnd), const void *pkey, struct net_device *dev) { struct neigh_hash_table *nht = rcu_dereference(tbl->nht); struct neighbour *n; u32 hash_val; hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[hash_val]) if (n->dev == dev && key_eq(n, pkey)) return n; return NULL; } static inline struct neighbour *__neigh_lookup_noref(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { return ___neigh_lookup_noref(tbl, tbl->key_eq, tbl->hash, pkey, dev); } static inline void neigh_confirm(struct neighbour *n) { if (n) { unsigned long now = jiffies; /* avoid dirtying neighbour */ if (READ_ONCE(n->confirmed) != now) WRITE_ONCE(n->confirmed, now); } } void neigh_table_init(int index, struct neigh_table *tbl); int neigh_table_clear(int index, struct neigh_table *tbl); struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev); struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref); static inline struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { return __neigh_create(tbl, pkey, dev, true); } void neigh_destroy(struct neighbour *neigh); int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, const bool immediate_ok); int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); bool neigh_remove_one(struct neighbour *ndel); void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev); int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev); int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb); int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb); int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb); struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev); struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl); void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms); static inline struct net *neigh_parms_net(const struct neigh_parms *parms) { return read_pnet(&parms->net); } unsigned long neigh_rand_reach_time(unsigned long base); static inline void neigh_set_reach_time(struct neigh_parms *p) { unsigned long base = NEIGH_VAR(p, BASE_REACHABLE_TIME); WRITE_ONCE(p->reachable_time, neigh_rand_reach_time(base)); } void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); int pneigh_create(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev, u32 flags, u8 protocol, bool permanent); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); static inline struct net *pneigh_net(const struct pneigh_entry *pneigh) { return read_pnet(&pneigh->net); } void neigh_app_ns(struct neighbour *n); void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie); void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)); int neigh_xmit(int fam, struct net_device *, const void *, struct sk_buff *); struct neigh_seq_state { struct seq_net_private p; struct neigh_table *tbl; struct neigh_hash_table *nht; void *(*neigh_sub_iter)(struct neigh_seq_state *state, struct neighbour *n, loff_t *pos); unsigned int bucket; unsigned int flags; #define NEIGH_SEQ_NEIGH_ONLY 0x00000001 #define NEIGH_SEQ_IS_PNEIGH 0x00000002 #define NEIGH_SEQ_SKIP_NOARP 0x00000004 }; void *neigh_seq_start(struct seq_file *, loff_t *, struct neigh_table *, unsigned int); void *neigh_seq_next(struct seq_file *, void *, loff_t *); void neigh_seq_stop(struct seq_file *, void *); int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, proc_handler *proc_handler); void neigh_sysctl_unregister(struct neigh_parms *p); static inline void __neigh_parms_put(struct neigh_parms *parms) { refcount_dec(&parms->refcnt); } static inline struct neigh_parms *neigh_parms_clone(struct neigh_parms *parms) { refcount_inc(&parms->refcnt); return parms; } /* * Neighbour references */ static inline void neigh_release(struct neighbour *neigh) { if (refcount_dec_and_test(&neigh->refcnt)) neigh_destroy(neigh); } static inline struct neighbour * neigh_clone(struct neighbour *neigh) { if (neigh) refcount_inc(&neigh->refcnt); return neigh; } #define neigh_hold(n) refcount_inc(&(n)->refcnt) static __always_inline int neigh_event_send_probe(struct neighbour *neigh, struct sk_buff *skb, const bool immediate_ok) { unsigned long now = jiffies; if (READ_ONCE(neigh->used) != now) WRITE_ONCE(neigh->used, now); if (!(READ_ONCE(neigh->nud_state) & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))) return __neigh_event_send(neigh, skb, immediate_ok); return 0; } static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { return neigh_event_send_probe(neigh, skb, true); } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { unsigned int seq, hh_alen; do { seq = read_seqbegin(&hh->hh_lock); hh_alen = HH_DATA_ALIGN(ETH_HLEN); memcpy(skb->data - hh_alen, hh->hh_data, ETH_ALEN + hh_alen - ETH_HLEN); } while (read_seqretry(&hh->hh_lock, seq)); return 0; } #endif static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb) { unsigned int hh_alen = 0; unsigned int seq; unsigned int hh_len; do { seq = read_seqbegin(&hh->hh_lock); hh_len = READ_ONCE(hh->hh_len); if (likely(hh_len <= HH_DATA_MOD)) { hh_alen = HH_DATA_MOD; /* skb_push() would proceed silently if we have room for * the unaligned size but not for the aligned size: * check headroom explicitly. */ if (likely(skb_headroom(skb) >= HH_DATA_MOD)) { /* this is inlined by gcc */ memcpy(skb->data - HH_DATA_MOD, hh->hh_data, HH_DATA_MOD); } } else { hh_alen = HH_DATA_ALIGN(hh_len); if (likely(skb_headroom(skb) >= hh_alen)) { memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); } } } while (read_seqretry(&hh->hh_lock, seq)); if (WARN_ON_ONCE(skb_headroom(skb) < hh_alen)) { kfree_skb(skb); return NET_XMIT_DROP; } __skb_push(skb, hh_len); return dev_queue_xmit(skb); } static inline int neigh_output(struct neighbour *n, struct sk_buff *skb, bool skip_cache) { const struct hh_cache *hh = &n->hh; /* n->nud_state and hh->hh_len could be changed under us. * neigh_hh_output() is taking care of the race later. */ if (!skip_cache && (READ_ONCE(n->nud_state) & NUD_CONNECTED) && READ_ONCE(hh->hh_len)) return neigh_hh_output(hh, skb); return READ_ONCE(n->output)(n, skb); } static inline struct neighbour * __neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat) { struct neighbour *n = neigh_lookup(tbl, pkey, dev); if (n || !creat) return n; n = neigh_create(tbl, pkey, dev); return IS_ERR(n) ? NULL : n; } static inline struct neighbour * __neigh_lookup_errno(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct neighbour *n = neigh_lookup(tbl, pkey, dev); if (n) return n; return neigh_create(tbl, pkey, dev); } struct neighbour_cb { unsigned long sched_next; unsigned int flags; }; #define LOCALLY_ENQUEUED 0x1 #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb) static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n, const struct net_device *dev) { unsigned int seq; do { seq = read_seqbegin(&n->ha_lock); memcpy(dst, n->ha, dev->addr_len); } while (read_seqretry(&n->ha_lock, seq)); } static inline void neigh_update_is_router(struct neighbour *neigh, u32 flags, int *notify) { u8 ndm_flags = 0; ndm_flags |= (flags & NEIGH_UPDATE_F_ISROUTER) ? NTF_ROUTER : 0; if ((neigh->flags ^ ndm_flags) & NTF_ROUTER) { if (ndm_flags & NTF_ROUTER) neigh->flags |= NTF_ROUTER; else neigh->flags &= ~NTF_ROUTER; *notify = 1; } } #endif
20 309 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PARAVIRT_H #define _ASM_X86_PARAVIRT_H /* Various instructions on x86 need to be replaced for * para-virtualization: those hooks are defined here. */ #ifndef __ASSEMBLER__ #include <asm/paravirt-base.h> #endif #include <asm/paravirt_types.h> #ifdef CONFIG_PARAVIRT #include <asm/pgtable_types.h> #include <asm/asm.h> #include <asm/nospec-branch.h> #ifndef __ASSEMBLER__ #include <linux/types.h> #include <linux/cpumask.h> #include <asm/frame.h> /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { PVOP_VCALL0(pv_ops, cpu.io_delay); #ifdef REALLY_SLOW_IO PVOP_VCALL0(pv_ops, cpu.io_delay); PVOP_VCALL0(pv_ops, cpu.io_delay); PVOP_VCALL0(pv_ops, cpu.io_delay); #endif } void native_flush_tlb_local(void); void native_flush_tlb_global(void); void native_flush_tlb_one_user(unsigned long addr); void native_flush_tlb_multi(const struct cpumask *cpumask, const struct flush_tlb_info *info); static inline void __flush_tlb_local(void) { PVOP_VCALL0(pv_ops, mmu.flush_tlb_user); } static inline void __flush_tlb_global(void) { PVOP_VCALL0(pv_ops, mmu.flush_tlb_kernel); } static inline void __flush_tlb_one_user(unsigned long addr) { PVOP_VCALL1(pv_ops, mmu.flush_tlb_one_user, addr); } static inline void __flush_tlb_multi(const struct cpumask *cpumask, const struct flush_tlb_info *info) { PVOP_VCALL2(pv_ops, mmu.flush_tlb_multi, cpumask, info); } static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { PVOP_VCALL1(pv_ops, mmu.exit_mmap, mm); } static inline void notify_page_enc_status_changed(unsigned long pfn, int npages, bool enc) { PVOP_VCALL3(pv_ops, mmu.notify_page_enc_status_changed, pfn, npages, enc); } static __always_inline void arch_safe_halt(void) { PVOP_VCALL0(pv_ops, irq.safe_halt); } static inline void halt(void) { PVOP_VCALL0(pv_ops, irq.halt); } #ifdef CONFIG_PARAVIRT_XXL static inline void load_sp0(unsigned long sp0) { PVOP_VCALL1(pv_ops, cpu.load_sp0, sp0); } /* The paravirtualized CPUID instruction. */ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { PVOP_VCALL4(pv_ops, cpu.cpuid, eax, ebx, ecx, edx); } /* * These special macros can be used to get or set a debugging register */ static __always_inline unsigned long paravirt_get_debugreg(int reg) { return PVOP_CALL1(unsigned long, pv_ops, cpu.get_debugreg, reg); } #define get_debugreg(var, reg) var = paravirt_get_debugreg(reg) static __always_inline void set_debugreg(unsigned long val, int reg) { PVOP_VCALL2(pv_ops, cpu.set_debugreg, reg, val); } static inline unsigned long read_cr0(void) { return PVOP_CALL0(unsigned long, pv_ops, cpu.read_cr0); } static inline void write_cr0(unsigned long x) { PVOP_VCALL1(pv_ops, cpu.write_cr0, x); } static __always_inline unsigned long read_cr2(void) { return PVOP_ALT_CALLEE0(unsigned long, pv_ops, mmu.read_cr2, "mov %%cr2, %%rax", ALT_NOT_XEN); } static __always_inline void write_cr2(unsigned long x) { PVOP_VCALL1(pv_ops, mmu.write_cr2, x); } static inline unsigned long __read_cr3(void) { return PVOP_ALT_CALL0(unsigned long, pv_ops, mmu.read_cr3, "mov %%cr3, %%rax", ALT_NOT_XEN); } static inline void write_cr3(unsigned long x) { PVOP_ALT_VCALL1(pv_ops, mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN); } static inline void __write_cr4(unsigned long x) { PVOP_VCALL1(pv_ops, cpu.write_cr4, x); } static inline u64 paravirt_read_msr(u32 msr) { return PVOP_CALL1(u64, pv_ops, cpu.read_msr, msr); } static inline void paravirt_write_msr(u32 msr, u64 val) { PVOP_VCALL2(pv_ops, cpu.write_msr, msr, val); } static inline int paravirt_read_msr_safe(u32 msr, u64 *val) { return PVOP_CALL2(int, pv_ops, cpu.read_msr_safe, msr, val); } static inline int paravirt_write_msr_safe(u32 msr, u64 val) { return PVOP_CALL2(int, pv_ops, cpu.write_msr_safe, msr, val); } #define rdmsr(msr, val1, val2) \ do { \ u64 _l = paravirt_read_msr(msr); \ val1 = (u32)_l; \ val2 = _l >> 32; \ } while (0) static __always_inline void wrmsr(u32 msr, u32 low, u32 high) { paravirt_write_msr(msr, (u64)high << 32 | low); } #define rdmsrq(msr, val) \ do { \ val = paravirt_read_msr(msr); \ } while (0) static inline void wrmsrq(u32 msr, u64 val) { paravirt_write_msr(msr, val); } static inline int wrmsrq_safe(u32 msr, u64 val) { return paravirt_write_msr_safe(msr, val); } /* rdmsr with exception handling */ #define rdmsr_safe(msr, a, b) \ ({ \ u64 _l; \ int _err = paravirt_read_msr_safe((msr), &_l); \ (*a) = (u32)_l; \ (*b) = (u32)(_l >> 32); \ _err; \ }) static __always_inline int rdmsrq_safe(u32 msr, u64 *p) { return paravirt_read_msr_safe(msr, p); } static __always_inline u64 rdpmc(int counter) { return PVOP_CALL1(u64, pv_ops, cpu.read_pmc, counter); } static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(pv_ops, cpu.alloc_ldt, ldt, entries); } static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(pv_ops, cpu.free_ldt, ldt, entries); } static inline void load_TR_desc(void) { PVOP_VCALL0(pv_ops, cpu.load_tr_desc); } static inline void load_gdt(const struct desc_ptr *dtr) { PVOP_VCALL1(pv_ops, cpu.load_gdt, dtr); } static inline void load_idt(const struct desc_ptr *dtr) { PVOP_VCALL1(pv_ops, cpu.load_idt, dtr); } static inline void set_ldt(const void *addr, unsigned entries) { PVOP_VCALL2(pv_ops, cpu.set_ldt, addr, entries); } static inline unsigned long paravirt_store_tr(void) { return PVOP_CALL0(unsigned long, pv_ops, cpu.store_tr); } #define store_tr(tr) ((tr) = paravirt_store_tr()) static inline void load_TLS(struct thread_struct *t, unsigned cpu) { PVOP_VCALL2(pv_ops, cpu.load_tls, t, cpu); } static inline void load_gs_index(unsigned int gs) { PVOP_VCALL1(pv_ops, cpu.load_gs_index, gs); } static inline void write_ldt_entry(struct desc_struct *dt, int entry, const void *desc) { PVOP_VCALL3(pv_ops, cpu.write_ldt_entry, dt, entry, desc); } static inline void write_gdt_entry(struct desc_struct *dt, int entry, void *desc, int type) { PVOP_VCALL4(pv_ops, cpu.write_gdt_entry, dt, entry, desc, type); } static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) { PVOP_VCALL3(pv_ops, cpu.write_idt_entry, dt, entry, g); } #ifdef CONFIG_X86_IOPL_IOPERM static inline void tss_invalidate_io_bitmap(void) { PVOP_VCALL0(pv_ops, cpu.invalidate_io_bitmap); } static inline void tss_update_io_bitmap(void) { PVOP_VCALL0(pv_ops, cpu.update_io_bitmap); } #endif static inline void paravirt_enter_mmap(struct mm_struct *next) { PVOP_VCALL1(pv_ops, mmu.enter_mmap, next); } static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return PVOP_CALL1(int, pv_ops, mmu.pgd_alloc, mm); } static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) { PVOP_VCALL2(pv_ops, mmu.pgd_free, mm, pgd); } static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(pv_ops, mmu.alloc_pte, mm, pfn); } static inline void paravirt_release_pte(unsigned long pfn) { PVOP_VCALL1(pv_ops, mmu.release_pte, pfn); } static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(pv_ops, mmu.alloc_pmd, mm, pfn); } static inline void paravirt_release_pmd(unsigned long pfn) { PVOP_VCALL1(pv_ops, mmu.release_pmd, pfn); } static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(pv_ops, mmu.alloc_pud, mm, pfn); } static inline void paravirt_release_pud(unsigned long pfn) { PVOP_VCALL1(pv_ops, mmu.release_pud, pfn); } static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(pv_ops, mmu.alloc_p4d, mm, pfn); } static inline void paravirt_release_p4d(unsigned long pfn) { PVOP_VCALL1(pv_ops, mmu.release_p4d, pfn); } static inline pte_t __pte(pteval_t val) { return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, pv_ops, mmu.make_pte, val, "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pteval_t pte_val(pte_t pte) { return PVOP_ALT_CALLEE1(pteval_t, pv_ops, mmu.pte_val, pte.pte, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline pgd_t __pgd(pgdval_t val) { return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, pv_ops, mmu.make_pgd, val, "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pgdval_t pgd_val(pgd_t pgd) { return PVOP_ALT_CALLEE1(pgdval_t, pv_ops, mmu.pgd_val, pgd.pgd, "mov %%rdi, %%rax", ALT_NOT_XEN); } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pteval_t ret; ret = PVOP_CALL3(pteval_t, pv_ops, mmu.ptep_modify_prot_start, vma, addr, ptep); return (pte_t) { .pte = ret }; } static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { PVOP_VCALL4(pv_ops, mmu.ptep_modify_prot_commit, vma, addr, ptep, pte.pte); } static inline void set_pte(pte_t *ptep, pte_t pte) { PVOP_VCALL2(pv_ops, mmu.set_pte, ptep, pte.pte); } static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { PVOP_VCALL2(pv_ops, mmu.set_pmd, pmdp, native_pmd_val(pmd)); } static inline pmd_t __pmd(pmdval_t val) { return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, pv_ops, mmu.make_pmd, val, "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pmdval_t pmd_val(pmd_t pmd) { return PVOP_ALT_CALLEE1(pmdval_t, pv_ops, mmu.pmd_val, pmd.pmd, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void set_pud(pud_t *pudp, pud_t pud) { PVOP_VCALL2(pv_ops, mmu.set_pud, pudp, native_pud_val(pud)); } static inline pud_t __pud(pudval_t val) { pudval_t ret; ret = PVOP_ALT_CALLEE1(pudval_t, pv_ops, mmu.make_pud, val, "mov %%rdi, %%rax", ALT_NOT_XEN); return (pud_t) { ret }; } static inline pudval_t pud_val(pud_t pud) { return PVOP_ALT_CALLEE1(pudval_t, pv_ops, mmu.pud_val, pud.pud, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void pud_clear(pud_t *pudp) { set_pud(pudp, native_make_pud(0)); } static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) { p4dval_t val = native_p4d_val(p4d); PVOP_VCALL2(pv_ops, mmu.set_p4d, p4dp, val); } static inline p4d_t __p4d(p4dval_t val) { p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, pv_ops, mmu.make_p4d, val, "mov %%rdi, %%rax", ALT_NOT_XEN); return (p4d_t) { ret }; } static inline p4dval_t p4d_val(p4d_t p4d) { return PVOP_ALT_CALLEE1(p4dval_t, pv_ops, mmu.p4d_val, p4d.p4d, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) { PVOP_VCALL2(pv_ops, mmu.set_pgd, pgdp, native_pgd_val(pgd)); } #define set_pgd(pgdp, pgdval) do { \ if (pgtable_l5_enabled()) \ __set_pgd(pgdp, pgdval); \ else \ set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd }); \ } while (0) #define pgd_clear(pgdp) do { \ if (pgtable_l5_enabled()) \ set_pgd(pgdp, native_make_pgd(0)); \ } while (0) static inline void p4d_clear(p4d_t *p4dp) { set_p4d(p4dp, native_make_p4d(0)); } static inline void set_pte_atomic(pte_t *ptep, pte_t pte) { set_pte(ptep, pte); } static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { set_pte(ptep, native_make_pte(0)); } static inline void pmd_clear(pmd_t *pmdp) { set_pmd(pmdp, native_make_pmd(0)); } #define __HAVE_ARCH_START_CONTEXT_SWITCH static inline void arch_start_context_switch(struct task_struct *prev) { PVOP_VCALL1(pv_ops, cpu.start_context_switch, prev); } static inline void arch_end_context_switch(struct task_struct *next) { PVOP_VCALL1(pv_ops, cpu.end_context_switch, next); } static inline void arch_enter_lazy_mmu_mode(void) { PVOP_VCALL0(pv_ops, mmu.lazy_mode.enter); } static inline void arch_leave_lazy_mmu_mode(void) { PVOP_VCALL0(pv_ops, mmu.lazy_mode.leave); } static inline void arch_flush_lazy_mmu_mode(void) { PVOP_VCALL0(pv_ops, mmu.lazy_mode.flush); } static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags) { pv_ops.mmu.set_fixmap(idx, phys, flags); } static __always_inline unsigned long arch_local_save_flags(void) { return PVOP_ALT_CALLEE0(unsigned long, pv_ops, irq.save_fl, "pushf; pop %%rax", ALT_NOT_XEN); } static __always_inline void arch_local_irq_disable(void) { PVOP_ALT_VCALLEE0(pv_ops, irq.irq_disable, "cli", ALT_NOT_XEN); } static __always_inline void arch_local_irq_enable(void) { PVOP_ALT_VCALLEE0(pv_ops, irq.irq_enable, "sti", ALT_NOT_XEN); } static __always_inline unsigned long arch_local_irq_save(void) { unsigned long f; f = arch_local_save_flags(); arch_local_irq_disable(); return f; } #endif #else /* __ASSEMBLER__ */ #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT_XXL #ifdef CONFIG_DEBUG_ENTRY #define PARA_INDIRECT(addr) *addr(%rip) .macro PARA_IRQ_save_fl ANNOTATE_RETPOLINE_SAFE; call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); .endm #define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl", \ "ALT_CALL_INSTR", ALT_CALL_ALWAYS, \ "pushf; pop %rax", ALT_NOT_XEN #endif #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ #endif /* __ASSEMBLER__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop #endif /* !CONFIG_PARAVIRT */ #ifndef __ASSEMBLER__ #ifndef CONFIG_PARAVIRT_XXL static inline void paravirt_enter_mmap(struct mm_struct *mm) { } #endif #ifndef CONFIG_PARAVIRT static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { } #endif #endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PARAVIRT_H */
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 // SPDX-License-Identifier: GPL-2.0-only /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Definitions for the IPPROTO_SMC (socket related) * * Copyright IBM Corp. 2016, 2018 * Copyright (c) 2024, Alibaba Inc. * * Author: D. Wythe <alibuda@linux.alibaba.com> */ #include <net/protocol.h> #include <net/sock.h> #include "smc_inet.h" #include "smc.h" static int smc_inet_init_sock(struct sock *sk); static struct proto smc_inet_prot = { .name = "INET_SMC", .owner = THIS_MODULE, .init = smc_inet_init_sock, .hash = smc_hash_sk, .unhash = smc_unhash_sk, .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v4_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, }; static const struct proto_ops smc_inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = smc_release, .bind = smc_bind, .connect = smc_connect, .socketpair = sock_no_socketpair, .accept = smc_accept, .getname = smc_getname, .poll = smc_poll, .ioctl = smc_ioctl, .listen = smc_listen, .shutdown = smc_shutdown, .setsockopt = smc_setsockopt, .getsockopt = smc_getsockopt, .sendmsg = smc_sendmsg, .recvmsg = smc_recvmsg, .mmap = sock_no_mmap, .splice_read = smc_splice_read, }; static struct inet_protosw smc_inet_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_SMC, .prot = &smc_inet_prot, .ops = &smc_inet_stream_ops, }; #if IS_ENABLED(CONFIG_IPV6) struct smc6_sock { struct smc_sock smc; struct ipv6_pinfo inet6; }; static struct proto smc_inet6_prot = { .name = "INET6_SMC", .owner = THIS_MODULE, .init = smc_inet_init_sock, .hash = smc_hash_sk, .unhash = smc_unhash_sk, .release_cb = smc_release_cb, .obj_size = sizeof(struct smc6_sock), .h.smc_hash = &smc_v6_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, .ipv6_pinfo_offset = offsetof(struct smc6_sock, inet6), }; static const struct proto_ops smc_inet6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = smc_release, .bind = smc_bind, .connect = smc_connect, .socketpair = sock_no_socketpair, .accept = smc_accept, .getname = smc_getname, .poll = smc_poll, .ioctl = smc_ioctl, .listen = smc_listen, .shutdown = smc_shutdown, .setsockopt = smc_setsockopt, .getsockopt = smc_getsockopt, .sendmsg = smc_sendmsg, .recvmsg = smc_recvmsg, .mmap = sock_no_mmap, .splice_read = smc_splice_read, }; static struct inet_protosw smc_inet6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_SMC, .prot = &smc_inet6_prot, .ops = &smc_inet6_stream_ops, }; #endif /* CONFIG_IPV6 */ static int smc_inet_init_sock(struct sock *sk) { struct net *net = sock_net(sk); /* init common smc sock */ smc_sk_init(net, sk, IPPROTO_SMC); /* create clcsock */ return smc_create_clcsk(net, sk, sk->sk_family); } int __init smc_inet_init(void) { int rc; rc = proto_register(&smc_inet_prot, 1); if (rc) { pr_err("%s: proto_register smc_inet_prot fails with %d\n", __func__, rc); return rc; } /* no return value */ inet_register_protosw(&smc_inet_protosw); #if IS_ENABLED(CONFIG_IPV6) rc = proto_register(&smc_inet6_prot, 1); if (rc) { pr_err("%s: proto_register smc_inet6_prot fails with %d\n", __func__, rc); goto out_inet6_prot; } rc = inet6_register_protosw(&smc_inet6_protosw); if (rc) { pr_err("%s: inet6_register_protosw smc_inet6_protosw fails with %d\n", __func__, rc); goto out_inet6_protosw; } return rc; out_inet6_protosw: proto_unregister(&smc_inet6_prot); out_inet6_prot: inet_unregister_protosw(&smc_inet_protosw); proto_unregister(&smc_inet_prot); #endif /* CONFIG_IPV6 */ return rc; } void smc_inet_exit(void) { #if IS_ENABLED(CONFIG_IPV6) inet6_unregister_protosw(&smc_inet6_protosw); proto_unregister(&smc_inet6_prot); #endif /* CONFIG_IPV6 */ inet_unregister_protosw(&smc_inet_protosw); proto_unregister(&smc_inet_prot); }
4 5 5 5 5 1 2 5 1 1 3 5 2 1 2 4 4 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2010-2011 EIA Electronics, // Pieter Beyens <pieter.beyens@eia.be> // Copyright (c) 2010-2011 EIA Electronics, // Kurt Van Dijck <kurt.van.dijck@eia.be> // Copyright (c) 2018 Protonic, // Robin van der Gracht <robin@protonic.nl> // Copyright (c) 2017-2019 Pengutronix, // Marc Kleine-Budde <kernel@pengutronix.de> // Copyright (c) 2017-2019 Pengutronix, // Oleksij Rempel <kernel@pengutronix.de> #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/can/can-ml.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/errqueue.h> #include <linux/if_arp.h> #include <net/can.h> #include "j1939-priv.h" #define J1939_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.j1939) /* conversion function between struct sock::sk_priority from linux and * j1939 priority field */ static inline priority_t j1939_prio(u32 sk_priority) { sk_priority = min(sk_priority, 7U); return 7 - sk_priority; } static inline u32 j1939_to_sk_priority(priority_t prio) { return 7 - prio; } /* function to see if pgn is to be evaluated */ static inline bool j1939_pgn_is_valid(pgn_t pgn) { return pgn <= J1939_PGN_MAX; } /* test function to avoid non-zero DA placeholder for pdu1 pgn's */ static inline bool j1939_pgn_is_clean_pdu(pgn_t pgn) { if (j1939_pgn_is_pdu1(pgn)) return !(pgn & 0xff); else return true; } static inline void j1939_sock_pending_add(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); atomic_inc(&jsk->skb_pending); } static int j1939_sock_pending_get(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); return atomic_read(&jsk->skb_pending); } void j1939_sock_pending_del(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* atomic_dec_return returns the new value */ if (!atomic_dec_return(&jsk->skb_pending)) wake_up(&jsk->waitq); /* no pending SKB's */ } static void j1939_jsk_add(struct j1939_priv *priv, struct j1939_sock *jsk) { jsk->state |= J1939_SOCK_BOUND; j1939_priv_get(priv); write_lock_bh(&priv->j1939_socks_lock); list_add_tail(&jsk->list, &priv->j1939_socks); write_unlock_bh(&priv->j1939_socks_lock); } static void j1939_jsk_del(struct j1939_priv *priv, struct j1939_sock *jsk) { write_lock_bh(&priv->j1939_socks_lock); list_del_init(&jsk->list); write_unlock_bh(&priv->j1939_socks_lock); j1939_priv_put(priv); jsk->state &= ~J1939_SOCK_BOUND; } static bool j1939_sk_queue_session(struct j1939_session *session) { struct j1939_sock *jsk = j1939_sk(session->sk); bool empty; spin_lock_bh(&jsk->sk_session_queue_lock); empty = list_empty(&jsk->sk_session_queue); j1939_session_get(session); list_add_tail(&session->sk_session_queue_entry, &jsk->sk_session_queue); spin_unlock_bh(&jsk->sk_session_queue_lock); j1939_sock_pending_add(&jsk->sk); return empty; } static struct j1939_session *j1939_sk_get_incomplete_session(struct j1939_sock *jsk) { struct j1939_session *session = NULL; spin_lock_bh(&jsk->sk_session_queue_lock); if (!list_empty(&jsk->sk_session_queue)) { session = list_last_entry(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); if (session->total_queued_size == session->total_message_size) session = NULL; else j1939_session_get(session); } spin_unlock_bh(&jsk->sk_session_queue_lock); return session; } static void j1939_sk_queue_drop_all(struct j1939_priv *priv, struct j1939_sock *jsk, int err) { struct j1939_session *session, *tmp; netdev_dbg(priv->ndev, "%s: err: %i\n", __func__, err); spin_lock_bh(&jsk->sk_session_queue_lock); list_for_each_entry_safe(session, tmp, &jsk->sk_session_queue, sk_session_queue_entry) { list_del_init(&session->sk_session_queue_entry); session->err = err; j1939_session_put(session); } spin_unlock_bh(&jsk->sk_session_queue_lock); } static void j1939_sk_queue_activate_next_locked(struct j1939_session *session) { struct j1939_sock *jsk; struct j1939_session *first; int err; /* RX-Session don't have a socket (yet) */ if (!session->sk) return; jsk = j1939_sk(session->sk); lockdep_assert_held(&jsk->sk_session_queue_lock); err = session->err; first = list_first_entry_or_null(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); /* Some else has already activated the next session */ if (first != session) return; activate_next: list_del_init(&first->sk_session_queue_entry); j1939_session_put(first); first = list_first_entry_or_null(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); if (!first) return; if (j1939_session_activate(first)) { netdev_warn_once(first->priv->ndev, "%s: 0x%p: Identical session is already activated.\n", __func__, first); first->err = -EBUSY; goto activate_next; } else { /* Give receiver some time (arbitrary chosen) to recover */ int time_ms = 0; if (err) time_ms = 10 + get_random_u32_below(16); j1939_tp_schedule_txtimer(first, time_ms); } } void j1939_sk_queue_activate_next(struct j1939_session *session) { struct j1939_sock *jsk; if (!session->sk) return; jsk = j1939_sk(session->sk); spin_lock_bh(&jsk->sk_session_queue_lock); j1939_sk_queue_activate_next_locked(session); spin_unlock_bh(&jsk->sk_session_queue_lock); } static bool j1939_sk_match_dst(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { if ((jsk->state & J1939_SOCK_PROMISC)) return true; /* Destination address filter */ if (jsk->addr.src_name && skcb->addr.dst_name) { if (jsk->addr.src_name != skcb->addr.dst_name) return false; } else { /* receive (all sockets) if * - all packages that match our bind() address * - all broadcast on a socket if SO_BROADCAST * is set */ if (j1939_address_is_unicast(skcb->addr.da)) { if (jsk->addr.sa != skcb->addr.da) return false; } else if (!sock_flag(&jsk->sk, SOCK_BROADCAST)) { /* receiving broadcast without SO_BROADCAST * flag is not allowed */ return false; } } /* Source address filter */ if (jsk->state & J1939_SOCK_CONNECTED) { /* receive (all sockets) if * - all packages that match our connect() name or address */ if (jsk->addr.dst_name && skcb->addr.src_name) { if (jsk->addr.dst_name != skcb->addr.src_name) return false; } else { if (jsk->addr.da != skcb->addr.sa) return false; } } /* PGN filter */ if (j1939_pgn_is_valid(jsk->pgn_rx_filter) && jsk->pgn_rx_filter != skcb->addr.pgn) return false; return true; } /* matches skb control buffer (addr) with a j1939 filter */ static bool j1939_sk_match_filter(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { const struct j1939_filter *f; int nfilter; spin_lock_bh(&jsk->filters_lock); f = jsk->filters; nfilter = jsk->nfilters; if (!nfilter) /* receive all when no filters are assigned */ goto filter_match_found; for (; nfilter; ++f, --nfilter) { if ((skcb->addr.pgn & f->pgn_mask) != f->pgn) continue; if ((skcb->addr.sa & f->addr_mask) != f->addr) continue; if ((skcb->addr.src_name & f->name_mask) != f->name) continue; goto filter_match_found; } spin_unlock_bh(&jsk->filters_lock); return false; filter_match_found: spin_unlock_bh(&jsk->filters_lock); return true; } static bool j1939_sk_recv_match_one(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { if (!(jsk->state & J1939_SOCK_BOUND)) return false; if (!j1939_sk_match_dst(jsk, skcb)) return false; if (!j1939_sk_match_filter(jsk, skcb)) return false; return true; } static void j1939_sk_recv_one(struct j1939_sock *jsk, struct sk_buff *oskb) { const struct j1939_sk_buff_cb *oskcb = j1939_skb_to_cb(oskb); struct j1939_sk_buff_cb *skcb; enum skb_drop_reason reason; struct sk_buff *skb; if (oskb->sk == &jsk->sk) return; if (!j1939_sk_recv_match_one(jsk, oskcb)) return; skb = skb_clone(oskb, GFP_ATOMIC); if (!skb) { pr_warn("skb clone failed\n"); return; } can_skb_set_owner(skb, oskb->sk); skcb = j1939_skb_to_cb(skb); skcb->msg_flags &= ~(MSG_DONTROUTE); if (skb->sk) skcb->msg_flags |= MSG_DONTROUTE; if (sock_queue_rcv_skb_reason(&jsk->sk, skb, &reason) < 0) sk_skb_reason_drop(&jsk->sk, skb, reason); } bool j1939_sk_recv_match(struct j1939_priv *priv, struct j1939_sk_buff_cb *skcb) { struct j1939_sock *jsk; bool match = false; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { match = j1939_sk_recv_match_one(jsk, skcb); if (match) break; } read_unlock_bh(&priv->j1939_socks_lock); return match; } void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sock *jsk; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { j1939_sk_recv_one(jsk, skb); } read_unlock_bh(&priv->j1939_socks_lock); } static void j1939_sk_sock_destruct(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* This function will be called by the generic networking code, when * the socket is ultimately closed (sk->sk_destruct). * * The race between * - processing a received CAN frame * (can_receive -> j1939_can_recv) * and accessing j1939_priv * ... and ... * - closing a socket * (j1939_can_rx_unregister -> can_rx_unregister) * and calling the final j1939_priv_put() * * is avoided by calling the final j1939_priv_put() from this * RCU deferred cleanup call. */ if (jsk->priv) { j1939_priv_put(jsk->priv); jsk->priv = NULL; } /* call generic CAN sock destruct */ can_sock_destruct(sk); } static int j1939_sk_init(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* Ensure that "sk" is first member in "struct j1939_sock", so that we * can skip it during memset(). */ BUILD_BUG_ON(offsetof(struct j1939_sock, sk) != 0); memset((void *)jsk + sizeof(jsk->sk), 0x0, sizeof(*jsk) - sizeof(jsk->sk)); INIT_LIST_HEAD(&jsk->list); init_waitqueue_head(&jsk->waitq); jsk->sk.sk_priority = j1939_to_sk_priority(6); jsk->sk.sk_reuse = 1; /* per default */ jsk->addr.sa = J1939_NO_ADDR; jsk->addr.da = J1939_NO_ADDR; jsk->addr.pgn = J1939_NO_PGN; jsk->pgn_rx_filter = J1939_NO_PGN; atomic_set(&jsk->skb_pending, 0); spin_lock_init(&jsk->sk_session_queue_lock); INIT_LIST_HEAD(&jsk->sk_session_queue); spin_lock_init(&jsk->filters_lock); /* j1939_sk_sock_destruct() depends on SOCK_RCU_FREE flag */ sock_set_flag(sk, SOCK_RCU_FREE); sk->sk_destruct = j1939_sk_sock_destruct; sk->sk_protocol = CAN_J1939; return 0; } static int j1939_sk_sanity_check(struct sockaddr_can *addr, int len) { if (!addr) return -EDESTADDRREQ; if (len < J1939_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; if (!addr->can_ifindex) return -ENODEV; if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) return -EINVAL; return 0; } static int j1939_sk_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); struct j1939_priv *priv; struct sock *sk; struct net *net; int ret = 0; ret = j1939_sk_sanity_check(addr, len); if (ret) return ret; lock_sock(sock->sk); priv = jsk->priv; sk = sock->sk; net = sock_net(sk); /* Already bound to an interface? */ if (jsk->state & J1939_SOCK_BOUND) { /* A re-bind() to a different interface is not * supported. */ if (jsk->ifindex != addr->can_ifindex) { ret = -EINVAL; goto out_release_sock; } /* drop old references */ j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); } else { struct can_ml_priv *can_ml; struct net_device *ndev; ndev = dev_get_by_index(net, addr->can_ifindex); if (!ndev) { ret = -ENODEV; goto out_release_sock; } if (ndev->reg_state != NETREG_REGISTERED) { dev_put(ndev); ret = -ENODEV; goto out_release_sock; } can_ml = can_get_ml_priv(ndev); if (!can_ml) { dev_put(ndev); ret = -ENODEV; goto out_release_sock; } if (!(ndev->flags & IFF_UP)) { dev_put(ndev); ret = -ENETDOWN; goto out_release_sock; } priv = j1939_netdev_start(ndev); dev_put(ndev); if (IS_ERR(priv)) { ret = PTR_ERR(priv); goto out_release_sock; } jsk->ifindex = addr->can_ifindex; /* the corresponding j1939_priv_put() is called via * sk->sk_destruct, which points to j1939_sk_sock_destruct() */ j1939_priv_get(priv); jsk->priv = priv; } /* set default transmit pgn */ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) jsk->pgn_rx_filter = addr->can_addr.j1939.pgn; jsk->addr.src_name = addr->can_addr.j1939.name; jsk->addr.sa = addr->can_addr.j1939.addr; /* get new references */ ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa); if (ret) { j1939_netdev_stop(priv); jsk->priv = NULL; synchronize_rcu(); j1939_priv_put(priv); goto out_release_sock; } j1939_jsk_add(priv, jsk); out_release_sock: /* fall through */ release_sock(sock->sk); return ret; } static int j1939_sk_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); int ret = 0; ret = j1939_sk_sanity_check(addr, len); if (ret) return ret; lock_sock(sock->sk); /* bind() before connect() is mandatory */ if (!(jsk->state & J1939_SOCK_BOUND)) { ret = -EINVAL; goto out_release_sock; } /* A connect() to a different interface is not supported. */ if (jsk->ifindex != addr->can_ifindex) { ret = -EINVAL; goto out_release_sock; } if (!addr->can_addr.j1939.name && addr->can_addr.j1939.addr == J1939_NO_ADDR && !sock_flag(&jsk->sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto out_release_sock; } jsk->addr.dst_name = addr->can_addr.j1939.name; jsk->addr.da = addr->can_addr.j1939.addr; if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) jsk->addr.pgn = addr->can_addr.j1939.pgn; jsk->state |= J1939_SOCK_CONNECTED; out_release_sock: /* fall through */ release_sock(sock->sk); return ret; } static void j1939_sk_sock2sockaddr_can(struct sockaddr_can *addr, const struct j1939_sock *jsk, int peer) { /* There are two holes (2 bytes and 3 bytes) to clear to avoid * leaking kernel information to user space. */ memset(addr, 0, J1939_MIN_NAMELEN); addr->can_family = AF_CAN; addr->can_ifindex = jsk->ifindex; addr->can_addr.j1939.pgn = jsk->addr.pgn; if (peer) { addr->can_addr.j1939.name = jsk->addr.dst_name; addr->can_addr.j1939.addr = jsk->addr.da; } else { addr->can_addr.j1939.name = jsk->addr.src_name; addr->can_addr.j1939.addr = jsk->addr.sa; } } static int j1939_sk_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int ret = 0; lock_sock(sk); if (peer && !(jsk->state & J1939_SOCK_CONNECTED)) { ret = -EADDRNOTAVAIL; goto failure; } j1939_sk_sock2sockaddr_can(addr, jsk, peer); ret = J1939_MIN_NAMELEN; failure: release_sock(sk); return ret; } static int j1939_sk_release(struct socket *sock) { struct sock *sk = sock->sk; struct j1939_sock *jsk; if (!sk) return 0; lock_sock(sk); jsk = j1939_sk(sk); if (jsk->state & J1939_SOCK_BOUND) { struct j1939_priv *priv = jsk->priv; if (wait_event_interruptible(jsk->waitq, !j1939_sock_pending_get(&jsk->sk))) { j1939_cancel_active_session(priv, sk); j1939_sk_queue_drop_all(priv, jsk, ESHUTDOWN); } j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); j1939_netdev_stop(priv); } kfree(jsk->filters); sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); sock_put(sk); return 0; } static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, sockptr_t optval, unsigned int optlen, int flag) { int tmp; if (optlen != sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; lock_sock(&jsk->sk); if (tmp) jsk->state |= flag; else jsk->state &= ~flag; release_sock(&jsk->sk); return tmp; } static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int tmp, count = 0, ret = 0; struct j1939_filter *filters = NULL, *ofilters; if (level != SOL_CAN_J1939) return -EINVAL; switch (optname) { case SO_J1939_FILTER: if (!sockptr_is_null(optval) && optlen != 0) { struct j1939_filter *f; int c; if (optlen % sizeof(*filters) != 0) return -EINVAL; if (optlen > J1939_FILTER_MAX * sizeof(struct j1939_filter)) return -EINVAL; count = optlen / sizeof(*filters); filters = memdup_sockptr(optval, optlen); if (IS_ERR(filters)) return PTR_ERR(filters); for (f = filters, c = count; c; f++, c--) { f->name &= f->name_mask; f->pgn &= f->pgn_mask; f->addr &= f->addr_mask; } } lock_sock(&jsk->sk); spin_lock_bh(&jsk->filters_lock); ofilters = jsk->filters; jsk->filters = filters; jsk->nfilters = count; spin_unlock_bh(&jsk->filters_lock); release_sock(&jsk->sk); kfree(ofilters); return 0; case SO_J1939_PROMISC: return j1939_sk_setsockopt_flag(jsk, optval, optlen, J1939_SOCK_PROMISC); case SO_J1939_ERRQUEUE: ret = j1939_sk_setsockopt_flag(jsk, optval, optlen, J1939_SOCK_ERRQUEUE); if (ret < 0) return ret; if (!(jsk->state & J1939_SOCK_ERRQUEUE)) skb_queue_purge(&sk->sk_error_queue); return ret; case SO_J1939_SEND_PRIO: if (optlen != sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; if (tmp < 0 || tmp > 7) return -EDOM; if (tmp < 2 && !capable(CAP_NET_ADMIN)) return -EPERM; lock_sock(&jsk->sk); jsk->sk.sk_priority = j1939_to_sk_priority(tmp); release_sock(&jsk->sk); return 0; default: return -ENOPROTOOPT; } } static int j1939_sk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int ret, ulen; /* set defaults for using 'int' properties */ int tmp = 0; int len = sizeof(tmp); void *val = &tmp; if (level != SOL_CAN_J1939) return -EINVAL; if (get_user(ulen, optlen)) return -EFAULT; if (ulen < 0) return -EINVAL; lock_sock(&jsk->sk); switch (optname) { case SO_J1939_PROMISC: tmp = (jsk->state & J1939_SOCK_PROMISC) ? 1 : 0; break; case SO_J1939_ERRQUEUE: tmp = (jsk->state & J1939_SOCK_ERRQUEUE) ? 1 : 0; break; case SO_J1939_SEND_PRIO: tmp = j1939_prio(jsk->sk.sk_priority); break; default: ret = -ENOPROTOOPT; goto no_copy; } /* copy to user, based on 'len' & 'val' * but most sockopt's are 'int' properties, and have 'len' & 'val' * left unchanged, but instead modified 'tmp' */ if (len > ulen) ret = -EFAULT; else if (put_user(len, optlen)) ret = -EFAULT; else if (copy_to_user(optval, val, len)) ret = -EFAULT; else ret = 0; no_copy: release_sock(&jsk->sk); return ret; } static int j1939_sk_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; struct j1939_sk_buff_cb *skcb; int ret = 0; if (flags & ~(MSG_DONTWAIT | MSG_ERRQUEUE | MSG_CMSG_COMPAT)) return -EINVAL; if (flags & MSG_ERRQUEUE) return sock_recv_errqueue(sock->sk, msg, size, SOL_CAN_J1939, SCM_J1939_ERRQUEUE); skb = skb_recv_datagram(sk, flags, &ret); if (!skb) return ret; if (size < skb->len) msg->msg_flags |= MSG_TRUNC; else size = skb->len; ret = memcpy_to_msg(msg, skb->data, size); if (ret < 0) { skb_free_datagram(sk, skb); return ret; } skcb = j1939_skb_to_cb(skb); if (j1939_address_is_valid(skcb->addr.da)) put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_ADDR, sizeof(skcb->addr.da), &skcb->addr.da); if (skcb->addr.dst_name) put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_NAME, sizeof(skcb->addr.dst_name), &skcb->addr.dst_name); put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_PRIO, sizeof(skcb->priority), &skcb->priority); if (msg->msg_name) { struct sockaddr_can *paddr = msg->msg_name; msg->msg_namelen = J1939_MIN_NAMELEN; memset(msg->msg_name, 0, msg->msg_namelen); paddr->can_family = AF_CAN; paddr->can_ifindex = skb->skb_iif; paddr->can_addr.j1939.name = skcb->addr.src_name; paddr->can_addr.j1939.addr = skcb->addr.sa; paddr->can_addr.j1939.pgn = skcb->addr.pgn; } sock_recv_cmsgs(msg, sk, skb); msg->msg_flags |= skcb->msg_flags; skb_free_datagram(sk, skb); return size; } static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev, struct sock *sk, struct msghdr *msg, size_t size, int *errcode) { struct j1939_sock *jsk = j1939_sk(sk); struct j1939_sk_buff_cb *skcb; struct sk_buff *skb; struct can_skb_ext *csx; int ret; skb = sock_alloc_send_skb(sk, size + sizeof(struct can_frame) - sizeof(((struct can_frame *)NULL)->data), msg->msg_flags & MSG_DONTWAIT, &ret); if (!skb) goto failure; csx = can_skb_ext_add(skb); if (!csx) { kfree_skb(skb); ret = -ENOMEM; goto failure; } csx->can_iif = ndev->ifindex; skb_reserve(skb, offsetof(struct can_frame, data)); ret = memcpy_from_msg(skb_put(skb, size), msg, size); if (ret < 0) goto free_skb; skb->dev = ndev; skcb = j1939_skb_to_cb(skb); memset(skcb, 0, sizeof(*skcb)); skcb->addr = jsk->addr; skcb->priority = j1939_prio(READ_ONCE(sk->sk_priority)); if (msg->msg_name) { struct sockaddr_can *addr = msg->msg_name; if (addr->can_addr.j1939.name || addr->can_addr.j1939.addr != J1939_NO_ADDR) { skcb->addr.dst_name = addr->can_addr.j1939.name; skcb->addr.da = addr->can_addr.j1939.addr; } if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) skcb->addr.pgn = addr->can_addr.j1939.pgn; } *errcode = ret; return skb; free_skb: kfree_skb(skb); failure: *errcode = ret; return NULL; } static size_t j1939_sk_opt_stats_get_size(enum j1939_sk_errqueue_type type) { switch (type) { case J1939_ERRQUEUE_RX_RTS: return nla_total_size(sizeof(u32)) + /* J1939_NLA_TOTAL_SIZE */ nla_total_size(sizeof(u32)) + /* J1939_NLA_PGN */ nla_total_size(sizeof(u64)) + /* J1939_NLA_SRC_NAME */ nla_total_size(sizeof(u64)) + /* J1939_NLA_DEST_NAME */ nla_total_size(sizeof(u8)) + /* J1939_NLA_SRC_ADDR */ nla_total_size(sizeof(u8)) + /* J1939_NLA_DEST_ADDR */ 0; default: return nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */ 0; } } static struct sk_buff * j1939_sk_get_timestamping_opt_stats(struct j1939_session *session, enum j1939_sk_errqueue_type type) { struct sk_buff *stats; u32 size; stats = alloc_skb(j1939_sk_opt_stats_get_size(type), GFP_ATOMIC); if (!stats) return NULL; if (session->skcb.addr.type == J1939_SIMPLE) size = session->total_message_size; else size = min(session->pkt.tx_acked * 7, session->total_message_size); switch (type) { case J1939_ERRQUEUE_RX_RTS: nla_put_u32(stats, J1939_NLA_TOTAL_SIZE, session->total_message_size); nla_put_u32(stats, J1939_NLA_PGN, session->skcb.addr.pgn); nla_put_u64_64bit(stats, J1939_NLA_SRC_NAME, session->skcb.addr.src_name, J1939_NLA_PAD); nla_put_u64_64bit(stats, J1939_NLA_DEST_NAME, session->skcb.addr.dst_name, J1939_NLA_PAD); nla_put_u8(stats, J1939_NLA_SRC_ADDR, session->skcb.addr.sa); nla_put_u8(stats, J1939_NLA_DEST_ADDR, session->skcb.addr.da); break; default: nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size); } return stats; } static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; struct j1939_sock *jsk; struct sock_exterr_skb *serr; struct sk_buff *skb; char *state = "UNK"; u32 tsflags; int err; jsk = j1939_sk(sk); if (!(jsk->state & J1939_SOCK_ERRQUEUE)) return; tsflags = READ_ONCE(sk->sk_tsflags); switch (type) { case J1939_ERRQUEUE_TX_ACK: if (!(tsflags & SOF_TIMESTAMPING_TX_ACK)) return; break; case J1939_ERRQUEUE_TX_SCHED: if (!(tsflags & SOF_TIMESTAMPING_TX_SCHED)) return; break; case J1939_ERRQUEUE_TX_ABORT: break; case J1939_ERRQUEUE_RX_RTS: fallthrough; case J1939_ERRQUEUE_RX_DPO: fallthrough; case J1939_ERRQUEUE_RX_ABORT: if (!(tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)) return; break; default: netdev_err(priv->ndev, "Unknown errqueue type %i\n", type); } skb = j1939_sk_get_timestamping_opt_stats(session, type); if (!skb) return; skb->tstamp = ktime_get_real(); BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); switch (type) { case J1939_ERRQUEUE_TX_ACK: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_ACK; state = "TX ACK"; break; case J1939_ERRQUEUE_TX_SCHED: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_SCHED; state = "TX SCH"; break; case J1939_ERRQUEUE_TX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_TX_ABORT; state = "TX ABT"; break; case J1939_ERRQUEUE_RX_RTS: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_RTS; state = "RX RTS"; break; case J1939_ERRQUEUE_RX_DPO: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_DPO; state = "RX DPO"; break; case J1939_ERRQUEUE_RX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_ABORT; state = "RX ABT"; break; } serr->opt_stats = true; if (tsflags & SOF_TIMESTAMPING_OPT_ID) serr->ee.ee_data = session->tskey; netdev_dbg(session->priv->ndev, "%s: 0x%p tskey: %i, state: %s\n", __func__, session, session->tskey, state); err = sock_queue_err_skb(sk, skb); if (err) kfree_skb(skb); }; void j1939_sk_errqueue(struct j1939_session *session, enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; struct j1939_sock *jsk; if (session->sk) { /* send TX notifications to the socket of origin */ __j1939_sk_errqueue(session, session->sk, type); return; } /* spread RX notifications to all sockets subscribed to this session */ read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { if (j1939_sk_recv_match_one(jsk, &session->skcb)) __j1939_sk_errqueue(session, &jsk->sk, type); } read_unlock_bh(&priv->j1939_socks_lock); }; void j1939_sk_send_loop_abort(struct sock *sk, int err) { struct j1939_sock *jsk = j1939_sk(sk); if (jsk->state & J1939_SOCK_ERRQUEUE) return; sk->sk_err = err; sk_error_report(sk); } static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, struct msghdr *msg, size_t size) { struct j1939_sock *jsk = j1939_sk(sk); struct j1939_session *session = j1939_sk_get_incomplete_session(jsk); struct sk_buff *skb; size_t segment_size, todo_size; int ret = 0; if (session && session->total_message_size != session->total_queued_size + size) { j1939_session_put(session); return -EIO; } todo_size = size; do { struct j1939_sk_buff_cb *skcb; segment_size = min_t(size_t, J1939_MAX_TP_PACKET_SIZE, todo_size); /* Allocate skb for one segment */ skb = j1939_sk_alloc_skb(priv->ndev, sk, msg, segment_size, &ret); if (ret) break; skcb = j1939_skb_to_cb(skb); if (!session) { /* at this point the size should be full size * of the session */ skcb->offset = 0; session = j1939_tp_send(priv, skb, size); if (IS_ERR(session)) { ret = PTR_ERR(session); goto kfree_skb; } if (j1939_sk_queue_session(session)) { /* try to activate session if we a * fist in the queue */ if (!j1939_session_activate(session)) { j1939_tp_schedule_txtimer(session, 0); } else { ret = -EBUSY; session->err = ret; j1939_sk_queue_drop_all(priv, jsk, EBUSY); break; } } } else { skcb->offset = session->total_queued_size; j1939_session_skb_queue(session, skb); } todo_size -= segment_size; session->total_queued_size += segment_size; } while (todo_size); switch (ret) { case 0: /* OK */ if (todo_size) netdev_warn(priv->ndev, "no error found and not completely queued?! %zu\n", todo_size); ret = size; break; case -ERESTARTSYS: ret = -EINTR; fallthrough; case -EAGAIN: /* OK */ if (todo_size != size) ret = size - todo_size; break; default: /* ERROR */ break; } if (session) j1939_session_put(session); return ret; kfree_skb: kfree_skb(skb); return ret; } static int j1939_sk_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); struct j1939_priv *priv; int ifindex; int ret; lock_sock(sock->sk); /* various socket state tests */ if (!(jsk->state & J1939_SOCK_BOUND)) { ret = -EBADFD; goto sendmsg_done; } priv = jsk->priv; ifindex = jsk->ifindex; if (!jsk->addr.src_name && jsk->addr.sa == J1939_NO_ADDR) { /* no source address assigned yet */ ret = -EBADFD; goto sendmsg_done; } /* deal with provided destination address info */ if (msg->msg_name) { struct sockaddr_can *addr = msg->msg_name; if (msg->msg_namelen < J1939_MIN_NAMELEN) { ret = -EINVAL; goto sendmsg_done; } if (addr->can_family != AF_CAN) { ret = -EINVAL; goto sendmsg_done; } if (addr->can_ifindex && addr->can_ifindex != ifindex) { ret = -EBADFD; goto sendmsg_done; } if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) { ret = -EINVAL; goto sendmsg_done; } if (!addr->can_addr.j1939.name && addr->can_addr.j1939.addr == J1939_NO_ADDR && !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto sendmsg_done; } } else { if (!jsk->addr.dst_name && jsk->addr.da == J1939_NO_ADDR && !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto sendmsg_done; } } ret = j1939_sk_send_loop(priv, sk, msg, size); sendmsg_done: release_sock(sock->sk); return ret; } void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) { struct j1939_sock *jsk; int error_code = ENETDOWN; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { jsk->sk.sk_err = error_code; if (!sock_flag(&jsk->sk, SOCK_DEAD)) sk_error_report(&jsk->sk); j1939_sk_queue_drop_all(priv, jsk, error_code); } read_unlock_bh(&priv->j1939_socks_lock); } void j1939_sk_netdev_event_unregister(struct j1939_priv *priv) { struct sock *sk; struct j1939_sock *jsk; bool wait_rcu = false; rescan: /* The caller is holding a ref on this "priv" via j1939_priv_get_by_ndev(). */ read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { /* Skip if j1939_jsk_add() is not called on this socket. */ if (!(jsk->state & J1939_SOCK_BOUND)) continue; sk = &jsk->sk; sock_hold(sk); read_unlock_bh(&priv->j1939_socks_lock); /* Check if j1939_jsk_del() is not yet called on this socket after holding * socket's lock, for both j1939_sk_bind() and j1939_sk_release() call * j1939_jsk_del() with socket's lock held. */ lock_sock(sk); if (jsk->state & J1939_SOCK_BOUND) { /* Neither j1939_sk_bind() nor j1939_sk_release() called j1939_jsk_del(). * Make this socket no longer bound, by pretending as if j1939_sk_bind() * dropped old references but did not get new references. */ j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); j1939_netdev_stop(priv); /* Call j1939_priv_put() now and prevent j1939_sk_sock_destruct() from * calling the corresponding j1939_priv_put(). * * j1939_sk_sock_destruct() is supposed to call j1939_priv_put() after * an RCU grace period. But since the caller is holding a ref on this * "priv", we can defer synchronize_rcu() until immediately before * the caller calls j1939_priv_put(). */ j1939_priv_put(priv); jsk->priv = NULL; wait_rcu = true; } release_sock(sk); sock_put(sk); goto rescan; } read_unlock_bh(&priv->j1939_socks_lock); if (wait_rcu) synchronize_rcu(); } static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops j1939_ops = { .family = PF_CAN, .release = j1939_sk_release, .bind = j1939_sk_bind, .connect = j1939_sk_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = j1939_sk_getname, .poll = datagram_poll, .ioctl = j1939_sk_no_ioctlcmd, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = j1939_sk_setsockopt, .getsockopt = j1939_sk_getsockopt, .sendmsg = j1939_sk_sendmsg, .recvmsg = j1939_sk_recvmsg, .mmap = sock_no_mmap, }; static struct proto j1939_proto __read_mostly = { .name = "CAN_J1939", .owner = THIS_MODULE, .obj_size = sizeof(struct j1939_sock), .init = j1939_sk_init, }; const struct can_proto j1939_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_J1939, .ops = &j1939_ops, .prot = &j1939_proto, };
1305 847 1259 47 396 42 42 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_TASK_H #define _LINUX_SCHED_TASK_H /* * Interface between the scheduler and various task lifetime (fork()/exit()) * functionality: */ #include <linux/rcupdate.h> #include <linux/refcount.h> #include <linux/sched.h> #include <linux/uaccess.h> struct task_struct; struct rusage; union thread_union; struct css_set; /* All the bits taken by the old clone syscall. */ #define CLONE_LEGACY_FLAGS 0xffffffffULL struct kernel_clone_args { u64 flags; int __user *pidfd; int __user *child_tid; int __user *parent_tid; const char *name; int exit_signal; u32 kthread:1; u32 io_thread:1; u32 user_worker:1; u32 no_files:1; unsigned long stack; unsigned long stack_size; unsigned long tls; pid_t *set_tid; /* Number of elements in *set_tid */ size_t set_tid_size; int cgroup; int idle; int (*fn)(void *); void *fn_arg; struct cgroup *cgrp; struct css_set *cset; unsigned int kill_seq; }; /* * This serializes "schedule()" and also protects * the run-queue from deletions/modifications (but * _adding_ to the beginning of the run-queue has * a separate lock). */ extern rwlock_t tasklist_lock; extern spinlock_t mmlist_lock; extern union thread_union init_thread_union; extern struct task_struct init_task; extern int lockdep_tasklist_lock_is_held(void); extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(u64 clone_flags, struct task_struct *p); extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); void __noreturn make_task_dead(int signr); extern void mm_cache_init(void); extern void proc_caches_init(void); extern void fork_init(void); extern void release_task(struct task_struct * p); extern int copy_thread(struct task_struct *, const struct kernel_clone_args *); extern void flush_thread(void); #ifdef CONFIG_HAVE_EXIT_THREAD extern void exit_thread(struct task_struct *tsk); #else static inline void exit_thread(struct task_struct *tsk) { } #endif extern __noreturn void do_group_exit(int); extern void exit_files(struct task_struct *); extern void exit_itimers(struct task_struct *); extern pid_t kernel_clone(struct kernel_clone_args *kargs); struct task_struct *copy_process(struct pid *pid, int trace, int node, struct kernel_clone_args *args); struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, unsigned long flags); extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags); extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); int kernel_wait(pid_t pid, int *stat); extern void free_task(struct task_struct *tsk); /* sched_exec is called by processes performing an exec */ extern void sched_exec(void); static inline struct task_struct *get_task_struct(struct task_struct *t) { refcount_inc(&t->usage); return t; } static inline struct task_struct *tryget_task_struct(struct task_struct *t) { return refcount_inc_not_zero(&t->usage) ? t : NULL; } extern void __put_task_struct(struct task_struct *t); extern void __put_task_struct_rcu_cb(struct rcu_head *rhp); static inline void put_task_struct(struct task_struct *t) { if (!refcount_dec_and_test(&t->usage)) return; /* * Under PREEMPT_RT, we can't call __put_task_struct * in atomic context because it will indirectly * acquire sleeping locks. The same is true if the * current process has a mutex enqueued (blocked on * a PI chain). * * In !RT, it is always safe to call __put_task_struct(). * Though, in order to simplify the code, resort to the * deferred call too. * * call_rcu() will schedule __put_task_struct_rcu_cb() * to be called in process context. * * __put_task_struct() is called when * refcount_dec_and_test(&t->usage) succeeds. * * This means that it can't "conflict" with * put_task_struct_rcu_user() which abuses ->rcu the same * way; rcu_users has a reference so task->usage can't be * zero after rcu_users 1 -> 0 transition. * * delayed_free_task() also uses ->rcu, but it is only called * when it fails to fork a process. Therefore, there is no * way it can conflict with __put_task_struct(). */ call_rcu(&t->rcu, __put_task_struct_rcu_cb); } DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T)) static inline void put_task_struct_many(struct task_struct *t, int nr) { if (refcount_sub_and_test(nr, &t->usage)) __put_task_struct(t); } void put_task_struct_rcu_user(struct task_struct *task); /* Free all architecture-specific resources held by a thread. */ void release_thread(struct task_struct *dead_task); #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; #else # define arch_task_struct_size (sizeof(struct task_struct)) #endif #ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST /* * If an architecture has not declared a thread_struct whitelist we * must assume something there may need to be copied to userspace. */ static inline void arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { *offset = 0; /* Handle dynamically sized thread_struct. */ *size = arch_task_struct_size - offsetof(struct task_struct, thread); } #endif #ifdef CONFIG_VMAP_STACK static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) { return t->stack_vm_area; } #else static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) { return NULL; } #endif /* * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset and * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. * * Nests inside of read_lock(&tasklist_lock). It must not be nested with * write_lock_irq(&tasklist_lock), neither inside nor outside. */ static inline void task_lock(struct task_struct *p) __acquires(&p->alloc_lock) { spin_lock(&p->alloc_lock); } static inline void task_unlock(struct task_struct *p) __releases(&p->alloc_lock) { spin_unlock(&p->alloc_lock); } DEFINE_LOCK_GUARD_1(task_lock, struct task_struct, task_lock(_T->lock), task_unlock(_T->lock)) DECLARE_LOCK_GUARD_1_ATTRS(task_lock, __acquires(&_T->alloc_lock), __releases(&(*(struct task_struct **)_T)->alloc_lock)) #define class_task_lock_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(task_lock, _T) #endif /* _LINUX_SCHED_TASK_H */
3843 2725 1158 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __LICENSE_H #define __LICENSE_H static inline int license_is_gpl_compatible(const char *license) { return (strcmp(license, "GPL") == 0 || strcmp(license, "GPL v2") == 0 || strcmp(license, "GPL and additional rights") == 0 || strcmp(license, "Dual BSD/GPL") == 0 || strcmp(license, "Dual MIT/GPL") == 0 || strcmp(license, "Dual MPL/GPL") == 0); } #endif
6 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 // SPDX-License-Identifier: GPL-2.0+ /* * linux/fs/jbd2/revoke.c * * Written by Stephen C. Tweedie <sct@redhat.com>, 2000 * * Copyright 2000 Red Hat corp --- All Rights Reserved * * Journal revoke routines for the generic filesystem journaling code; * part of the ext2fs journaling system. * * Revoke is the mechanism used to prevent old log records for deleted * metadata from being replayed on top of newer data using the same * blocks. The revoke mechanism is used in two separate places: * * + Commit: during commit we write the entire list of the current * transaction's revoked blocks to the journal * * + Recovery: during recovery we record the transaction ID of all * revoked blocks. If there are multiple revoke records in the log * for a single block, only the last one counts, and if there is a log * entry for a block beyond the last revoke, then that log entry still * gets replayed. * * We can get interactions between revokes and new log data within a * single transaction: * * Block is revoked and then journaled: * The desired end result is the journaling of the new block, so we * cancel the revoke before the transaction commits. * * Block is journaled and then revoked: * The revoke must take precedence over the write of the block, so we * need either to cancel the journal entry or to write the revoke * later in the log than the log block. In this case, we choose the * latter: journaling a block cancels any revoke record for that block * in the current transaction, so any revoke for that block in the * transaction must have happened after the block was journaled and so * the revoke must take precedence. * * Block is revoked and then written as data: * The data write is allowed to succeed, but the revoke is _not_ * cancelled. We still need to prevent old log records from * overwriting the new data. We don't even need to clear the revoke * bit here. * * We cache revoke status of a buffer in the current transaction in b_states * bits. As the name says, revokevalid flag indicates that the cached revoke * status of a buffer is valid and we can rely on the cached status. * * Revoke information on buffers is a tri-state value: * * RevokeValid clear: no cached revoke status, need to look it up * RevokeValid set, Revoked clear: * buffer has not been revoked, and cancel_revoke * need do nothing. * RevokeValid set, Revoked set: * buffer has been revoked. * * Locking rules: * We keep two hash tables of revoke records. One hashtable belongs to the * running transaction (is pointed to by journal->j_revoke), the other one * belongs to the committing transaction. Accesses to the second hash table * happen only from the kjournald and no other thread touches this table. Also * journal_switch_revoke_table() which switches which hashtable belongs to the * running and which to the committing transaction is called only from * kjournald. Therefore we need no locks when accessing the hashtable belonging * to the committing transaction. * * All users operating on the hash table belonging to the running transaction * have a handle to the transaction. Therefore they are safe from kjournald * switching hash tables under them. For operations on the lists of entries in * the hash table j_revoke_lock is used. * * Finally, also replay code uses the hash tables but at this moment no one else * can touch them (filesystem isn't mounted yet) and hence no locking is * needed. */ #ifndef __KERNEL__ #include "jfs_user.h" #else #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/init.h> #include <linux/bio.h> #include <linux/log2.h> #include <linux/hash.h> #endif static struct kmem_cache *jbd2_revoke_record_cache; static struct kmem_cache *jbd2_revoke_table_cache; /* Each revoke record represents one single revoked block. During journal replay, this involves recording the transaction ID of the last transaction to revoke this block. */ struct jbd2_revoke_record_s { struct list_head hash; tid_t sequence; /* Used for recovery only */ unsigned long long blocknr; }; /* The revoke table is just a simple hash table of revoke records. */ struct jbd2_revoke_table_s { /* It is conceivable that we might want a larger hash table * for recovery. Must be a power of two. */ int hash_size; int hash_shift; struct list_head *hash_table; }; #ifdef __KERNEL__ static void write_one_revoke_record(transaction_t *, struct list_head *, struct buffer_head **, int *, struct jbd2_revoke_record_s *); static void flush_descriptor(journal_t *, struct buffer_head *, int); #endif /* Utility functions to maintain the revoke table */ static inline int hash(journal_t *journal, unsigned long long block) { return hash_64(block, journal->j_revoke->hash_shift); } static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, tid_t seq) { struct list_head *hash_list; struct jbd2_revoke_record_s *record; gfp_t gfp_mask = GFP_NOFS; if (journal_oom_retry) gfp_mask |= __GFP_NOFAIL; record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask); if (!record) return -ENOMEM; record->sequence = seq; record->blocknr = blocknr; hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; spin_lock(&journal->j_revoke_lock); list_add(&record->hash, hash_list); spin_unlock(&journal->j_revoke_lock); return 0; } /* Find a revoke record in the journal's hash table. */ static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, unsigned long long blocknr) { struct list_head *hash_list; struct jbd2_revoke_record_s *record; hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; spin_lock(&journal->j_revoke_lock); record = (struct jbd2_revoke_record_s *) hash_list->next; while (&(record->hash) != hash_list) { if (record->blocknr == blocknr) { spin_unlock(&journal->j_revoke_lock); return record; } record = (struct jbd2_revoke_record_s *) record->hash.next; } spin_unlock(&journal->j_revoke_lock); return NULL; } void jbd2_journal_destroy_revoke_record_cache(void) { kmem_cache_destroy(jbd2_revoke_record_cache); jbd2_revoke_record_cache = NULL; } void jbd2_journal_destroy_revoke_table_cache(void) { kmem_cache_destroy(jbd2_revoke_table_cache); jbd2_revoke_table_cache = NULL; } int __init jbd2_journal_init_revoke_record_cache(void) { J_ASSERT(!jbd2_revoke_record_cache); jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s, SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY); if (!jbd2_revoke_record_cache) { pr_emerg("JBD2: failed to create revoke_record cache\n"); return -ENOMEM; } return 0; } int __init jbd2_journal_init_revoke_table_cache(void) { J_ASSERT(!jbd2_revoke_table_cache); jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s, SLAB_TEMPORARY); if (!jbd2_revoke_table_cache) { pr_emerg("JBD2: failed to create revoke_table cache\n"); return -ENOMEM; } return 0; } struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) { int shift = 0; int tmp = hash_size; struct jbd2_revoke_table_s *table; table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL); if (!table) goto out; while((tmp >>= 1UL) != 0UL) shift++; table->hash_size = hash_size; table->hash_shift = shift; table->hash_table = kvmalloc_objs(struct list_head, hash_size); if (!table->hash_table) { kmem_cache_free(jbd2_revoke_table_cache, table); table = NULL; goto out; } for (tmp = 0; tmp < hash_size; tmp++) INIT_LIST_HEAD(&table->hash_table[tmp]); out: return table; } void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) { int i; struct list_head *hash_list; for (i = 0; i < table->hash_size; i++) { hash_list = &table->hash_table[i]; J_ASSERT(list_empty(hash_list)); } kvfree(table->hash_table); kmem_cache_free(jbd2_revoke_table_cache, table); } /* Initialise the revoke table for a given journal to a given size. */ int jbd2_journal_init_revoke(journal_t *journal, int hash_size) { J_ASSERT(journal->j_revoke_table[0] == NULL); J_ASSERT(is_power_of_2(hash_size)); journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size); if (!journal->j_revoke_table[0]) goto fail0; journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size); if (!journal->j_revoke_table[1]) goto fail1; journal->j_revoke = journal->j_revoke_table[1]; spin_lock_init(&journal->j_revoke_lock); return 0; fail1: jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); journal->j_revoke_table[0] = NULL; fail0: return -ENOMEM; } /* Destroy a journal's revoke table. The table must already be empty! */ void jbd2_journal_destroy_revoke(journal_t *journal) { journal->j_revoke = NULL; if (journal->j_revoke_table[0]) jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); if (journal->j_revoke_table[1]) jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]); } #ifdef __KERNEL__ /* * jbd2_journal_revoke: revoke a given buffer_head from the journal. This * prevents the block from being replayed during recovery if we take a * crash after this current transaction commits. Any subsequent * metadata writes of the buffer in this transaction cancel the * revoke. * * Note that this call may block --- it is up to the caller to make * sure that there are no further calls to journal_write_metadata * before the revoke is complete. In ext3, this implies calling the * revoke before clearing the block bitmap when we are deleting * metadata. * * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a * parameter, but does _not_ forget the buffer_head if the bh was only * found implicitly. * * bh_in may not be a journalled buffer - it may have come off * the hash tables without an attached journal_head. * * If bh_in is non-zero, jbd2_journal_revoke() will decrement its b_count * by one. */ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, struct buffer_head *bh_in) { struct buffer_head *bh = NULL; journal_t *journal; struct block_device *bdev; int err; might_sleep(); if (bh_in) BUFFER_TRACE(bh_in, "enter"); journal = handle->h_transaction->t_journal; if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){ J_ASSERT (!"Cannot set revoke feature!"); return -EINVAL; } bdev = journal->j_fs_dev; bh = bh_in; if (!bh) { bh = __find_get_block_nonatomic(bdev, blocknr, journal->j_blocksize); if (bh) BUFFER_TRACE(bh, "found on hash"); } #ifdef JBD2_EXPENSIVE_CHECKING else { struct buffer_head *bh2; /* If there is a different buffer_head lying around in * memory anywhere... */ bh2 = __find_get_block_nonatomic(bdev, blocknr, journal->j_blocksize); if (bh2) { /* ... and it has RevokeValid status... */ if (bh2 != bh && buffer_revokevalid(bh2)) /* ...then it better be revoked too, * since it's illegal to create a revoke * record against a buffer_head which is * not marked revoked --- that would * risk missing a subsequent revoke * cancel. */ J_ASSERT_BH(bh2, buffer_revoked(bh2)); put_bh(bh2); } } #endif if (WARN_ON_ONCE(handle->h_revoke_credits <= 0)) { if (!bh_in) brelse(bh); return -EIO; } /* We really ought not ever to revoke twice in a row without first having the revoke cancelled: it's illegal to free a block twice without allocating it in between! */ if (bh) { if (!J_EXPECT_BH(bh, !buffer_revoked(bh), "inconsistent data on disk")) { if (!bh_in) brelse(bh); return -EIO; } set_buffer_revoked(bh); set_buffer_revokevalid(bh); if (bh_in) { BUFFER_TRACE(bh_in, "call jbd2_journal_forget"); jbd2_journal_forget(handle, bh_in); } else { BUFFER_TRACE(bh, "call brelse"); __brelse(bh); } } handle->h_revoke_credits--; jbd2_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in); err = insert_revoke_hash(journal, blocknr, handle->h_transaction->t_tid); BUFFER_TRACE(bh_in, "exit"); return err; } /* * Cancel an outstanding revoke. For use only internally by the * journaling code (called from jbd2_journal_get_write_access). * * We trust buffer_revoked() on the buffer if the buffer is already * being journaled: if there is no revoke pending on the buffer, then we * don't do anything here. * * This would break if it were possible for a buffer to be revoked and * discarded, and then reallocated within the same transaction. In such * a case we would have lost the revoked bit, but when we arrived here * the second time we would still have a pending revoke to cancel. So, * do not trust the Revoked bit on buffers unless RevokeValid is also * set. */ void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) { struct jbd2_revoke_record_s *record; journal_t *journal = handle->h_transaction->t_journal; int need_cancel; struct buffer_head *bh = jh2bh(jh); jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh); /* Is the existing Revoke bit valid? If so, we trust it, and * only perform the full cancel if the revoke bit is set. If * not, we can't trust the revoke bit, and we need to do the * full search for a revoke record. */ if (test_set_buffer_revokevalid(bh)) { need_cancel = test_clear_buffer_revoked(bh); } else { need_cancel = 1; clear_buffer_revoked(bh); } if (need_cancel) { record = find_revoke_record(journal, bh->b_blocknr); if (record) { jbd2_debug(4, "cancelled existing revoke on " "blocknr %llu\n", (unsigned long long)bh->b_blocknr); spin_lock(&journal->j_revoke_lock); list_del(&record->hash); spin_unlock(&journal->j_revoke_lock); kmem_cache_free(jbd2_revoke_record_cache, record); } } #ifdef JBD2_EXPENSIVE_CHECKING /* There better not be one left behind by now! */ record = find_revoke_record(journal, bh->b_blocknr); J_ASSERT_JH(jh, record == NULL); #endif /* Finally, have we just cleared revoke on an unhashed * buffer_head? If so, we'd better make sure we clear the * revoked status on any hashed alias too, otherwise the revoke * state machine will get very upset later on. */ if (need_cancel) { struct buffer_head *bh2; bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr, bh->b_size); if (bh2) { if (bh2 != bh) clear_buffer_revoked(bh2); __brelse(bh2); } } } /* * jbd2_clear_buffer_revoked_flags clears revoked flag of buffers in * revoke table to reflect there is no revoked buffers in the next * transaction which is going to be started. */ void jbd2_clear_buffer_revoked_flags(journal_t *journal) { struct jbd2_revoke_table_s *revoke = journal->j_revoke; int i = 0; for (i = 0; i < revoke->hash_size; i++) { struct list_head *hash_list; struct list_head *list_entry; hash_list = &revoke->hash_table[i]; list_for_each(list_entry, hash_list) { struct jbd2_revoke_record_s *record; struct buffer_head *bh; record = (struct jbd2_revoke_record_s *)list_entry; bh = __find_get_block_nonatomic(journal->j_fs_dev, record->blocknr, journal->j_blocksize); if (bh) { clear_buffer_revoked(bh); __brelse(bh); } } } } /* jbd2_journal_switch_revoke_table table select j_revoke for next * transaction we do not want to suspend any processing until all * revokes are written -bzzz */ void jbd2_journal_switch_revoke_table(journal_t *journal) { int i; if (journal->j_revoke == journal->j_revoke_table[0]) journal->j_revoke = journal->j_revoke_table[1]; else journal->j_revoke = journal->j_revoke_table[0]; for (i = 0; i < journal->j_revoke->hash_size; i++) INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); } /* * Write revoke records to the journal for all entries in the current * revoke hash, deleting the entries as we go. */ void jbd2_journal_write_revoke_records(transaction_t *transaction, struct list_head *log_bufs) { journal_t *journal = transaction->t_journal; struct buffer_head *descriptor; struct jbd2_revoke_record_s *record; struct jbd2_revoke_table_s *revoke; struct list_head *hash_list; int i, offset, count; descriptor = NULL; offset = 0; count = 0; /* select revoke table for committing transaction */ revoke = journal->j_revoke == journal->j_revoke_table[0] ? journal->j_revoke_table[1] : journal->j_revoke_table[0]; for (i = 0; i < revoke->hash_size; i++) { hash_list = &revoke->hash_table[i]; while (!list_empty(hash_list)) { record = (struct jbd2_revoke_record_s *) hash_list->next; write_one_revoke_record(transaction, log_bufs, &descriptor, &offset, record); count++; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } if (descriptor) flush_descriptor(journal, descriptor, offset); jbd2_debug(1, "Wrote %d revoke records\n", count); } /* * Write out one revoke record. We need to create a new descriptor * block if the old one is full or if we have not already created one. */ static void write_one_revoke_record(transaction_t *transaction, struct list_head *log_bufs, struct buffer_head **descriptorp, int *offsetp, struct jbd2_revoke_record_s *record) { journal_t *journal = transaction->t_journal; int csum_size = 0; struct buffer_head *descriptor; int sz, offset; /* If we are already aborting, this all becomes a noop. We still need to go round the loop in jbd2_journal_write_revoke_records in order to free all of the revoke records: only the IO to the journal is omitted. */ if (is_journal_aborted(journal)) return; descriptor = *descriptorp; offset = *offsetp; /* Do we need to leave space at the end for a checksum? */ if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_block_tail); if (jbd2_has_feature_64bit(journal)) sz = 8; else sz = 4; /* Make sure we have a descriptor with space left for the record */ if (descriptor) { if (offset + sz > journal->j_blocksize - csum_size) { flush_descriptor(journal, descriptor, offset); descriptor = NULL; } } if (!descriptor) { descriptor = jbd2_journal_get_descriptor_buffer(transaction, JBD2_REVOKE_BLOCK); if (!descriptor) return; /* Record it so that we can wait for IO completion later */ BUFFER_TRACE(descriptor, "file in log_bufs"); jbd2_file_log_bh(log_bufs, descriptor); offset = sizeof(jbd2_journal_revoke_header_t); *descriptorp = descriptor; } if (jbd2_has_feature_64bit(journal)) * ((__be64 *)(&descriptor->b_data[offset])) = cpu_to_be64(record->blocknr); else * ((__be32 *)(&descriptor->b_data[offset])) = cpu_to_be32(record->blocknr); offset += sz; *offsetp = offset; } /* * Flush a revoke descriptor out to the journal. If we are aborting, * this is a noop; otherwise we are generating a buffer which needs to * be waited for during commit, so it has to go onto the appropriate * journal buffer list. */ static void flush_descriptor(journal_t *journal, struct buffer_head *descriptor, int offset) { jbd2_journal_revoke_header_t *header; if (is_journal_aborted(journal)) return; header = (jbd2_journal_revoke_header_t *)descriptor->b_data; header->r_count = cpu_to_be32(offset); jbd2_descriptor_block_csum_set(journal, descriptor); set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); write_dirty_buffer(descriptor, JBD2_JOURNAL_REQ_FLAGS); } #endif /* * Revoke support for recovery. * * Recovery needs to be able to: * * record all revoke records, including the tid of the latest instance * of each revoke in the journal * * check whether a given block in a given transaction should be replayed * (ie. has not been revoked by a revoke record in that or a subsequent * transaction) * * empty the revoke table after recovery. */ /* * First, setting revoke records. We create a new revoke record for * every block ever revoked in the log as we scan it for recovery, and * we update the existing records if we find multiple revokes for a * single block. */ int jbd2_journal_set_revoke(journal_t *journal, unsigned long long blocknr, tid_t sequence) { struct jbd2_revoke_record_s *record; record = find_revoke_record(journal, blocknr); if (record) { /* If we have multiple occurrences, only record the * latest sequence number in the hashed record */ if (tid_gt(sequence, record->sequence)) record->sequence = sequence; return 0; } return insert_revoke_hash(journal, blocknr, sequence); } /* * Test revoke records. For a given block referenced in the log, has * that block been revoked? A revoke record with a given transaction * sequence number revokes all blocks in that transaction and earlier * ones, but later transactions still need replayed. */ int jbd2_journal_test_revoke(journal_t *journal, unsigned long long blocknr, tid_t sequence) { struct jbd2_revoke_record_s *record; record = find_revoke_record(journal, blocknr); if (!record) return 0; if (tid_gt(sequence, record->sequence)) return 0; return 1; } /* * Finally, once recovery is over, we need to clear the revoke table so * that it can be reused by the running filesystem. */ void jbd2_journal_clear_revoke(journal_t *journal) { int i; struct list_head *hash_list; struct jbd2_revoke_record_s *record; struct jbd2_revoke_table_s *revoke; revoke = journal->j_revoke; for (i = 0; i < revoke->hash_size; i++) { hash_list = &revoke->hash_table[i]; while (!list_empty(hash_list)) { record = (struct jbd2_revoke_record_s*) hash_list->next; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } }
21 36 2 35 2 32 22 21 1 10 32 32 30 10 10 7 7 3 3 4 1 1 1 1 85 4 14 6 6 1 3 13 5 9 9 9 9 9 9 7 9 9 9 9 9 8 36 9 9 9 9 8 9 9 9 9 9 8 2 2 1 5 5 5 5 9 7 8 8 1 2 8 8 32 24 8 11 11 11 1 3 21 21 9 1 5 1 27 8 33 33 31 31 3 1 1 1 10 8 5 28 11 2 1 17 4 3 1 1 1 3 5 2 2 5 12 12 3 20 4 11 4 2 12 14 12 4 27 27 29 29 41 41 14 14 1 5 1144 1144 381 101 163 163 14 9 9 9 1 163 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io */ /* Devmaps primary use is as a backend map for XDP BPF helper call * bpf_redirect_map(). Because XDP is mostly concerned with performance we * spent some effort to ensure the datapath with redirect maps does not use * any locking. This is a quick note on the details. * * We have three possible paths to get into the devmap control plane bpf * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall * will invoke an update, delete, or lookup operation. To ensure updates and * deletes appear atomic from the datapath side xchg() is used to modify the * netdev_map array. Then because the datapath does a lookup into the netdev_map * array (read-only) from an RCU critical section we use call_rcu() to wait for * an rcu grace period before free'ing the old data structures. This ensures the * datapath always has a valid copy. However, the datapath does a "flush" * operation that pushes any pending packets in the driver outside the RCU * critical section. Each bpf_dtab_netdev tracks these pending operations using * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until * this list is empty, indicating outstanding flush operations have completed. * * BPF syscalls may race with BPF program calls on any of the update, delete * or lookup operations. As noted above the xchg() operation also keep the * netdev_map consistent in this case. From the devmap side BPF programs * calling into these operations are the same as multiple user space threads * making system calls. * * Finally, any of the above may race with a netdev_unregister notifier. The * unregister notifier must search for net devices in the map structure that * contain a reference to the net device and remove them. This is a two step * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b) * check to see if the ifindex is the same as the net_device being removed. * When removing the dev a cmpxchg() is used to ensure the correct dev is * removed, in the case of a concurrent update or delete operation it is * possible that the initially referenced dev is no longer in the map. As the * notifier hook walks the map we know that new dev references can not be * added by the user because core infrastructure ensures dev_get_by_index() * calls will fail at this point. * * The devmap_hash type is a map type which interprets keys as ifindexes and * indexes these using a hashmap. This allows maps that use ifindex as key to be * densely packed instead of having holes in the lookup array for unused * ifindexes. The setup and packet enqueue/send code is shared between the two * types of devmap; only the lookup and insertion is different. */ #include <linux/bpf.h> #include <linux/local_lock.h> #include <net/xdp.h> #include <linux/filter.h> #include <trace/events/xdp.h> #include <linux/btf_ids.h> #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) struct xdp_dev_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; struct list_head flush_node; struct net_device *dev; struct net_device *dev_rx; struct bpf_prog *xdp_prog; unsigned int count; local_lock_t bq_lock; }; struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; struct bpf_devmap_val val; }; struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */ struct list_head list; /* these are only used for DEVMAP_HASH type maps */ struct hlist_head *dev_index_head; spinlock_t index_lock; unsigned int items; u32 n_buckets; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); static struct hlist_head *dev_map_create_hash(unsigned int entries, int numa_node) { int i; struct hlist_head *hash; hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node); if (hash != NULL) for (i = 0; i < entries; i++) INIT_HLIST_HEAD(&hash[i]); return hash; } static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, int idx) { return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; } static int dev_map_alloc_check(union bpf_attr *attr) { u32 valsize = attr->value_size; /* check sanity of attributes. 2 value sizes supported: * 4 bytes: ifindex * 8 bytes: ifindex + prog fd */ if (attr->max_entries == 0 || attr->key_size != 4 || (valsize != offsetofend(struct bpf_devmap_val, ifindex) && valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { /* Hash table size must be power of 2; roundup_pow_of_two() * can overflow into UB on 32-bit arches */ if (attr->max_entries > 1UL << 31) return -EINVAL; } return 0; } static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { /* Lookup returns a pointer straight to dev->ifindex, so make sure the * verifier prevents writes from the BPF side */ attr->map_flags |= BPF_F_RDONLY_PROG; bpf_map_init_from_attr(&dtab->map, attr); if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { /* Hash table size must be power of 2 */ dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, dtab->map.numa_node); if (!dtab->dev_index_head) return -ENOMEM; spin_lock_init(&dtab->index_lock); } else { dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) return -ENOMEM; } return 0; } static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; int err; dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE); if (!dtab) return ERR_PTR(-ENOMEM); err = dev_map_init_map(dtab, attr); if (err) { bpf_map_area_free(dtab); return ERR_PTR(err); } spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); return &dtab->map; } static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u32 i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were * disconnected from events. The following synchronize_rcu() guarantees * both rcu read critical sections complete and waits for * preempt-disable regions (NAPI being the relevant context here) so we * are certain there will be no further reads against the netdev_map and * all flush operations are complete. Flush operations can only be done * from NAPI context for this reason. */ spin_lock(&dev_map_lock); list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); /* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map() * during NAPI callback and cleared after the XDP redirect. There is no * explicit RCU read section which protects bpf_redirect_info->map but * local_bh_disable() also marks the beginning an RCU section. This * makes the complete softirq callback RCU protected. Thus after * following synchronize_rcu() there no bpf_redirect_info->map == map * assignment. */ synchronize_rcu(); /* Make sure prior __dev_map_entry_free() have completed. */ rcu_barrier(); if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; struct hlist_head *head; struct hlist_node *next; head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); if (dev->xdp_prog) bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } } bpf_map_area_free(dtab->dev_index_head); } else { for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; dev = rcu_dereference_raw(dtab->netdev_map[i]); if (!dev) continue; if (dev->xdp_prog) bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } bpf_map_area_free(dtab->netdev_map); } bpf_map_area_free(dtab); } static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = next_key; if (index >= dtab->map.max_entries) { *next = 0; return 0; } if (index == dtab->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct hlist_head *head = dev_map_index_hash(dtab, key); struct bpf_dtab_netdev *dev; hlist_for_each_entry_rcu(dev, head, index_hlist, lockdep_is_held(&dtab->index_lock)) if (dev->idx == key) return dev; return NULL; } static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u32 idx, *next = next_key; struct bpf_dtab_netdev *dev, *next_dev; struct hlist_head *head; int i = 0; if (!key) goto find_first; idx = *(u32 *)key; dev = __dev_map_hash_lookup_elem(map, idx); if (!dev) goto find_first; next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)), struct bpf_dtab_netdev, index_hlist); if (next_dev) { *next = next_dev->idx; return 0; } i = idx & (dtab->n_buckets - 1); i++; find_first: for (; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), struct bpf_dtab_netdev, index_hlist); if (next_dev) { *next = next_dev->idx; return 0; } } return -ENOENT; } static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, struct xdp_frame **frames, int n, struct net_device *tx_dev, struct net_device *rx_dev) { struct xdp_txq_info txq = { .dev = tx_dev }; struct xdp_rxq_info rxq = { .dev = rx_dev }; struct xdp_buff xdp; int i, nframes = 0; for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; u32 act; int err; xdp_convert_frame_to_buff(xdpf, &xdp); xdp.txq = &txq; xdp.rxq = &rxq; act = bpf_prog_run_xdp(xdp_prog, &xdp); switch (act) { case XDP_PASS: err = xdp_update_frame_from_buff(&xdp, xdpf); if (unlikely(err < 0)) xdp_return_frame_rx_napi(xdpf); else frames[nframes++] = xdpf; break; default: bpf_warn_invalid_xdp_action(NULL, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(tx_dev, xdp_prog, act); fallthrough; case XDP_DROP: xdp_return_frame_rx_napi(xdpf); break; } } return nframes; /* sent frames count */ } static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; unsigned int cnt = bq->count; int sent = 0, err = 0; int to_send = cnt; int i; lockdep_assert_held(&bq->bq_lock); if (unlikely(!cnt)) return; for (i = 0; i < cnt; i++) { struct xdp_frame *xdpf = bq->q[i]; prefetch(xdpf); } if (bq->xdp_prog) { to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx); if (!to_send) goto out; } sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags); if (sent < 0) { /* If ndo_xdp_xmit fails with an errno, no frames have * been xmit'ed. */ err = sent; sent = 0; } /* If not all frames have been transmitted, it is our * responsibility to free them */ for (i = sent; unlikely(i < to_send); i++) xdp_return_frame_rx_napi(bq->q[i]); out: bq->count = 0; trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err); } /* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the * driver before returning from its napi->poll() routine. See the comment above * xdp_do_flush() in filter.c. */ void __dev_flush(struct list_head *flush_list) { struct xdp_dev_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { local_lock_nested_bh(&bq->dev->xdp_bulkq->bq_lock); bq_xmit_all(bq, XDP_XMIT_FLUSH); bq->dev_rx = NULL; bq->xdp_prog = NULL; __list_del_clearprev(&bq->flush_node); local_unlock_nested_bh(&bq->dev->xdp_bulkq->bq_lock); } } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *obj; if (key >= map->max_entries) return NULL; obj = rcu_dereference_check(dtab->netdev_map[key], rcu_read_lock_bh_held()); return obj; } /* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu * variable access, and map elements stick around. See comment above * xdp_do_flush() in filter.c. PREEMPT_RT relies on local_lock_nested_bh() * to serialise access to the per-CPU bq. */ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { struct xdp_dev_bulk_queue *bq; local_lock_nested_bh(&dev->xdp_bulkq->bq_lock); bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) bq_xmit_all(bq, 0); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed * from net_device drivers NAPI func end. * * Do the same with xdp_prog and flush_list since these fields * are only ever modified together. */ if (!bq->dev_rx) { struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list(); bq->dev_rx = dev_rx; bq->xdp_prog = xdp_prog; list_add(&bq->flush_node, flush_list); } bq->q[bq->count++] = xdpf; local_unlock_nested_bh(&dev->xdp_bulkq->bq_lock); } static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { int err; if (!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT)) return -EOPNOTSUPP; if (unlikely(!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) && xdp_frame_has_frags(xdpf))) return -EOPNOTSUPP; err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf)); if (unlikely(err)) return err; bq_enqueue(dev, xdpf, dev_rx, xdp_prog); return 0; } static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst) { struct xdp_txq_info txq = { .dev = dst->dev }; struct xdp_buff xdp; u32 act; if (!dst->xdp_prog) return XDP_PASS; __skb_pull(skb, skb->mac_len); xdp.txq = &txq; act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog); switch (act) { case XDP_PASS: __skb_push(skb, skb->mac_len); break; default: bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(dst->dev, dst->xdp_prog, act); fallthrough; case XDP_DROP: kfree_skb(skb); break; } return act; } int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { return __xdp_enqueue(dev, xdpf, dev_rx, NULL); } int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { struct net_device *dev = dst->dev; return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog); } static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) { if (!obj) return false; if (!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT)) return false; if (unlikely(!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) && xdp_frame_has_frags(xdpf))) return false; if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf))) return false; return true; } static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj, struct net_device *dev_rx, struct xdp_frame *xdpf) { struct xdp_frame *nxdpf; nxdpf = xdpf_clone(xdpf); if (!nxdpf) return -ENOMEM; bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog); return 0; } static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex) { while (num_excluded--) { if (ifindex == excluded[num_excluded]) return true; } return false; } /* Get ifindex of each upper device. 'indexes' must be able to hold at * least 'max' elements. * Returns the number of ifindexes added, or -EOVERFLOW if there are too * many upper devices. */ static int get_upper_ifindexes(struct net_device *dev, int *indexes, int max) { struct net_device *upper; struct list_head *iter; int n = 0; netdev_for_each_upper_dev_rcu(dev, upper, iter) { if (n >= max) return -EOVERFLOW; indexes[n++] = upper->ifindex; } return n; } int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; int num_excluded = 0; unsigned int i; int err; if (exclude_ingress) { num_excluded = get_upper_ifindexes(dev_rx, excluded_devices, ARRAY_SIZE(excluded_devices) - 1); if (num_excluded < 0) return num_excluded; excluded_devices[num_excluded++] = dev_rx->ifindex; } if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); if (err) return err; last_dst = dst; } } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ for (i = 0; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); hlist_for_each_entry_rcu(dst, head, index_hlist, lockdep_is_held(&dtab->index_lock)) { if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); if (err) return err; last_dst = dst; } } } /* consume the last copy of the frame */ if (last_dst) bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog); else xdp_return_frame_rx_napi(xdpf); /* dtab is empty */ return 0; } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, const struct bpf_prog *xdp_prog) { int err; err = xdp_ok_fwd_dev(dst->dev, skb->len); if (unlikely(err)) return err; /* Redirect has already succeeded semantically at this point, so we just * return 0 even if packet is dropped. Helper below takes care of * freeing skb. */ if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS) return 0; skb->dev = dst->dev; generic_xdp_tx(skb, xdp_prog); return 0; } static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, struct sk_buff *skb, const struct bpf_prog *xdp_prog) { struct sk_buff *nskb; int err; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return -ENOMEM; err = dev_map_generic_redirect(dst, nskb, xdp_prog); if (unlikely(err)) { consume_skb(nskb); return err; } return 0; } int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, const struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; struct hlist_node *next; int num_excluded = 0; unsigned int i; int err; if (exclude_ingress) { num_excluded = get_upper_ifindexes(dev, excluded_devices, ARRAY_SIZE(excluded_devices) - 1); if (num_excluded < 0) return num_excluded; excluded_devices[num_excluded++] = dev->ifindex; } if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); if (!dst) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_redirect_clone(last_dst, skb, xdp_prog); if (err) return err; last_dst = dst; } } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ for (i = 0; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dst, next, head, index_hlist) { if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_redirect_clone(last_dst, skb, xdp_prog); if (err) return err; last_dst = dst; } } } /* consume the first skb and return */ if (last_dst) return dev_map_generic_redirect(last_dst, skb, xdp_prog); /* dtab is empty */ consume_skb(skb); return 0; } static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); return obj ? &obj->val : NULL; } static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, *(u32 *)key); return obj ? &obj->val : NULL; } static void __dev_map_entry_free(struct rcu_head *rcu) { struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); if (dev->xdp_prog) bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } static long dev_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; u32 k = *(u32 *)key; if (k >= map->max_entries) return -EINVAL; old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL)); if (old_dev) { call_rcu(&old_dev->rcu, __dev_map_entry_free); atomic_dec((atomic_t *)&dtab->items); } return 0; } static long dev_map_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; u32 k = *(u32 *)key; unsigned long flags; int ret = -ENOENT; spin_lock_irqsave(&dtab->index_lock, flags); old_dev = __dev_map_hash_lookup_elem(map, k); if (old_dev) { dtab->items--; hlist_del_init_rcu(&old_dev->index_hlist); call_rcu(&old_dev->rcu, __dev_map_entry_free); ret = 0; } spin_unlock_irqrestore(&dtab->index_lock, flags); return ret; } static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab *dtab, struct bpf_devmap_val *val, unsigned int idx) { struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev), GFP_NOWAIT, dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); dev->dev = dev_get_by_index(net, val->ifindex); if (!dev->dev) goto err_out; if (val->bpf_prog.fd > 0) { prog = bpf_prog_get_type_dev(val->bpf_prog.fd, BPF_PROG_TYPE_XDP, false); if (IS_ERR(prog)) goto err_put_dev; if (prog->expected_attach_type != BPF_XDP_DEVMAP || !bpf_prog_map_compatible(&dtab->map, prog)) goto err_put_prog; } dev->idx = idx; if (prog) { dev->xdp_prog = prog; dev->val.bpf_prog.id = prog->aux->id; } else { dev->xdp_prog = NULL; dev->val.bpf_prog.id = 0; } dev->val.ifindex = val->ifindex; return dev; err_put_prog: bpf_prog_put(prog); err_put_dev: dev_put(dev->dev); err_out: kfree(dev); return ERR_PTR(-EINVAL); } static long __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; struct bpf_devmap_val val = {}; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; if (unlikely(i >= dtab->map.max_entries)) return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; /* already verified value_size <= sizeof val */ memcpy(&val, value, map->value_size); if (!val.ifindex) { dev = NULL; /* can not specify fd if ifindex is 0 */ if (val.bpf_prog.fd > 0) return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) return PTR_ERR(dev); } /* Use call_rcu() here to ensure rcu critical sections have completed * Remembering the driver side flush operation will happen before the * net device is removed. */ old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev))); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); else atomic_inc((atomic_t *)&dtab->items); return 0; } static long dev_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __dev_map_update_elem(current->nsproxy->net_ns, map, key, value, map_flags); } static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; struct bpf_devmap_val val = {}; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; /* already verified value_size <= sizeof val */ memcpy(&val, value, map->value_size); if (unlikely(map_flags > BPF_EXIST || !val.ifindex)) return -EINVAL; spin_lock_irqsave(&dtab->index_lock, flags); old_dev = __dev_map_hash_lookup_elem(map, idx); if (old_dev && (map_flags & BPF_NOEXIST)) goto out_err; dev = __dev_map_alloc_node(net, dtab, &val, idx); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out_err; } if (old_dev) { hlist_del_rcu(&old_dev->index_hlist); } else { if (dtab->items >= dtab->map.max_entries) { spin_unlock_irqrestore(&dtab->index_lock, flags); call_rcu(&dev->rcu, __dev_map_entry_free); return -E2BIG; } dtab->items++; } hlist_add_head_rcu(&dev->index_hlist, dev_map_index_hash(dtab, idx)); spin_unlock_irqrestore(&dtab->index_lock, flags); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); return 0; out_err: spin_unlock_irqrestore(&dtab->index_lock, flags); return err; } static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __dev_map_hash_update_elem(current->nsproxy->net_ns, map, key, value, map_flags); } static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_lookup_elem); } static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_hash_lookup_elem); } static u64 dev_map_mem_usage(const struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u64 usage = sizeof(struct bpf_dtab); if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) usage += (u64)dtab->n_buckets * sizeof(struct hlist_head); else usage += (u64)map->max_entries * sizeof(struct bpf_dtab_netdev *); usage += atomic_read((atomic_t *)&dtab->items) * (u64)sizeof(struct bpf_dtab_netdev); return usage; } BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab) const struct bpf_map_ops dev_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = dev_map_alloc_check, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_get_next_key, .map_lookup_elem = dev_map_lookup_elem, .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, .map_check_btf = map_check_no_btf, .map_mem_usage = dev_map_mem_usage, .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_map_redirect, }; const struct bpf_map_ops dev_map_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = dev_map_alloc_check, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_hash_get_next_key, .map_lookup_elem = dev_map_hash_lookup_elem, .map_update_elem = dev_map_hash_update_elem, .map_delete_elem = dev_map_hash_delete_elem, .map_check_btf = map_check_no_btf, .map_mem_usage = dev_map_mem_usage, .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_hash_map_redirect, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, struct net_device *netdev) { unsigned long flags; u32 i; spin_lock_irqsave(&dtab->index_lock, flags); for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; struct hlist_head *head; struct hlist_node *next; head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dev, next, head, index_hlist) { if (netdev != dev->dev) continue; dtab->items--; hlist_del_rcu(&dev->index_hlist); call_rcu(&dev->rcu, __dev_map_entry_free); } } spin_unlock_irqrestore(&dtab->index_lock, flags); } static int dev_map_notification(struct notifier_block *notifier, ulong event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct bpf_dtab *dtab; int i, cpu; switch (event) { case NETDEV_REGISTER: if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq) break; /* will be freed in free_netdev() */ netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue); if (!netdev->xdp_bulkq) return NOTIFY_BAD; for_each_possible_cpu(cpu) { struct xdp_dev_bulk_queue *bq; bq = per_cpu_ptr(netdev->xdp_bulkq, cpu); bq->dev = netdev; local_lock_init(&bq->bq_lock); } break; case NETDEV_UNREGISTER: /* This rcu_read_lock/unlock pair is needed because * dev_map_list is an RCU list AND to ensure a delete * operation does not free a netdev_map entry while we * are comparing it against the netdev being unregistered. */ rcu_read_lock(); list_for_each_entry_rcu(dtab, &dev_map_list, list) { if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dev_map_hash_remove_netdev(dtab, netdev); continue; } for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev, *odev; dev = rcu_dereference(dtab->netdev_map[i]); if (!dev || netdev != dev->dev) continue; odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL)); if (dev == odev) { call_rcu(&dev->rcu, __dev_map_entry_free); atomic_dec((atomic_t *)&dtab->items); } } } rcu_read_unlock(); break; default: break; } return NOTIFY_OK; } static struct notifier_block dev_map_notifier = { .notifier_call = dev_map_notification, }; static int __init dev_map_init(void) { /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); return 0; } subsys_initcall(dev_map_init);
31 3812 2179 10 3811 12 3795 20 3761 3784 14 1114 229 2 26 38 66 66 4 29 35 4 66 3767 1332 2593 1288 61 1288 1279 4 10 3763 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/realpath.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/magic.h> #include <linux/proc_fs.h> /** * tomoyo_encode2 - Encode binary string to ascii string. * * @str: String in binary format. * @str_len: Size of @str in byte. * * Returns pointer to @str in ascii format on success, NULL otherwise. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ char *tomoyo_encode2(const char *str, int str_len) { int i; int len = 0; const char *p = str; char *cp; char *cp0; if (!p) return NULL; for (i = 0; i < str_len; i++) { const unsigned char c = p[i]; if (c == '\\') len += 2; else if (c > ' ' && c < 127) len++; else len += 4; } len++; /* Reserve space for appending "/". */ cp = kzalloc(len + 10, GFP_NOFS); if (!cp) return NULL; cp0 = cp; p = str; for (i = 0; i < str_len; i++) { const unsigned char c = p[i]; if (c == '\\') { *cp++ = '\\'; *cp++ = '\\'; } else if (c > ' ' && c < 127) { *cp++ = c; } else { *cp++ = '\\'; *cp++ = (c >> 6) + '0'; *cp++ = ((c >> 3) & 7) + '0'; *cp++ = (c & 7) + '0'; } } return cp0; } /** * tomoyo_encode - Encode binary string to ascii string. * * @str: String in binary format. * * Returns pointer to @str in ascii format on success, NULL otherwise. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ char *tomoyo_encode(const char *str) { return str ? tomoyo_encode2(str, strlen(str)) : NULL; } /** * tomoyo_get_absolute_path - Get the path of a dentry but ignores chroot'ed root. * * @path: Pointer to "struct path". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. * * If dentry is a directory, trailing '/' is appended. */ static char *tomoyo_get_absolute_path(const struct path *path, char * const buffer, const int buflen) { char *pos = ERR_PTR(-ENOMEM); if (buflen >= 256) { /* go to whatever namespace root we are under */ pos = d_absolute_path(path, buffer, buflen - 1); if (!IS_ERR(pos) && *pos == '/' && pos[1]) { struct inode *inode = d_backing_inode(path->dentry); if (inode && S_ISDIR(inode->i_mode)) { buffer[buflen - 2] = '/'; buffer[buflen - 1] = '\0'; } } } return pos; } /** * tomoyo_get_dentry_path - Get the path of a dentry. * * @dentry: Pointer to "struct dentry". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. * * If dentry is a directory, trailing '/' is appended. */ static char *tomoyo_get_dentry_path(struct dentry *dentry, char * const buffer, const int buflen) { char *pos = ERR_PTR(-ENOMEM); if (buflen >= 256) { pos = dentry_path_raw(dentry, buffer, buflen - 1); if (!IS_ERR(pos) && *pos == '/' && pos[1]) { struct inode *inode = d_backing_inode(dentry); if (inode && S_ISDIR(inode->i_mode)) { buffer[buflen - 2] = '/'; buffer[buflen - 1] = '\0'; } } } return pos; } /** * tomoyo_get_local_path - Get the path of a dentry. * * @dentry: Pointer to "struct dentry". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. */ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer, const int buflen) { struct super_block *sb = dentry->d_sb; char *pos = tomoyo_get_dentry_path(dentry, buffer, buflen); if (IS_ERR(pos)) return pos; /* Convert from $PID to self if $PID is current thread. */ if (sb->s_magic == PROC_SUPER_MAGIC && *pos == '/') { char *ep; const pid_t pid = (pid_t) simple_strtoul(pos + 1, &ep, 10); struct pid_namespace *proc_pidns = proc_pid_ns(sb); if (*ep == '/' && pid && pid == task_tgid_nr_ns(current, proc_pidns)) { pos = ep - 5; if (pos < buffer) goto out; memmove(pos, "/self", 5); } goto prepend_filesystem_name; } /* Use filesystem name for unnamed devices. */ if (!MAJOR(sb->s_dev)) goto prepend_filesystem_name; { struct inode *inode = d_backing_inode(sb->s_root); /* * Use filesystem name if filesystem does not support rename() * operation. */ if (!inode->i_op->rename) goto prepend_filesystem_name; } /* Prepend device name. */ { char name[64]; int name_len; const dev_t dev = sb->s_dev; name[sizeof(name) - 1] = '\0'; snprintf(name, sizeof(name) - 1, "dev(%u,%u):", MAJOR(dev), MINOR(dev)); name_len = strlen(name); pos -= name_len; if (pos < buffer) goto out; memmove(pos, name, name_len); return pos; } /* Prepend filesystem name. */ prepend_filesystem_name: { const char *name = sb->s_type->name; const int name_len = strlen(name); pos -= name_len + 1; if (pos < buffer) goto out; memmove(pos, name, name_len); pos[name_len] = ':'; } return pos; out: return ERR_PTR(-ENOMEM); } /** * tomoyo_realpath_from_path - Returns realpath(3) of the given pathname but ignores chroot'ed root. * * @path: Pointer to "struct path". * * Returns the realpath of the given @path on success, NULL otherwise. * * If dentry is a directory, trailing '/' is appended. * Characters out of 0x20 < c < 0x7F range are converted to * \ooo style octal string. * Character \ is converted to \\ string. * * These functions use kzalloc(), so the caller must call kfree() * if these functions didn't return NULL. */ char *tomoyo_realpath_from_path(const struct path *path) { char *buf = NULL; char *name = NULL; unsigned int buf_len = PAGE_SIZE / 2; struct dentry *dentry = path->dentry; struct super_block *sb = dentry->d_sb; while (1) { char *pos; struct inode *inode; buf_len <<= 1; kfree(buf); buf = kmalloc(buf_len, GFP_NOFS); if (!buf) break; /* To make sure that pos is '\0' terminated. */ buf[buf_len - 1] = '\0'; /* For "pipe:[\$]" and "socket:[\$]". */ if (dentry->d_op && dentry->d_op->d_dname) { pos = dentry->d_op->d_dname(dentry, buf, buf_len - 1); goto encode; } inode = d_backing_inode(sb->s_root); /* * Get local name for filesystems without rename() operation */ if ((!inode->i_op->rename && !(sb->s_type->fs_flags & FS_REQUIRES_DEV))) pos = tomoyo_get_local_path(path->dentry, buf, buf_len - 1); /* Get absolute name for the rest. */ else { pos = tomoyo_get_absolute_path(path, buf, buf_len - 1); /* * Fall back to local name if absolute name is not * available. */ if (pos == ERR_PTR(-EINVAL)) pos = tomoyo_get_local_path(path->dentry, buf, buf_len - 1); } encode: if (IS_ERR(pos)) continue; name = tomoyo_encode(pos); break; } kfree(buf); if (!name) tomoyo_warn_oom(__func__); return name; } /** * tomoyo_realpath_nofollow - Get realpath of a pathname. * * @pathname: The pathname to solve. * * Returns the realpath of @pathname on success, NULL otherwise. */ char *tomoyo_realpath_nofollow(const char *pathname) { struct path path; if (pathname && kern_path(pathname, 0, &path) == 0) { char *buf = tomoyo_realpath_from_path(&path); path_put(&path); return buf; } return NULL; }
6 6 6 6 7 6 2 2 2 2 2 8 8 8 18 18 18 9 6 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "originator.h" #include "main.h" #include <linux/atomic.h> #include <linux/container_of.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_vlan.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/workqueue.h> #include <uapi/linux/batadv_packet.h> #include "distributed-arp-table.h" #include "fragmentation.h" #include "gateway_client.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "multicast.h" #include "netlink.h" #include "routing.h" #include "translation-table.h" /* hash class keys */ static struct lock_class_key batadv_orig_hash_lock_class_key; /** * batadv_orig_hash_find() - Find and return originator from orig_hash * @bat_priv: the bat priv with all the mesh interface information * @data: mac address of the originator * * Return: orig_node (with increased refcnt), NULL on errors */ struct batadv_orig_node * batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; struct batadv_orig_node *orig_node, *orig_node_tmp = NULL; int index; if (!hash) return NULL; index = batadv_choose_orig(data, hash->size); head = &hash->table[index]; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { if (!batadv_compare_eth(orig_node, data)) continue; if (!kref_get_unless_zero(&orig_node->refcount)) continue; orig_node_tmp = orig_node; break; } rcu_read_unlock(); return orig_node_tmp; } static void batadv_purge_orig(struct work_struct *work); /** * batadv_compare_orig() - comparing function used in the originator hash table * @node: node in the local table * @data2: second object to compare the node to * * Return: true if they are the same originator */ bool batadv_compare_orig(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_orig_node, hash_entry); return batadv_compare_eth(data1, data2); } /** * batadv_orig_node_vlan_get() - get an orig_node_vlan object * @orig_node: the originator serving the VLAN * @vid: the VLAN identifier * * Return: the vlan object identified by vid and belonging to orig_node or NULL * if it does not exist. */ struct batadv_orig_node_vlan * batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, unsigned short vid) { struct batadv_orig_node_vlan *vlan = NULL, *tmp; rcu_read_lock(); hlist_for_each_entry_rcu(tmp, &orig_node->vlan_list, list) { if (tmp->vid != vid) continue; if (!kref_get_unless_zero(&tmp->refcount)) continue; vlan = tmp; break; } rcu_read_unlock(); return vlan; } /** * batadv_vlan_id_valid() - check if vlan id is in valid batman-adv encoding * @vid: the VLAN identifier * * Return: true when either no vlan is set or if VLAN is in correct range, * false otherwise */ static bool batadv_vlan_id_valid(unsigned short vid) { unsigned short non_vlan = vid & ~(BATADV_VLAN_HAS_TAG | VLAN_VID_MASK); if (vid == 0) return true; if (!(vid & BATADV_VLAN_HAS_TAG)) return false; if (non_vlan) return false; return true; } /** * batadv_orig_node_vlan_new() - search and possibly create an orig_node_vlan * object * @orig_node: the originator serving the VLAN * @vid: the VLAN identifier * * Return: NULL in case of failure or the vlan object identified by vid and * belonging to orig_node otherwise. The object is created and added to the list * if it does not exist. * * The object is returned with refcounter increased by 1. */ struct batadv_orig_node_vlan * batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, unsigned short vid) { struct batadv_orig_node_vlan *vlan; if (!batadv_vlan_id_valid(vid)) return NULL; spin_lock_bh(&orig_node->vlan_list_lock); /* first look if an object for this vid already exists */ vlan = batadv_orig_node_vlan_get(orig_node, vid); if (vlan) goto out; vlan = kzalloc_obj(*vlan, GFP_ATOMIC); if (!vlan) goto out; kref_init(&vlan->refcount); vlan->vid = vid; kref_get(&vlan->refcount); hlist_add_head_rcu(&vlan->list, &orig_node->vlan_list); out: spin_unlock_bh(&orig_node->vlan_list_lock); return vlan; } /** * batadv_orig_node_vlan_release() - release originator-vlan object from lists * and queue for free after rcu grace period * @ref: kref pointer of the originator-vlan object */ void batadv_orig_node_vlan_release(struct kref *ref) { struct batadv_orig_node_vlan *orig_vlan; orig_vlan = container_of(ref, struct batadv_orig_node_vlan, refcount); kfree_rcu(orig_vlan, rcu); } /** * batadv_originator_init() - Initialize all originator structures * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success or negative error number in case of failure */ int batadv_originator_init(struct batadv_priv *bat_priv) { if (bat_priv->orig_hash) return 0; bat_priv->orig_hash = batadv_hash_new(1024); if (!bat_priv->orig_hash) goto err; batadv_hash_set_lock_class(bat_priv->orig_hash, &batadv_orig_hash_lock_class_key); INIT_DELAYED_WORK(&bat_priv->orig_work, batadv_purge_orig); queue_delayed_work(batadv_event_workqueue, &bat_priv->orig_work, msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD)); return 0; err: return -ENOMEM; } /** * batadv_neigh_ifinfo_release() - release neigh_ifinfo from lists and queue for * free after rcu grace period * @ref: kref pointer of the neigh_ifinfo */ void batadv_neigh_ifinfo_release(struct kref *ref) { struct batadv_neigh_ifinfo *neigh_ifinfo; neigh_ifinfo = container_of(ref, struct batadv_neigh_ifinfo, refcount); if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT) batadv_hardif_put(neigh_ifinfo->if_outgoing); kfree_rcu(neigh_ifinfo, rcu); } /** * batadv_hardif_neigh_release() - release hardif neigh node from lists and * queue for free after rcu grace period * @ref: kref pointer of the neigh_node */ void batadv_hardif_neigh_release(struct kref *ref) { struct batadv_hardif_neigh_node *hardif_neigh; hardif_neigh = container_of(ref, struct batadv_hardif_neigh_node, refcount); spin_lock_bh(&hardif_neigh->if_incoming->neigh_list_lock); hlist_del_init_rcu(&hardif_neigh->list); spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock); batadv_hardif_put(hardif_neigh->if_incoming); kfree_rcu(hardif_neigh, rcu); } /** * batadv_neigh_node_release() - release neigh_node from lists and queue for * free after rcu grace period * @ref: kref pointer of the neigh_node */ void batadv_neigh_node_release(struct kref *ref) { struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; neigh_node = container_of(ref, struct batadv_neigh_node, refcount); hlist_for_each_entry_safe(neigh_ifinfo, node_tmp, &neigh_node->ifinfo_list, list) { batadv_neigh_ifinfo_put(neigh_ifinfo); } batadv_hardif_neigh_put(neigh_node->hardif_neigh); batadv_hardif_put(neigh_node->if_incoming); kfree_rcu(neigh_node, rcu); } /** * batadv_orig_router_get() - router to the originator depending on iface * @orig_node: the orig node for the router * @if_outgoing: the interface where the payload packet has been received or * the OGM should be sent to * * Return: the neighbor which should be the router for this orig_node/iface. * * The object is returned with refcounter increased by 1. */ struct batadv_neigh_node * batadv_orig_router_get(struct batadv_orig_node *orig_node, const struct batadv_hard_iface *if_outgoing) { struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_node *router = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(orig_ifinfo, &orig_node->ifinfo_list, list) { if (orig_ifinfo->if_outgoing != if_outgoing) continue; router = rcu_dereference(orig_ifinfo->router); break; } if (router && !kref_get_unless_zero(&router->refcount)) router = NULL; rcu_read_unlock(); return router; } /** * batadv_orig_to_router() - get next hop neighbor to an orig address * @bat_priv: the bat priv with all the mesh interface information * @orig_addr: the originator MAC address to search the best next hop router for * @if_outgoing: the interface where the payload packet has been received or * the OGM should be sent to * * Return: A neighbor node which is the best router towards the given originator * address. */ struct batadv_neigh_node * batadv_orig_to_router(struct batadv_priv *bat_priv, u8 *orig_addr, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_node *neigh_node; struct batadv_orig_node *orig_node; orig_node = batadv_orig_hash_find(bat_priv, orig_addr); if (!orig_node) return NULL; neigh_node = batadv_find_router(bat_priv, orig_node, if_outgoing); batadv_orig_node_put(orig_node); return neigh_node; } /** * batadv_orig_ifinfo_get() - find the ifinfo from an orig_node * @orig_node: the orig node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * * Return: the requested orig_ifinfo or NULL if not found. * * The object is returned with refcounter increased by 1. */ struct batadv_orig_ifinfo * batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing) { struct batadv_orig_ifinfo *tmp, *orig_ifinfo = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(tmp, &orig_node->ifinfo_list, list) { if (tmp->if_outgoing != if_outgoing) continue; if (!kref_get_unless_zero(&tmp->refcount)) continue; orig_ifinfo = tmp; break; } rcu_read_unlock(); return orig_ifinfo; } /** * batadv_orig_ifinfo_new() - search and possibly create an orig_ifinfo object * @orig_node: the orig node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * * Return: NULL in case of failure or the orig_ifinfo object for the if_outgoing * interface otherwise. The object is created and added to the list * if it does not exist. * * The object is returned with refcounter increased by 1. */ struct batadv_orig_ifinfo * batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing) { struct batadv_orig_ifinfo *orig_ifinfo; unsigned long reset_time; spin_lock_bh(&orig_node->neigh_list_lock); orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_outgoing); if (orig_ifinfo) goto out; orig_ifinfo = kzalloc_obj(*orig_ifinfo, GFP_ATOMIC); if (!orig_ifinfo) goto out; if (if_outgoing != BATADV_IF_DEFAULT) kref_get(&if_outgoing->refcount); reset_time = jiffies - 1; reset_time -= msecs_to_jiffies(BATADV_RESET_PROTECTION_MS); orig_ifinfo->batman_seqno_reset = reset_time; orig_ifinfo->if_outgoing = if_outgoing; INIT_HLIST_NODE(&orig_ifinfo->list); kref_init(&orig_ifinfo->refcount); kref_get(&orig_ifinfo->refcount); hlist_add_head_rcu(&orig_ifinfo->list, &orig_node->ifinfo_list); out: spin_unlock_bh(&orig_node->neigh_list_lock); return orig_ifinfo; } /** * batadv_neigh_ifinfo_get() - find the ifinfo from an neigh_node * @neigh: the neigh node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * * The object is returned with refcounter increased by 1. * * Return: the requested neigh_ifinfo or NULL if not found */ struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_ifinfo *neigh_ifinfo = NULL, *tmp_neigh_ifinfo; rcu_read_lock(); hlist_for_each_entry_rcu(tmp_neigh_ifinfo, &neigh->ifinfo_list, list) { if (tmp_neigh_ifinfo->if_outgoing != if_outgoing) continue; if (!kref_get_unless_zero(&tmp_neigh_ifinfo->refcount)) continue; neigh_ifinfo = tmp_neigh_ifinfo; break; } rcu_read_unlock(); return neigh_ifinfo; } /** * batadv_neigh_ifinfo_new() - search and possibly create an neigh_ifinfo object * @neigh: the neigh node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * * Return: NULL in case of failure or the neigh_ifinfo object for the * if_outgoing interface otherwise. The object is created and added to the list * if it does not exist. * * The object is returned with refcounter increased by 1. */ struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_ifinfo *neigh_ifinfo; spin_lock_bh(&neigh->ifinfo_lock); neigh_ifinfo = batadv_neigh_ifinfo_get(neigh, if_outgoing); if (neigh_ifinfo) goto out; neigh_ifinfo = kzalloc_obj(*neigh_ifinfo, GFP_ATOMIC); if (!neigh_ifinfo) goto out; if (if_outgoing) kref_get(&if_outgoing->refcount); INIT_HLIST_NODE(&neigh_ifinfo->list); kref_init(&neigh_ifinfo->refcount); neigh_ifinfo->if_outgoing = if_outgoing; kref_get(&neigh_ifinfo->refcount); hlist_add_head_rcu(&neigh_ifinfo->list, &neigh->ifinfo_list); out: spin_unlock_bh(&neigh->ifinfo_lock); return neigh_ifinfo; } /** * batadv_neigh_node_get() - retrieve a neighbour from the list * @orig_node: originator which the neighbour belongs to * @hard_iface: the interface where this neighbour is connected to * @addr: the address of the neighbour * * Looks for and possibly returns a neighbour belonging to this originator list * which is connected through the provided hard interface. * * Return: neighbor when found. Otherwise NULL */ static struct batadv_neigh_node * batadv_neigh_node_get(const struct batadv_orig_node *orig_node, const struct batadv_hard_iface *hard_iface, const u8 *addr) { struct batadv_neigh_node *tmp_neigh_node, *res = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(tmp_neigh_node, &orig_node->neigh_list, list) { if (!batadv_compare_eth(tmp_neigh_node->addr, addr)) continue; if (tmp_neigh_node->if_incoming != hard_iface) continue; if (!kref_get_unless_zero(&tmp_neigh_node->refcount)) continue; res = tmp_neigh_node; break; } rcu_read_unlock(); return res; } /** * batadv_hardif_neigh_create() - create a hardif neighbour node * @hard_iface: the interface this neighbour is connected to * @neigh_addr: the interface address of the neighbour to retrieve * @orig_node: originator object representing the neighbour * * Return: the hardif neighbour node if found or created or NULL otherwise. */ static struct batadv_hardif_neigh_node * batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, const u8 *neigh_addr, struct batadv_orig_node *orig_node) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); struct batadv_hardif_neigh_node *hardif_neigh; spin_lock_bh(&hard_iface->neigh_list_lock); /* check if neighbor hasn't been added in the meantime */ hardif_neigh = batadv_hardif_neigh_get(hard_iface, neigh_addr); if (hardif_neigh) goto out; hardif_neigh = kzalloc_obj(*hardif_neigh, GFP_ATOMIC); if (!hardif_neigh) goto out; kref_get(&hard_iface->refcount); INIT_HLIST_NODE(&hardif_neigh->list); ether_addr_copy(hardif_neigh->addr, neigh_addr); ether_addr_copy(hardif_neigh->orig, orig_node->orig); hardif_neigh->if_incoming = hard_iface; hardif_neigh->last_seen = jiffies; kref_init(&hardif_neigh->refcount); if (bat_priv->algo_ops->neigh.hardif_init) bat_priv->algo_ops->neigh.hardif_init(hardif_neigh); hlist_add_head_rcu(&hardif_neigh->list, &hard_iface->neigh_list); out: spin_unlock_bh(&hard_iface->neigh_list_lock); return hardif_neigh; } /** * batadv_hardif_neigh_get_or_create() - retrieve or create a hardif neighbour * node * @hard_iface: the interface this neighbour is connected to * @neigh_addr: the interface address of the neighbour to retrieve * @orig_node: originator object representing the neighbour * * Return: the hardif neighbour node if found or created or NULL otherwise. */ static struct batadv_hardif_neigh_node * batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface, const u8 *neigh_addr, struct batadv_orig_node *orig_node) { struct batadv_hardif_neigh_node *hardif_neigh; /* first check without locking to avoid the overhead */ hardif_neigh = batadv_hardif_neigh_get(hard_iface, neigh_addr); if (hardif_neigh) return hardif_neigh; return batadv_hardif_neigh_create(hard_iface, neigh_addr, orig_node); } /** * batadv_hardif_neigh_get() - retrieve a hardif neighbour from the list * @hard_iface: the interface where this neighbour is connected to * @neigh_addr: the address of the neighbour * * Looks for and possibly returns a neighbour belonging to this hard interface. * * Return: neighbor when found. Otherwise NULL */ struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, const u8 *neigh_addr) { struct batadv_hardif_neigh_node *tmp_hardif_neigh, *hardif_neigh = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(tmp_hardif_neigh, &hard_iface->neigh_list, list) { if (!batadv_compare_eth(tmp_hardif_neigh->addr, neigh_addr)) continue; if (!kref_get_unless_zero(&tmp_hardif_neigh->refcount)) continue; hardif_neigh = tmp_hardif_neigh; break; } rcu_read_unlock(); return hardif_neigh; } /** * batadv_neigh_node_create() - create a neigh node object * @orig_node: originator object representing the neighbour * @hard_iface: the interface where the neighbour is connected to * @neigh_addr: the mac address of the neighbour interface * * Allocates a new neigh_node object and initialises all the generic fields. * * Return: the neighbour node if found or created or NULL otherwise. */ static struct batadv_neigh_node * batadv_neigh_node_create(struct batadv_orig_node *orig_node, struct batadv_hard_iface *hard_iface, const u8 *neigh_addr) { struct batadv_neigh_node *neigh_node; struct batadv_hardif_neigh_node *hardif_neigh = NULL; spin_lock_bh(&orig_node->neigh_list_lock); neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr); if (neigh_node) goto out; hardif_neigh = batadv_hardif_neigh_get_or_create(hard_iface, neigh_addr, orig_node); if (!hardif_neigh) goto out; neigh_node = kzalloc_obj(*neigh_node, GFP_ATOMIC); if (!neigh_node) goto out; INIT_HLIST_NODE(&neigh_node->list); INIT_HLIST_HEAD(&neigh_node->ifinfo_list); spin_lock_init(&neigh_node->ifinfo_lock); kref_get(&hard_iface->refcount); ether_addr_copy(neigh_node->addr, neigh_addr); neigh_node->if_incoming = hard_iface; neigh_node->orig_node = orig_node; neigh_node->last_seen = jiffies; /* increment unique neighbor refcount */ kref_get(&hardif_neigh->refcount); neigh_node->hardif_neigh = hardif_neigh; /* extra reference for return */ kref_init(&neigh_node->refcount); kref_get(&neigh_node->refcount); hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv, "Creating new neighbor %pM for orig_node %pM on interface %s\n", neigh_addr, orig_node->orig, hard_iface->net_dev->name); out: spin_unlock_bh(&orig_node->neigh_list_lock); batadv_hardif_neigh_put(hardif_neigh); return neigh_node; } /** * batadv_neigh_node_get_or_create() - retrieve or create a neigh node object * @orig_node: originator object representing the neighbour * @hard_iface: the interface where the neighbour is connected to * @neigh_addr: the mac address of the neighbour interface * * Return: the neighbour node if found or created or NULL otherwise. */ struct batadv_neigh_node * batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node, struct batadv_hard_iface *hard_iface, const u8 *neigh_addr) { struct batadv_neigh_node *neigh_node; /* first check without locking to avoid the overhead */ neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr); if (neigh_node) return neigh_node; return batadv_neigh_node_create(orig_node, hard_iface, neigh_addr); } /** * batadv_hardif_neigh_dump() - Dump to netlink the neighbor infos for a * specific outgoing interface * @msg: message to dump into * @cb: parameters for the dump * * Return: 0 or error value */ int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if, *hard_iface; struct net_device *mesh_iface; struct batadv_priv *bat_priv; int ret; mesh_iface = batadv_netlink_get_meshif(cb); if (IS_ERR(mesh_iface)) return PTR_ERR(mesh_iface); bat_priv = netdev_priv(mesh_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) { ret = -ENOENT; goto out_put_mesh_iface; } if (primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out_put_primary_if; } hard_iface = batadv_netlink_get_hardif(bat_priv, cb); if (IS_ERR(hard_iface) && PTR_ERR(hard_iface) != -ENONET) { ret = PTR_ERR(hard_iface); goto out_put_primary_if; } else if (IS_ERR(hard_iface)) { /* => PTR_ERR(hard_iface) == -ENONET * => no hard-iface given, ok */ hard_iface = BATADV_IF_DEFAULT; } if (!bat_priv->algo_ops->neigh.dump) { ret = -EOPNOTSUPP; goto out_put_hard_iface; } bat_priv->algo_ops->neigh.dump(msg, cb, bat_priv, hard_iface); ret = msg->len; out_put_hard_iface: batadv_hardif_put(hard_iface); out_put_primary_if: batadv_hardif_put(primary_if); out_put_mesh_iface: dev_put(mesh_iface); return ret; } /** * batadv_orig_ifinfo_release() - release orig_ifinfo from lists and queue for * free after rcu grace period * @ref: kref pointer of the orig_ifinfo */ void batadv_orig_ifinfo_release(struct kref *ref) { struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_node *router; orig_ifinfo = container_of(ref, struct batadv_orig_ifinfo, refcount); if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT) batadv_hardif_put(orig_ifinfo->if_outgoing); /* this is the last reference to this object */ router = rcu_dereference_protected(orig_ifinfo->router, true); batadv_neigh_node_put(router); kfree_rcu(orig_ifinfo, rcu); } /** * batadv_orig_node_free_rcu() - free the orig_node * @rcu: rcu pointer of the orig_node */ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) { struct batadv_orig_node *orig_node; orig_node = container_of(rcu, struct batadv_orig_node, rcu); batadv_mcast_purge_orig(orig_node); batadv_frag_purge_orig(orig_node, NULL); kfree(orig_node->tt_buff); kfree(orig_node); } /** * batadv_orig_node_release() - release orig_node from lists and queue for * free after rcu grace period * @ref: kref pointer of the orig_node */ void batadv_orig_node_release(struct kref *ref) { struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_orig_node_vlan *vlan; struct batadv_orig_ifinfo *last_candidate; orig_node = container_of(ref, struct batadv_orig_node, refcount); spin_lock_bh(&orig_node->neigh_list_lock); /* for all neighbors towards this originator ... */ hlist_for_each_entry_safe(neigh_node, node_tmp, &orig_node->neigh_list, list) { hlist_del_rcu(&neigh_node->list); batadv_neigh_node_put(neigh_node); } hlist_for_each_entry_safe(orig_ifinfo, node_tmp, &orig_node->ifinfo_list, list) { hlist_del_rcu(&orig_ifinfo->list); batadv_orig_ifinfo_put(orig_ifinfo); } last_candidate = orig_node->last_bonding_candidate; orig_node->last_bonding_candidate = NULL; spin_unlock_bh(&orig_node->neigh_list_lock); batadv_orig_ifinfo_put(last_candidate); spin_lock_bh(&orig_node->vlan_list_lock); hlist_for_each_entry_safe(vlan, node_tmp, &orig_node->vlan_list, list) { hlist_del_rcu(&vlan->list); batadv_orig_node_vlan_put(vlan); } spin_unlock_bh(&orig_node->vlan_list_lock); call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu); } /** * batadv_originator_free() - Free all originator structures * @bat_priv: the bat priv with all the mesh interface information */ void batadv_originator_free(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_node *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* spinlock to protect write access */ struct batadv_orig_node *orig_node; u32 i; if (!hash) return; cancel_delayed_work_sync(&bat_priv->orig_work); bat_priv->orig_hash = NULL; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(orig_node, node_tmp, head, hash_entry) { hlist_del_rcu(&orig_node->hash_entry); batadv_orig_node_put(orig_node); } spin_unlock_bh(list_lock); } batadv_hash_destroy(hash); } /** * batadv_orig_node_new() - creates a new orig_node * @bat_priv: the bat priv with all the mesh interface information * @addr: the mac address of the originator * * Creates a new originator object and initialises all the generic fields. * The new object is not added to the originator list. * * Return: the newly created object or NULL on failure. */ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const u8 *addr) { struct batadv_orig_node *orig_node; struct batadv_orig_node_vlan *vlan; unsigned long reset_time; int i; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Creating new originator: %pM\n", addr); orig_node = kzalloc_obj(*orig_node, GFP_ATOMIC); if (!orig_node) return NULL; INIT_HLIST_HEAD(&orig_node->neigh_list); INIT_HLIST_HEAD(&orig_node->vlan_list); INIT_HLIST_HEAD(&orig_node->ifinfo_list); spin_lock_init(&orig_node->bcast_seqno_lock); spin_lock_init(&orig_node->neigh_list_lock); spin_lock_init(&orig_node->tt_buff_lock); spin_lock_init(&orig_node->tt_lock); spin_lock_init(&orig_node->vlan_list_lock); /* extra reference for return */ kref_init(&orig_node->refcount); orig_node->bat_priv = bat_priv; ether_addr_copy(orig_node->orig, addr); batadv_dat_init_orig_node_addr(orig_node); atomic_set(&orig_node->last_ttvn, 0); orig_node->tt_buff = NULL; orig_node->tt_buff_len = 0; orig_node->last_seen = jiffies; reset_time = jiffies - 1 - msecs_to_jiffies(BATADV_RESET_PROTECTION_MS); orig_node->bcast_seqno_reset = reset_time; #ifdef CONFIG_BATMAN_ADV_MCAST orig_node->mcast_flags = BATADV_MCAST_WANT_NO_RTR4; orig_node->mcast_flags |= BATADV_MCAST_WANT_NO_RTR6; orig_node->mcast_flags |= BATADV_MCAST_HAVE_MC_PTYPE_CAPA; INIT_HLIST_NODE(&orig_node->mcast_want_all_unsnoopables_node); INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv4_node); INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv6_node); spin_lock_init(&orig_node->mcast_handler_lock); #endif /* create a vlan object for the "untagged" LAN */ vlan = batadv_orig_node_vlan_new(orig_node, BATADV_NO_FLAGS); if (!vlan) goto free_orig_node; /* batadv_orig_node_vlan_new() increases the refcounter. * Immediately release vlan since it is not needed anymore in this * context */ batadv_orig_node_vlan_put(vlan); for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) { INIT_HLIST_HEAD(&orig_node->fragments[i].fragment_list); spin_lock_init(&orig_node->fragments[i].lock); orig_node->fragments[i].size = 0; } return orig_node; free_orig_node: kfree(orig_node); return NULL; } /** * batadv_purge_neigh_ifinfo() - purge obsolete ifinfo entries from neighbor * @bat_priv: the bat priv with all the mesh interface information * @neigh: orig node which is to be checked */ static void batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv, struct batadv_neigh_node *neigh) { struct batadv_neigh_ifinfo *neigh_ifinfo; struct batadv_hard_iface *if_outgoing; struct hlist_node *node_tmp; spin_lock_bh(&neigh->ifinfo_lock); /* for all ifinfo objects for this neighinator */ hlist_for_each_entry_safe(neigh_ifinfo, node_tmp, &neigh->ifinfo_list, list) { if_outgoing = neigh_ifinfo->if_outgoing; /* always keep the default interface */ if (if_outgoing == BATADV_IF_DEFAULT) continue; /* don't purge if the interface is not (going) down */ if (if_outgoing->if_status != BATADV_IF_INACTIVE && if_outgoing->if_status != BATADV_IF_NOT_IN_USE && if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED) continue; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "neighbor/ifinfo purge: neighbor %pM, iface: %s\n", neigh->addr, if_outgoing->net_dev->name); hlist_del_rcu(&neigh_ifinfo->list); batadv_neigh_ifinfo_put(neigh_ifinfo); } spin_unlock_bh(&neigh->ifinfo_lock); } /** * batadv_purge_orig_ifinfo() - purge obsolete ifinfo entries from originator * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node which is to be checked * * Return: true if any ifinfo entry was purged, false otherwise. */ static bool batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_hard_iface *if_outgoing; struct hlist_node *node_tmp; bool ifinfo_purged = false; spin_lock_bh(&orig_node->neigh_list_lock); /* for all ifinfo objects for this originator */ hlist_for_each_entry_safe(orig_ifinfo, node_tmp, &orig_node->ifinfo_list, list) { if_outgoing = orig_ifinfo->if_outgoing; /* always keep the default interface */ if (if_outgoing == BATADV_IF_DEFAULT) continue; /* don't purge if the interface is not (going) down */ if (if_outgoing->if_status != BATADV_IF_INACTIVE && if_outgoing->if_status != BATADV_IF_NOT_IN_USE && if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED) continue; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "router/ifinfo purge: originator %pM, iface: %s\n", orig_node->orig, if_outgoing->net_dev->name); ifinfo_purged = true; hlist_del_rcu(&orig_ifinfo->list); batadv_orig_ifinfo_put(orig_ifinfo); if (orig_node->last_bonding_candidate == orig_ifinfo) { orig_node->last_bonding_candidate = NULL; batadv_orig_ifinfo_put(orig_ifinfo); } } spin_unlock_bh(&orig_node->neigh_list_lock); return ifinfo_purged; } /** * batadv_purge_orig_neighbors() - purges neighbors from originator * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node which is to be checked * * Return: true if any neighbor was purged, false otherwise */ static bool batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; bool neigh_purged = false; unsigned long last_seen; struct batadv_hard_iface *if_incoming; spin_lock_bh(&orig_node->neigh_list_lock); /* for all neighbors towards this originator ... */ hlist_for_each_entry_safe(neigh_node, node_tmp, &orig_node->neigh_list, list) { last_seen = neigh_node->last_seen; if_incoming = neigh_node->if_incoming; if (batadv_has_timed_out(last_seen, BATADV_PURGE_TIMEOUT) || if_incoming->if_status == BATADV_IF_INACTIVE || if_incoming->if_status == BATADV_IF_NOT_IN_USE || if_incoming->if_status == BATADV_IF_TO_BE_REMOVED) { if (if_incoming->if_status == BATADV_IF_INACTIVE || if_incoming->if_status == BATADV_IF_NOT_IN_USE || if_incoming->if_status == BATADV_IF_TO_BE_REMOVED) batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "neighbor purge: originator %pM, neighbor: %pM, iface: %s\n", orig_node->orig, neigh_node->addr, if_incoming->net_dev->name); else batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "neighbor timeout: originator %pM, neighbor: %pM, last_seen: %u\n", orig_node->orig, neigh_node->addr, jiffies_to_msecs(last_seen)); neigh_purged = true; hlist_del_rcu(&neigh_node->list); batadv_neigh_node_put(neigh_node); } else { /* only necessary if not the whole neighbor is to be * deleted, but some interface has been removed. */ batadv_purge_neigh_ifinfo(bat_priv, neigh_node); } } spin_unlock_bh(&orig_node->neigh_list_lock); return neigh_purged; } /** * batadv_find_best_neighbor() - finds the best neighbor after purging * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node which is to be checked * @if_outgoing: the interface for which the metric should be compared * * Return: the current best neighbor, with refcount increased. */ static struct batadv_neigh_node * batadv_find_best_neighbor(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_node *best = NULL, *neigh; struct batadv_algo_ops *bao = bat_priv->algo_ops; rcu_read_lock(); hlist_for_each_entry_rcu(neigh, &orig_node->neigh_list, list) { if (best && (bao->neigh.cmp(neigh, if_outgoing, best, if_outgoing) <= 0)) continue; if (!kref_get_unless_zero(&neigh->refcount)) continue; batadv_neigh_node_put(best); best = neigh; } rcu_read_unlock(); return best; } /** * batadv_purge_orig_node() - purges obsolete information from an orig_node * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node which is to be checked * * This function checks if the orig_node or substructures of it have become * obsolete, and purges this information if that's the case. * * Return: true if the orig_node is to be removed, false otherwise. */ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_neigh_node *best_neigh_node; struct batadv_hard_iface *hard_iface; bool changed_ifinfo, changed_neigh; struct list_head *iter; if (batadv_has_timed_out(orig_node->last_seen, 2 * BATADV_PURGE_TIMEOUT)) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Originator timeout: originator %pM, last_seen %u\n", orig_node->orig, jiffies_to_msecs(orig_node->last_seen)); return true; } changed_ifinfo = batadv_purge_orig_ifinfo(bat_priv, orig_node); changed_neigh = batadv_purge_orig_neighbors(bat_priv, orig_node); if (!changed_ifinfo && !changed_neigh) return false; /* first for NULL ... */ best_neigh_node = batadv_find_best_neighbor(bat_priv, orig_node, BATADV_IF_DEFAULT); batadv_update_route(bat_priv, orig_node, BATADV_IF_DEFAULT, best_neigh_node); batadv_neigh_node_put(best_neigh_node); /* ... then for all other interfaces. */ rcu_read_lock(); netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (!kref_get_unless_zero(&hard_iface->refcount)) continue; best_neigh_node = batadv_find_best_neighbor(bat_priv, orig_node, hard_iface); batadv_update_route(bat_priv, orig_node, hard_iface, best_neigh_node); batadv_neigh_node_put(best_neigh_node); batadv_hardif_put(hard_iface); } rcu_read_unlock(); return false; } /** * batadv_purge_orig_ref() - Purge all outdated originators * @bat_priv: the bat priv with all the mesh interface information */ void batadv_purge_orig_ref(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_node *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* spinlock to protect write access */ struct batadv_orig_node *orig_node; u32 i; if (!hash) return; /* for all origins... */ for (i = 0; i < hash->size; i++) { head = &hash->table[i]; if (hlist_empty(head)) continue; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(orig_node, node_tmp, head, hash_entry) { if (batadv_purge_orig_node(bat_priv, orig_node)) { batadv_gw_node_delete(bat_priv, orig_node); hlist_del_rcu(&orig_node->hash_entry); batadv_tt_global_del_orig(orig_node->bat_priv, orig_node, -1, "originator timed out"); batadv_orig_node_put(orig_node); continue; } batadv_frag_purge_orig(orig_node, batadv_frag_check_entry); } spin_unlock_bh(list_lock); } batadv_gw_election(bat_priv); } static void batadv_purge_orig(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_priv *bat_priv; delayed_work = to_delayed_work(work); bat_priv = container_of(delayed_work, struct batadv_priv, orig_work); batadv_purge_orig_ref(bat_priv); queue_delayed_work(batadv_event_workqueue, &bat_priv->orig_work, msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD)); } /** * batadv_orig_dump() - Dump to netlink the originator infos for a specific * outgoing interface * @msg: message to dump into * @cb: parameters for the dump * * Return: 0 or error value */ int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if, *hard_iface; struct net_device *mesh_iface; struct batadv_priv *bat_priv; int ret; mesh_iface = batadv_netlink_get_meshif(cb); if (IS_ERR(mesh_iface)) return PTR_ERR(mesh_iface); bat_priv = netdev_priv(mesh_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) { ret = -ENOENT; goto out_put_mesh_iface; } if (primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out_put_primary_if; } hard_iface = batadv_netlink_get_hardif(bat_priv, cb); if (IS_ERR(hard_iface) && PTR_ERR(hard_iface) != -ENONET) { ret = PTR_ERR(hard_iface); goto out_put_primary_if; } else if (IS_ERR(hard_iface)) { /* => PTR_ERR(hard_iface) == -ENONET * => no hard-iface given, ok */ hard_iface = BATADV_IF_DEFAULT; } if (!bat_priv->algo_ops->orig.dump) { ret = -EOPNOTSUPP; goto out_put_hard_iface; } bat_priv->algo_ops->orig.dump(msg, cb, bat_priv, hard_iface); ret = msg->len; out_put_hard_iface: batadv_hardif_put(hard_iface); out_put_primary_if: batadv_hardif_put(primary_if); out_put_mesh_iface: dev_put(mesh_iface); return ret; }
1 1 1 1 1 5 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linear symmetric key cipher operations. * * Generic encrypt/decrypt wrapper for ciphers. * * Copyright (c) 2023 Herbert Xu <herbert@gondor.apana.org.au> */ #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/string.h> #include <net/netlink.h> #include "skcipher.h" static inline struct crypto_lskcipher *__crypto_lskcipher_cast( struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_lskcipher, base); } static inline struct lskcipher_alg *__crypto_lskcipher_alg( struct crypto_alg *alg) { return container_of(alg, struct lskcipher_alg, co.base); } static int lskcipher_setkey_unaligned(struct crypto_lskcipher *tfm, const u8 *key, unsigned int keylen) { unsigned long alignmask = crypto_lskcipher_alignmask(tfm); struct lskcipher_alg *cipher = crypto_lskcipher_alg(tfm); u8 *buffer, *alignbuffer; unsigned long absize; int ret; absize = keylen + alignmask; buffer = kmalloc(absize, GFP_ATOMIC); if (!buffer) return -ENOMEM; alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = cipher->setkey(tfm, alignbuffer, keylen); kfree_sensitive(buffer); return ret; } int crypto_lskcipher_setkey(struct crypto_lskcipher *tfm, const u8 *key, unsigned int keylen) { unsigned long alignmask = crypto_lskcipher_alignmask(tfm); struct lskcipher_alg *cipher = crypto_lskcipher_alg(tfm); if (keylen < cipher->co.min_keysize || keylen > cipher->co.max_keysize) return -EINVAL; if ((unsigned long)key & alignmask) return lskcipher_setkey_unaligned(tfm, key, keylen); else return cipher->setkey(tfm, key, keylen); } EXPORT_SYMBOL_GPL(crypto_lskcipher_setkey); static int crypto_lskcipher_crypt_unaligned( struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *iv, int (*crypt)(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *iv, u32 flags)) { unsigned statesize = crypto_lskcipher_statesize(tfm); unsigned ivsize = crypto_lskcipher_ivsize(tfm); unsigned bs = crypto_lskcipher_blocksize(tfm); unsigned cs = crypto_lskcipher_chunksize(tfm); int err; u8 *tiv; u8 *p; BUILD_BUG_ON(MAX_CIPHER_BLOCKSIZE > PAGE_SIZE || MAX_CIPHER_ALIGNMASK >= PAGE_SIZE); tiv = kmalloc(PAGE_SIZE, GFP_ATOMIC); if (!tiv) return -ENOMEM; memcpy(tiv, iv, ivsize + statesize); p = kmalloc(PAGE_SIZE, GFP_ATOMIC); err = -ENOMEM; if (!p) goto out; while (len >= bs) { unsigned chunk = min((unsigned)PAGE_SIZE, len); int err; if (chunk > cs) chunk &= ~(cs - 1); memcpy(p, src, chunk); err = crypt(tfm, p, p, chunk, tiv, CRYPTO_LSKCIPHER_FLAG_FINAL); if (err) goto out; memcpy(dst, p, chunk); src += chunk; dst += chunk; len -= chunk; } err = len ? -EINVAL : 0; out: memcpy(iv, tiv, ivsize + statesize); kfree_sensitive(p); kfree_sensitive(tiv); return err; } static int crypto_lskcipher_crypt(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *iv, int (*crypt)(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *iv, u32 flags)) { unsigned long alignmask = crypto_lskcipher_alignmask(tfm); if (((unsigned long)src | (unsigned long)dst | (unsigned long)iv) & alignmask) return crypto_lskcipher_crypt_unaligned(tfm, src, dst, len, iv, crypt); return crypt(tfm, src, dst, len, iv, CRYPTO_LSKCIPHER_FLAG_FINAL); } int crypto_lskcipher_encrypt(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *iv) { struct lskcipher_alg *alg = crypto_lskcipher_alg(tfm); return crypto_lskcipher_crypt(tfm, src, dst, len, iv, alg->encrypt); } EXPORT_SYMBOL_GPL(crypto_lskcipher_encrypt); int crypto_lskcipher_decrypt(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *iv) { struct lskcipher_alg *alg = crypto_lskcipher_alg(tfm); return crypto_lskcipher_crypt(tfm, src, dst, len, iv, alg->decrypt); } EXPORT_SYMBOL_GPL(crypto_lskcipher_decrypt); static int crypto_lskcipher_crypt_sg(struct skcipher_request *req, int (*crypt)(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *ivs, u32 flags)) { struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req); struct crypto_lskcipher **ctx = crypto_skcipher_ctx(skcipher); u8 *ivs = skcipher_request_ctx(req); struct crypto_lskcipher *tfm = *ctx; struct skcipher_walk walk; unsigned ivsize; u32 flags; int err; ivsize = crypto_lskcipher_ivsize(tfm); ivs = PTR_ALIGN(ivs, crypto_skcipher_alignmask(skcipher) + 1); memcpy(ivs, req->iv, ivsize); flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP; if (req->base.flags & CRYPTO_SKCIPHER_REQ_CONT) flags |= CRYPTO_LSKCIPHER_FLAG_CONT; if (!(req->base.flags & CRYPTO_SKCIPHER_REQ_NOTFINAL)) flags |= CRYPTO_LSKCIPHER_FLAG_FINAL; err = skcipher_walk_virt(&walk, req, false); while (walk.nbytes) { err = crypt(tfm, walk.src.virt.addr, walk.dst.virt.addr, walk.nbytes, ivs, flags & ~(walk.nbytes == walk.total ? 0 : CRYPTO_LSKCIPHER_FLAG_FINAL)); err = skcipher_walk_done(&walk, err); flags |= CRYPTO_LSKCIPHER_FLAG_CONT; } memcpy(req->iv, ivs, ivsize); return err; } int crypto_lskcipher_encrypt_sg(struct skcipher_request *req) { struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req); struct crypto_lskcipher **ctx = crypto_skcipher_ctx(skcipher); struct lskcipher_alg *alg = crypto_lskcipher_alg(*ctx); return crypto_lskcipher_crypt_sg(req, alg->encrypt); } int crypto_lskcipher_decrypt_sg(struct skcipher_request *req) { struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req); struct crypto_lskcipher **ctx = crypto_skcipher_ctx(skcipher); struct lskcipher_alg *alg = crypto_lskcipher_alg(*ctx); return crypto_lskcipher_crypt_sg(req, alg->decrypt); } static void crypto_lskcipher_exit_tfm(struct crypto_tfm *tfm) { struct crypto_lskcipher *skcipher = __crypto_lskcipher_cast(tfm); struct lskcipher_alg *alg = crypto_lskcipher_alg(skcipher); alg->exit(skcipher); } static int crypto_lskcipher_init_tfm(struct crypto_tfm *tfm) { struct crypto_lskcipher *skcipher = __crypto_lskcipher_cast(tfm); struct lskcipher_alg *alg = crypto_lskcipher_alg(skcipher); if (alg->exit) skcipher->base.exit = crypto_lskcipher_exit_tfm; if (alg->init) return alg->init(skcipher); return 0; } static void crypto_lskcipher_free_instance(struct crypto_instance *inst) { struct lskcipher_instance *skcipher = container_of(inst, struct lskcipher_instance, s.base); skcipher->free(skcipher); } static void __maybe_unused crypto_lskcipher_show( struct seq_file *m, struct crypto_alg *alg) { struct lskcipher_alg *skcipher = __crypto_lskcipher_alg(alg); seq_printf(m, "type : lskcipher\n"); seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); seq_printf(m, "min keysize : %u\n", skcipher->co.min_keysize); seq_printf(m, "max keysize : %u\n", skcipher->co.max_keysize); seq_printf(m, "ivsize : %u\n", skcipher->co.ivsize); seq_printf(m, "chunksize : %u\n", skcipher->co.chunksize); seq_printf(m, "statesize : %u\n", skcipher->co.statesize); } static int __maybe_unused crypto_lskcipher_report( struct sk_buff *skb, struct crypto_alg *alg) { struct lskcipher_alg *skcipher = __crypto_lskcipher_alg(alg); struct crypto_report_blkcipher rblkcipher; memset(&rblkcipher, 0, sizeof(rblkcipher)); strscpy(rblkcipher.type, "lskcipher", sizeof(rblkcipher.type)); strscpy(rblkcipher.geniv, "<none>", sizeof(rblkcipher.geniv)); rblkcipher.blocksize = alg->cra_blocksize; rblkcipher.min_keysize = skcipher->co.min_keysize; rblkcipher.max_keysize = skcipher->co.max_keysize; rblkcipher.ivsize = skcipher->co.ivsize; return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER, sizeof(rblkcipher), &rblkcipher); } static const struct crypto_type crypto_lskcipher_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_lskcipher_init_tfm, .free = crypto_lskcipher_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_lskcipher_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_lskcipher_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_MASK, .type = CRYPTO_ALG_TYPE_LSKCIPHER, .tfmsize = offsetof(struct crypto_lskcipher, base), .algsize = offsetof(struct lskcipher_alg, co.base), }; static void crypto_lskcipher_exit_tfm_sg(struct crypto_tfm *tfm) { struct crypto_lskcipher **ctx = crypto_tfm_ctx(tfm); crypto_free_lskcipher(*ctx); } int crypto_init_lskcipher_ops_sg(struct crypto_tfm *tfm) { struct crypto_lskcipher **ctx = crypto_tfm_ctx(tfm); struct crypto_alg *calg = tfm->__crt_alg; struct crypto_lskcipher *skcipher; if (!crypto_mod_get(calg)) return -EAGAIN; skcipher = crypto_create_tfm(calg, &crypto_lskcipher_type); if (IS_ERR(skcipher)) { crypto_mod_put(calg); return PTR_ERR(skcipher); } *ctx = skcipher; tfm->exit = crypto_lskcipher_exit_tfm_sg; return 0; } int crypto_grab_lskcipher(struct crypto_lskcipher_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_lskcipher_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_lskcipher); struct crypto_lskcipher *crypto_alloc_lskcipher(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_lskcipher_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_lskcipher); static int lskcipher_prepare_alg(struct lskcipher_alg *alg) { struct crypto_alg *base = &alg->co.base; int err; err = skcipher_prepare_alg_common(&alg->co); if (err) return err; if (alg->co.chunksize & (alg->co.chunksize - 1)) return -EINVAL; base->cra_type = &crypto_lskcipher_type; base->cra_flags |= CRYPTO_ALG_TYPE_LSKCIPHER; return 0; } int crypto_register_lskcipher(struct lskcipher_alg *alg) { struct crypto_alg *base = &alg->co.base; int err; err = lskcipher_prepare_alg(alg); if (err) return err; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_lskcipher); void crypto_unregister_lskcipher(struct lskcipher_alg *alg) { crypto_unregister_alg(&alg->co.base); } EXPORT_SYMBOL_GPL(crypto_unregister_lskcipher); int crypto_register_lskciphers(struct lskcipher_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_lskcipher(&algs[i]); if (ret) { crypto_unregister_lskciphers(algs, i); return ret; } } return 0; } EXPORT_SYMBOL_GPL(crypto_register_lskciphers); void crypto_unregister_lskciphers(struct lskcipher_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_lskcipher(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_lskciphers); int lskcipher_register_instance(struct crypto_template *tmpl, struct lskcipher_instance *inst) { int err; if (WARN_ON(!inst->free)) return -EINVAL; err = lskcipher_prepare_alg(&inst->alg); if (err) return err; return crypto_register_instance(tmpl, lskcipher_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(lskcipher_register_instance); static int lskcipher_setkey_simple(struct crypto_lskcipher *tfm, const u8 *key, unsigned int keylen) { struct crypto_lskcipher *cipher = lskcipher_cipher_simple(tfm); crypto_lskcipher_clear_flags(cipher, CRYPTO_TFM_REQ_MASK); crypto_lskcipher_set_flags(cipher, crypto_lskcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); return crypto_lskcipher_setkey(cipher, key, keylen); } static int lskcipher_init_tfm_simple(struct crypto_lskcipher *tfm) { struct lskcipher_instance *inst = lskcipher_alg_instance(tfm); struct crypto_lskcipher **ctx = crypto_lskcipher_ctx(tfm); struct crypto_lskcipher_spawn *spawn; struct crypto_lskcipher *cipher; spawn = lskcipher_instance_ctx(inst); cipher = crypto_spawn_lskcipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); *ctx = cipher; return 0; } static void lskcipher_exit_tfm_simple(struct crypto_lskcipher *tfm) { struct crypto_lskcipher **ctx = crypto_lskcipher_ctx(tfm); crypto_free_lskcipher(*ctx); } static void lskcipher_free_instance_simple(struct lskcipher_instance *inst) { crypto_drop_lskcipher(lskcipher_instance_ctx(inst)); kfree(inst); } /** * lskcipher_alloc_instance_simple - allocate instance of simple block cipher * * Allocate an lskcipher_instance for a simple block cipher mode of operation, * e.g. cbc or ecb. The instance context will have just a single crypto_spawn, * that for the underlying cipher. The {min,max}_keysize, ivsize, blocksize, * alignmask, and priority are set from the underlying cipher but can be * overridden if needed. The tfm context defaults to * struct crypto_lskcipher *, and default ->setkey(), ->init(), and * ->exit() methods are installed. * * @tmpl: the template being instantiated * @tb: the template parameters * * Return: a pointer to the new instance, or an ERR_PTR(). The caller still * needs to register the instance. */ struct lskcipher_instance *lskcipher_alloc_instance_simple( struct crypto_template *tmpl, struct rtattr **tb) { u32 mask; struct lskcipher_instance *inst; struct crypto_lskcipher_spawn *spawn; char ecb_name[CRYPTO_MAX_ALG_NAME]; struct lskcipher_alg *cipher_alg; const char *cipher_name; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_LSKCIPHER, &mask); if (err) return ERR_PTR(err); cipher_name = crypto_attr_alg_name(tb[1]); if (IS_ERR(cipher_name)) return ERR_CAST(cipher_name); inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return ERR_PTR(-ENOMEM); spawn = lskcipher_instance_ctx(inst); err = crypto_grab_lskcipher(spawn, lskcipher_crypto_instance(inst), cipher_name, 0, mask); ecb_name[0] = 0; if (err == -ENOENT && !!memcmp(tmpl->name, "ecb", 4)) { err = -ENAMETOOLONG; if (snprintf(ecb_name, CRYPTO_MAX_ALG_NAME, "ecb(%s)", cipher_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; err = crypto_grab_lskcipher(spawn, lskcipher_crypto_instance(inst), ecb_name, 0, mask); } if (err) goto err_free_inst; cipher_alg = crypto_lskcipher_spawn_alg(spawn); err = crypto_inst_setname(lskcipher_crypto_instance(inst), tmpl->name, &cipher_alg->co.base); if (err) goto err_free_inst; if (ecb_name[0]) { int len; err = -EINVAL; len = strscpy(ecb_name, &cipher_alg->co.base.cra_name[4], sizeof(ecb_name)); if (len < 2) goto err_free_inst; if (ecb_name[len - 1] != ')') goto err_free_inst; ecb_name[len - 1] = 0; err = -ENAMETOOLONG; if (snprintf(inst->alg.co.base.cra_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", tmpl->name, ecb_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (strcmp(ecb_name, cipher_name) && snprintf(inst->alg.co.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", tmpl->name, cipher_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; } else { /* Don't allow nesting. */ err = -ELOOP; if ((cipher_alg->co.base.cra_flags & CRYPTO_ALG_INSTANCE)) goto err_free_inst; } err = -EINVAL; if (cipher_alg->co.ivsize) goto err_free_inst; inst->free = lskcipher_free_instance_simple; /* Default algorithm properties, can be overridden */ inst->alg.co.base.cra_blocksize = cipher_alg->co.base.cra_blocksize; inst->alg.co.base.cra_alignmask = cipher_alg->co.base.cra_alignmask; inst->alg.co.base.cra_priority = cipher_alg->co.base.cra_priority; inst->alg.co.min_keysize = cipher_alg->co.min_keysize; inst->alg.co.max_keysize = cipher_alg->co.max_keysize; inst->alg.co.ivsize = cipher_alg->co.base.cra_blocksize; inst->alg.co.statesize = cipher_alg->co.statesize; /* Use struct crypto_lskcipher * by default, can be overridden */ inst->alg.co.base.cra_ctxsize = sizeof(struct crypto_lskcipher *); inst->alg.setkey = lskcipher_setkey_simple; inst->alg.init = lskcipher_init_tfm_simple; inst->alg.exit = lskcipher_exit_tfm_simple; return inst; err_free_inst: lskcipher_free_instance_simple(inst); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(lskcipher_alloc_instance_simple);
1141 1139 5 2 4 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 // SPDX-License-Identifier: GPL-2.0-only /* * * Authors: * (C) 2015 Pengutronix, Alexander Aring <aar@pengutronix.de> */ #include <linux/if_arp.h> #include <linux/module.h> #include <net/6lowpan.h> #include <net/addrconf.h> #include "6lowpan_i.h" int lowpan_register_netdevice(struct net_device *dev, enum lowpan_lltypes lltype) { int i, ret; switch (lltype) { case LOWPAN_LLTYPE_IEEE802154: dev->addr_len = EUI64_ADDR_LEN; break; case LOWPAN_LLTYPE_BTLE: dev->addr_len = ETH_ALEN; break; } dev->type = ARPHRD_6LOWPAN; dev->mtu = IPV6_MIN_MTU; lowpan_dev(dev)->lltype = lltype; spin_lock_init(&lowpan_dev(dev)->ctx.lock); for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) lowpan_dev(dev)->ctx.table[i].id = i; dev->ndisc_ops = &lowpan_ndisc_ops; ret = register_netdevice(dev); if (ret < 0) return ret; lowpan_dev_debugfs_init(dev); return ret; } EXPORT_SYMBOL(lowpan_register_netdevice); int lowpan_register_netdev(struct net_device *dev, enum lowpan_lltypes lltype) { int ret; rtnl_lock(); ret = lowpan_register_netdevice(dev, lltype); rtnl_unlock(); return ret; } EXPORT_SYMBOL(lowpan_register_netdev); void lowpan_unregister_netdevice(struct net_device *dev) { unregister_netdevice(dev); lowpan_dev_debugfs_exit(dev); } EXPORT_SYMBOL(lowpan_unregister_netdevice); void lowpan_unregister_netdev(struct net_device *dev) { rtnl_lock(); lowpan_unregister_netdevice(dev); rtnl_unlock(); } EXPORT_SYMBOL(lowpan_unregister_netdev); int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev) { struct wpan_dev *wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr; /* Set short_addr autoconfiguration if short_addr is present only */ if (!lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) return -1; /* For either address format, all zero addresses MUST NOT be used */ if (wpan_dev->pan_id == cpu_to_le16(0x0000) && wpan_dev->short_addr == cpu_to_le16(0x0000)) return -1; /* Alternatively, if no PAN ID is known, 16 zero bits may be used */ if (wpan_dev->pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST)) memset(eui, 0, 2); else ieee802154_le16_to_be16(eui, &wpan_dev->pan_id); /* The "Universal/Local" (U/L) bit shall be set to zero */ eui[0] &= ~2; eui[2] = 0; eui[3] = 0xFF; eui[4] = 0xFE; eui[5] = 0; ieee802154_le16_to_be16(&eui[6], &wpan_dev->short_addr); return 0; } static int lowpan_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct inet6_dev *idev; struct in6_addr addr; int i; if (dev->type != ARPHRD_6LOWPAN) return NOTIFY_DONE; idev = __in6_dev_get(dev); if (!idev) return NOTIFY_DONE; switch (event) { case NETDEV_UP: case NETDEV_CHANGE: /* (802.15.4 6LoWPAN short address slaac handling */ if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) && addrconf_ifid_802154_6lowpan(addr.s6_addr + 8, dev) == 0) { __ipv6_addr_set_half(&addr.s6_addr32[0], htonl(0xFE800000), 0); addrconf_add_linklocal(idev, &addr, 0); } break; case NETDEV_DOWN: for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &lowpan_dev(dev)->ctx.table[i].flags); break; default: return NOTIFY_DONE; } return NOTIFY_OK; } static struct notifier_block lowpan_notifier = { .notifier_call = lowpan_event, }; static int __init lowpan_module_init(void) { int ret; lowpan_debugfs_init(); ret = register_netdevice_notifier(&lowpan_notifier); if (ret < 0) { lowpan_debugfs_exit(); return ret; } request_module_nowait("nhc_dest"); request_module_nowait("nhc_fragment"); request_module_nowait("nhc_hop"); request_module_nowait("nhc_ipv6"); request_module_nowait("nhc_mobility"); request_module_nowait("nhc_routing"); request_module_nowait("nhc_udp"); return 0; } static void __exit lowpan_module_exit(void) { lowpan_debugfs_exit(); unregister_netdevice_notifier(&lowpan_notifier); } module_init(lowpan_module_init); module_exit(lowpan_module_exit); MODULE_DESCRIPTION("IPv6 over Low-Power Wireless Personal Area Network core module"); MODULE_LICENSE("GPL");
13 13 9 4 10 4 9 2 7 18 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 // SPDX-License-Identifier: GPL-2.0 /* * xfrm6_policy.c: based on xfrm4_policy.c * * Authors: * Mitsuru KANDA @USAGI * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * IPv6 support * YOSHIFUJI Hideaki * Split up af-specific portion * */ #include <linux/err.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <net/addrconf.h> #include <net/dst.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/l3mdev.h> static struct dst_entry *xfrm6_dst_lookup(const struct xfrm_dst_lookup_params *params) { struct flowi6 fl6; struct dst_entry *dst; int err; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_l3mdev = l3mdev_master_ifindex_by_index(params->net, params->oif); fl6.flowi6_mark = params->mark; memcpy(&fl6.daddr, params->daddr, sizeof(fl6.daddr)); if (params->saddr) memcpy(&fl6.saddr, params->saddr, sizeof(fl6.saddr)); fl6.flowi4_proto = params->ipproto; fl6.uli = params->uli; dst = ip6_route_output(params->net, NULL, &fl6); err = dst->error; if (dst->error) { dst_release(dst); dst = ERR_PTR(err); } return dst; } static int xfrm6_get_saddr(xfrm_address_t *saddr, const struct xfrm_dst_lookup_params *params) { struct dst_entry *dst; struct net_device *dev; struct inet6_dev *idev; int err; dst = xfrm6_dst_lookup(params); if (IS_ERR(dst)) return -EHOSTUNREACH; idev = ip6_dst_idev(dst); if (!idev) { dst_release(dst); return -EHOSTUNREACH; } dev = idev->dev; err = ipv6_dev_get_saddr(dev_net(dev), dev, &params->daddr->in6, 0, &saddr->in6); dst_release(dst); if (err) return -EHOSTUNREACH; return 0; } static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { struct rt6_info *rt = dst_rt6_info(xdst->route); xdst->u.dst.dev = dev; netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC); xdst->u.rt6.rt6i_idev = in6_dev_get(dev); if (!xdst->u.rt6.rt6i_idev) { netdev_put(dev, &xdst->u.dst.dev_tracker); return -ENODEV; } /* Sheit... I remember I did this right. Apparently, * it was magically lost, so this code needs audit */ xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST | RTF_LOCAL); xdst->route_cookie = rt6_get_cookie(rt); xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; xdst->u.rt6.rt6i_dst = rt->rt6i_dst; xdst->u.rt6.rt6i_src = rt->rt6i_src; rt6_uncached_list_add(&xdst->u.rt6); return 0; } static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh); } static void xfrm6_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; path->ops->redirect(path, sk, skb); } static void xfrm6_dst_destroy(struct dst_entry *dst) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; dst_destroy_metrics_generic(dst); rt6_uncached_list_del(&xdst->u.rt6); if (likely(xdst->u.rt6.rt6i_idev)) in6_dev_put(xdst->u.rt6.rt6i_idev); xfrm_dst_destroy(xdst); } static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { struct xfrm_dst *xdst; xdst = (struct xfrm_dst *)dst; if (xdst->u.rt6.rt6i_idev->dev == dev) { struct inet6_dev *loopback_idev = in6_dev_get(dev_net(dev)->loopback_dev); do { in6_dev_put(xdst->u.rt6.rt6i_idev); xdst->u.rt6.rt6i_idev = loopback_idev; in6_dev_hold(loopback_idev); xdst = (struct xfrm_dst *)xfrm_dst_child(&xdst->u.dst); } while (xdst->u.dst.xfrm); __in6_dev_put(loopback_idev); } xfrm_dst_ifdown(dst, dev); } static struct dst_ops xfrm6_dst_ops_template = { .family = AF_INET6, .update_pmtu = xfrm6_update_pmtu, .redirect = xfrm6_redirect, .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm6_dst_destroy, .ifdown = xfrm6_dst_ifdown, .local_out = __ip6_local_out, .gc_thresh = 32768, }; static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .dst_ops = &xfrm6_dst_ops_template, .dst_lookup = xfrm6_dst_lookup, .get_saddr = xfrm6_get_saddr, .fill_dst = xfrm6_fill_dst, .blackhole_route = ip6_blackhole_route, }; static int __init xfrm6_policy_init(void) { return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo, AF_INET6); } static void xfrm6_policy_fini(void) { xfrm_policy_unregister_afinfo(&xfrm6_policy_afinfo); } #ifdef CONFIG_SYSCTL static struct ctl_table xfrm6_policy_table[] = { { .procname = "xfrm6_gc_thresh", .data = &init_net.xfrm.xfrm6_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static int __net_init xfrm6_net_sysctl_init(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; table = xfrm6_policy_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(xfrm6_policy_table), GFP_KERNEL); if (!table) goto err_alloc; table[0].data = &net->xfrm.xfrm6_dst_ops.gc_thresh; } hdr = register_net_sysctl_sz(net, "net/ipv6", table, ARRAY_SIZE(xfrm6_policy_table)); if (!hdr) goto err_reg; net->ipv6.sysctl.xfrm6_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit xfrm6_net_sysctl_exit(struct net *net) { const struct ctl_table *table; if (!net->ipv6.sysctl.xfrm6_hdr) return; table = net->ipv6.sysctl.xfrm6_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv6.sysctl.xfrm6_hdr); if (!net_eq(net, &init_net)) kfree(table); } #else /* CONFIG_SYSCTL */ static inline int xfrm6_net_sysctl_init(struct net *net) { return 0; } static inline void xfrm6_net_sysctl_exit(struct net *net) { } #endif static int __net_init xfrm6_net_init(struct net *net) { int ret; memcpy(&net->xfrm.xfrm6_dst_ops, &xfrm6_dst_ops_template, sizeof(xfrm6_dst_ops_template)); ret = dst_entries_init(&net->xfrm.xfrm6_dst_ops); if (ret) return ret; ret = xfrm6_net_sysctl_init(net); if (ret) dst_entries_destroy(&net->xfrm.xfrm6_dst_ops); return ret; } static void __net_exit xfrm6_net_exit(struct net *net) { xfrm6_net_sysctl_exit(net); dst_entries_destroy(&net->xfrm.xfrm6_dst_ops); } static struct pernet_operations xfrm6_net_ops = { .init = xfrm6_net_init, .exit = xfrm6_net_exit, }; int __init xfrm6_init(void) { int ret; ret = xfrm6_policy_init(); if (ret) goto out; ret = xfrm6_state_init(); if (ret) goto out_policy; ret = xfrm6_protocol_init(); if (ret) goto out_state; ret = register_pernet_subsys(&xfrm6_net_ops); if (ret) goto out_protocol; ret = xfrm_nat_keepalive_init(AF_INET6); if (ret) goto out_nat_keepalive; out: return ret; out_nat_keepalive: unregister_pernet_subsys(&xfrm6_net_ops); out_protocol: xfrm6_protocol_fini(); out_state: xfrm6_state_fini(); out_policy: xfrm6_policy_fini(); goto out; } void xfrm6_fini(void) { xfrm_nat_keepalive_fini(AF_INET6); unregister_pernet_subsys(&xfrm6_net_ops); xfrm6_protocol_fini(); xfrm6_policy_fini(); xfrm6_state_fini(); }
4052 4059 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 // SPDX-License-Identifier: GPL-2.0 /* * Detect hard and soft lockups on a system * * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. * * Note: Most of this code is borrowed heavily from the original softlockup * detector, so thanks to Ingo for the initial implementation. * Some chunks also taken from the old x86-specific nmi watchdog code, thanks * to those contributors as well. */ #define pr_fmt(fmt) "watchdog: " fmt #include <linux/cpu.h> #include <linux/init.h> #include <linux/irq.h> #include <linux/irqdesc.h> #include <linux/kernel_stat.h> #include <linux/kvm_para.h> #include <linux/math64.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/nmi.h> #include <linux/stop_machine.h> #include <linux/sysctl.h> #include <linux/tick.h> #include <linux/sys_info.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/isolation.h> #include <asm/irq_regs.h> static DEFINE_MUTEX(watchdog_mutex); #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HARDLOCKUP_DETECTOR_SPARC64) # define WATCHDOG_HARDLOCKUP_DEFAULT 1 #else # define WATCHDOG_HARDLOCKUP_DEFAULT 0 #endif #define NUM_SAMPLE_PERIODS 5 unsigned long __read_mostly watchdog_enabled; int __read_mostly watchdog_user_enabled = 1; static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT; static int __read_mostly watchdog_softlockup_user_enabled = 1; int __read_mostly watchdog_thresh = 10; static int __read_mostly watchdog_thresh_next; static int __read_mostly watchdog_hardlockup_available; struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); #ifdef CONFIG_HARDLOCKUP_DETECTOR # ifdef CONFIG_SMP int __read_mostly sysctl_hardlockup_all_cpu_backtrace; # endif /* CONFIG_SMP */ /* * Should we panic when a soft-lockup or hard-lockup occurs: */ unsigned int __read_mostly hardlockup_panic = IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC); /* * bitmasks to control what kinds of system info to be printed when * hard lockup is detected, it could be task, memory, lock etc. * Refer include/linux/sys_info.h for detailed bit definition. */ unsigned long hardlockup_si_mask; #ifdef CONFIG_SYSFS static unsigned int hardlockup_count; static ssize_t hardlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { return sysfs_emit(page, "%u\n", hardlockup_count); } static struct kobj_attribute hardlockup_count_attr = __ATTR_RO(hardlockup_count); static __init int kernel_hardlockup_sysfs_init(void) { sysfs_add_file_to_group(kernel_kobj, &hardlockup_count_attr.attr, NULL); return 0; } late_initcall(kernel_hardlockup_sysfs_init); #endif // CONFIG_SYSFS /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these * cases this function can be called to disable hard lockup detection. This * function should only be executed once by the boot processor before the * kernel command line parameters are parsed, because otherwise it is not * possible to override this in hardlockup_panic_setup(). */ void __init hardlockup_detector_disable(void) { watchdog_hardlockup_user_enabled = 0; } static int __init hardlockup_panic_setup(char *str) { next: if (!strncmp(str, "panic", 5)) hardlockup_panic = 1; else if (!strncmp(str, "nopanic", 7)) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) watchdog_hardlockup_user_enabled = 0; else if (!strncmp(str, "1", 1)) watchdog_hardlockup_user_enabled = 1; else if (!strncmp(str, "r", 1)) hardlockup_config_perf_event(str + 1); while (*(str++)) { if (*str == ',') { str++; goto next; } } return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); #endif /* CONFIG_HARDLOCKUP_DETECTOR */ #if defined(CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER) static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts); static DEFINE_PER_CPU(int, hrtimer_interrupts_saved); static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned); static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched); static unsigned long hard_lockup_nmi_warn; notrace void arch_touch_nmi_watchdog(void) { /* * Using __raw here because some code paths have * preemption enabled. If preemption is enabled * then interrupts should be enabled too, in which * case we shouldn't have to worry about the watchdog * going off. */ raw_cpu_write(watchdog_hardlockup_touched, true); } EXPORT_SYMBOL(arch_touch_nmi_watchdog); void watchdog_hardlockup_touch_cpu(unsigned int cpu) { per_cpu(watchdog_hardlockup_touched, cpu) = true; } static bool is_hardlockup(unsigned int cpu) { int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu)); if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint) return true; /* * NOTE: we don't need any fancy atomic_t or READ_ONCE/WRITE_ONCE * for hrtimer_interrupts_saved. hrtimer_interrupts_saved is * written/read by a single CPU. */ per_cpu(hrtimer_interrupts_saved, cpu) = hrint; return false; } static void watchdog_hardlockup_kick(void) { int new_interrupts; new_interrupts = atomic_inc_return(this_cpu_ptr(&hrtimer_interrupts)); watchdog_buddy_check_hardlockup(new_interrupts); } void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) { int hardlockup_all_cpu_backtrace; if (per_cpu(watchdog_hardlockup_touched, cpu)) { per_cpu(watchdog_hardlockup_touched, cpu) = false; return; } hardlockup_all_cpu_backtrace = (hardlockup_si_mask & SYS_INFO_ALL_BT) ? 1 : sysctl_hardlockup_all_cpu_backtrace; /* * Check for a hardlockup by making sure the CPU's timer * interrupt is incrementing. The timer interrupt should have * fired multiple times before we overflow'd. If it hasn't * then this is a good indication the cpu is stuck */ if (is_hardlockup(cpu)) { unsigned int this_cpu = smp_processor_id(); unsigned long flags; #ifdef CONFIG_SYSFS ++hardlockup_count; #endif /* * A poorly behaving BPF scheduler can trigger hard lockup by * e.g. putting numerous affinitized tasks in a single queue and * directing all CPUs at it. The following call can return true * only once when sched_ext is enabled and will immediately * abort the BPF scheduler and print out a warning message. */ if (scx_hardlockup(cpu)) return; /* Only print hardlockups once. */ if (per_cpu(watchdog_hardlockup_warned, cpu)) return; /* * Prevent multiple hard-lockup reports if one cpu is already * engaged in dumping all cpu back traces. */ if (hardlockup_all_cpu_backtrace) { if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn)) return; } /* * NOTE: we call printk_cpu_sync_get_irqsave() after printing * the lockup message. While it would be nice to serialize * that printout, we really want to make sure that if some * other CPU somehow locked up while holding the lock associated * with printk_cpu_sync_get_irqsave() that we can still at least * get the message about the lockup out. */ pr_emerg("CPU%u: Watchdog detected hard LOCKUP on cpu %u\n", this_cpu, cpu); printk_cpu_sync_get_irqsave(flags); print_modules(); print_irqtrace_events(current); if (cpu == this_cpu) { if (regs) show_regs(regs); else dump_stack(); printk_cpu_sync_put_irqrestore(flags); } else { printk_cpu_sync_put_irqrestore(flags); trigger_single_cpu_backtrace(cpu); } if (hardlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(cpu); if (!hardlockup_panic) clear_bit_unlock(0, &hard_lockup_nmi_warn); } sys_info(hardlockup_si_mask & ~SYS_INFO_ALL_BT); if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); per_cpu(watchdog_hardlockup_warned, cpu) = true; } else { per_cpu(watchdog_hardlockup_warned, cpu) = false; } } #else /* CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */ static inline void watchdog_hardlockup_kick(void) { } #endif /* !CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */ /* * These functions can be overridden based on the configured hardlockdup detector. * * watchdog_hardlockup_enable/disable can be implemented to start and stop when * softlockup watchdog start and stop. The detector must select the * SOFTLOCKUP_DETECTOR Kconfig. */ void __weak watchdog_hardlockup_enable(unsigned int cpu) { } void __weak watchdog_hardlockup_disable(unsigned int cpu) { } /* * Watchdog-detector specific API. * * Return 0 when hardlockup watchdog is available, negative value otherwise. * Note that the negative value means that a delayed probe might * succeed later. */ int __weak __init watchdog_hardlockup_probe(void) { return -ENODEV; } /** * watchdog_hardlockup_stop - Stop the watchdog for reconfiguration * * The reconfiguration steps are: * watchdog_hardlockup_stop(); * update_variables(); * watchdog_hardlockup_start(); */ void __weak watchdog_hardlockup_stop(void) { } /** * watchdog_hardlockup_start - Start the watchdog after reconfiguration * * Counterpart to watchdog_hardlockup_stop(). * * The following variables have been updated in update_variables() and * contain the currently valid configuration: * - watchdog_enabled * - watchdog_thresh * - watchdog_cpumask */ void __weak watchdog_hardlockup_start(void) { } /** * lockup_detector_update_enable - Update the sysctl enable bit * * Caller needs to make sure that the hard watchdogs are off, so this * can't race with watchdog_hardlockup_disable(). */ static void lockup_detector_update_enable(void) { watchdog_enabled = 0; if (!watchdog_user_enabled) return; if (watchdog_hardlockup_available && watchdog_hardlockup_user_enabled) watchdog_enabled |= WATCHDOG_HARDLOCKUP_ENABLED; if (watchdog_softlockup_user_enabled) watchdog_enabled |= WATCHDOG_SOFTOCKUP_ENABLED; } #ifdef CONFIG_SOFTLOCKUP_DETECTOR /* * Delay the soflockup report when running a known slow code. * It does _not_ affect the timestamp of the last successdul reschedule. */ #define SOFTLOCKUP_DELAY_REPORT ULONG_MAX #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; #endif /* * bitmasks to control what kinds of system info to be printed when * soft lockup is detected, it could be task, memory, lock etc. * Refer include/linux/sys_info.h for detailed bit definition. */ static unsigned long softlockup_si_mask; static struct cpumask watchdog_allowed_mask __read_mostly; /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC; static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; #ifdef CONFIG_SYSFS static unsigned int softlockup_count; static ssize_t softlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { return sysfs_emit(page, "%u\n", softlockup_count); } static struct kobj_attribute softlockup_count_attr = __ATTR_RO(softlockup_count); static __init int kernel_softlockup_sysfs_init(void) { sysfs_add_file_to_group(kernel_kobj, &softlockup_count_attr.attr, NULL); return 0; } late_initcall(kernel_softlockup_sysfs_init); #endif // CONFIG_SYSFS /* Timestamp taken after the last successful reschedule. */ static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); /* Timestamp of the last softlockup report. */ static DEFINE_PER_CPU(unsigned long, watchdog_report_ts); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static unsigned long soft_lockup_nmi_warn; static int __init softlockup_panic_setup(char *str) { softlockup_panic = simple_strtoul(str, NULL, 0); return 1; } __setup("softlockup_panic=", softlockup_panic_setup); static int __init nowatchdog_setup(char *str) { watchdog_user_enabled = 0; return 1; } __setup("nowatchdog", nowatchdog_setup); static int __init nosoftlockup_setup(char *str) { watchdog_softlockup_user_enabled = 0; return 1; } __setup("nosoftlockup", nosoftlockup_setup); static int __init watchdog_thresh_setup(char *str) { get_option(&str, &watchdog_thresh); return 1; } __setup("watchdog_thresh=", watchdog_thresh_setup); #ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM enum stats_per_group { STATS_SYSTEM, STATS_SOFTIRQ, STATS_HARDIRQ, STATS_IDLE, NUM_STATS_PER_GROUP, }; static const enum cpu_usage_stat tracked_stats[NUM_STATS_PER_GROUP] = { CPUTIME_SYSTEM, CPUTIME_SOFTIRQ, CPUTIME_IRQ, CPUTIME_IDLE, }; static DEFINE_PER_CPU(u16, cpustat_old[NUM_STATS_PER_GROUP]); static DEFINE_PER_CPU(u8, cpustat_util[NUM_SAMPLE_PERIODS][NUM_STATS_PER_GROUP]); static DEFINE_PER_CPU(u8, cpustat_tail); /* * We don't need nanosecond resolution. A granularity of 16ms is * sufficient for our precision, allowing us to use u16 to store * cpustats, which will roll over roughly every ~1000 seconds. * 2^24 ~= 16 * 10^6 */ static u16 get_16bit_precision(u64 data_ns) { /* * 2^24ns ~= 16.8ms * Round to the nearest multiple of 16.8 milliseconds. */ return (data_ns + (1 << 23)) >> 24LL; } static void update_cpustat(void) { int i; u8 util; u16 old_stat, new_stat; struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; u8 tail = __this_cpu_read(cpustat_tail); u16 sample_period_16 = get_16bit_precision(sample_period); kcpustat_cpu_fetch(&kcpustat, smp_processor_id()); for (i = 0; i < NUM_STATS_PER_GROUP; i++) { old_stat = __this_cpu_read(cpustat_old[i]); new_stat = get_16bit_precision(cpustat[tracked_stats[i]]); util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16); /* * Since we use 16-bit precision, the raw data will undergo * integer division, which may sometimes result in data loss, * and then result might exceed 100%. To avoid confusion, * we enforce a 100% display cap when calculations exceed this threshold. */ if (util > 100) util = 100; __this_cpu_write(cpustat_util[tail][i], util); __this_cpu_write(cpustat_old[i], new_stat); } __this_cpu_write(cpustat_tail, (tail + 1) % NUM_SAMPLE_PERIODS); } static void print_cpustat(void) { int i, group; u8 tail = __this_cpu_read(cpustat_tail); u64 sample_period_msecond = sample_period; do_div(sample_period_msecond, NSEC_PER_MSEC); /* * Outputting the "watchdog" prefix on every line is redundant and not * concise, and the original alarm information is sufficient for * positioning in logs, hence here printk() is used instead of pr_crit(). */ printk(KERN_CRIT "CPU#%d Utilization every %llums during lockup:\n", smp_processor_id(), sample_period_msecond); for (i = 0; i < NUM_SAMPLE_PERIODS; i++) { group = (tail + i) % NUM_SAMPLE_PERIODS; printk(KERN_CRIT "\t#%d: %3u%% system,\t%3u%% softirq,\t" "%3u%% hardirq,\t%3u%% idle\n", i + 1, __this_cpu_read(cpustat_util[group][STATS_SYSTEM]), __this_cpu_read(cpustat_util[group][STATS_SOFTIRQ]), __this_cpu_read(cpustat_util[group][STATS_HARDIRQ]), __this_cpu_read(cpustat_util[group][STATS_IDLE])); } } #define HARDIRQ_PERCENT_THRESH 50 #define NUM_HARDIRQ_REPORT 5 struct irq_counts { int irq; u32 counts; }; static DEFINE_PER_CPU(bool, snapshot_taken); /* Tabulate the most frequent interrupts. */ static void tabulate_irq_count(struct irq_counts *irq_counts, int irq, u32 counts, int rank) { int i; struct irq_counts new_count = {irq, counts}; for (i = 0; i < rank; i++) { if (counts > irq_counts[i].counts) swap(new_count, irq_counts[i]); } } /* * If the hardirq time exceeds HARDIRQ_PERCENT_THRESH% of the sample_period, * then the cause of softlockup might be interrupt storm. In this case, it * would be useful to start interrupt counting. */ static bool need_counting_irqs(void) { u8 util; int tail = __this_cpu_read(cpustat_tail); tail = (tail + NUM_SAMPLE_PERIODS - 1) % NUM_SAMPLE_PERIODS; util = __this_cpu_read(cpustat_util[tail][STATS_HARDIRQ]); return util > HARDIRQ_PERCENT_THRESH; } static void start_counting_irqs(void) { if (!__this_cpu_read(snapshot_taken)) { kstat_snapshot_irqs(); __this_cpu_write(snapshot_taken, true); } } static void stop_counting_irqs(void) { __this_cpu_write(snapshot_taken, false); } static void print_irq_counts(void) { unsigned int i, count; struct irq_counts irq_counts_sorted[NUM_HARDIRQ_REPORT] = { {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0} }; if (__this_cpu_read(snapshot_taken)) { for_each_active_irq(i) { count = kstat_get_irq_since_snapshot(i); tabulate_irq_count(irq_counts_sorted, i, count, NUM_HARDIRQ_REPORT); } /* * Outputting the "watchdog" prefix on every line is redundant and not * concise, and the original alarm information is sufficient for * positioning in logs, hence here printk() is used instead of pr_crit(). */ printk(KERN_CRIT "CPU#%d Detect HardIRQ Time exceeds %d%%. Most frequent HardIRQs:\n", smp_processor_id(), HARDIRQ_PERCENT_THRESH); for (i = 0; i < NUM_HARDIRQ_REPORT; i++) { if (irq_counts_sorted[i].irq == -1) break; printk(KERN_CRIT "\t#%u: %-10u\tirq#%d\n", i + 1, irq_counts_sorted[i].counts, irq_counts_sorted[i].irq); } /* * If the hardirq time is less than HARDIRQ_PERCENT_THRESH% in the last * sample_period, then we suspect the interrupt storm might be subsiding. */ if (!need_counting_irqs()) stop_counting_irqs(); } } static void report_cpu_status(void) { print_cpustat(); print_irq_counts(); } #else static inline void update_cpustat(void) { } static inline void report_cpu_status(void) { } static inline bool need_counting_irqs(void) { return false; } static inline void start_counting_irqs(void) { } static inline void stop_counting_irqs(void) { } #endif /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- * lockups can have false positives under extreme conditions. So we generally * want a higher threshold for soft lockups than for hard lockups. So we couple * the thresholds with a factor: we make the soft threshold twice the amount of * time the hard threshold is. */ static int get_softlockup_thresh(void) { return watchdog_thresh * 2; } /* * Returns seconds, approximately. We don't need nanosecond * resolution, and we don't need to waste time with a big divide when * 2^30ns == 1.074s. */ static unsigned long get_timestamp(void) { return running_clock() >> 30LL; /* 2^30 ~= 10^9 */ } static void set_sample_period(void) { /* * convert watchdog_thresh from seconds to ns * the divide by 5 is to give hrtimer several chances (two * or three with the current relation between the soft * and hard thresholds) to increment before the * hardlockup detector generates a warning */ sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / NUM_SAMPLE_PERIODS); watchdog_update_hrtimer_threshold(sample_period); } static void update_report_ts(void) { __this_cpu_write(watchdog_report_ts, get_timestamp()); } /* Commands for resetting the watchdog */ static void update_touch_ts(void) { __this_cpu_write(watchdog_touch_ts, get_timestamp()); update_report_ts(); } /** * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls * * Call when the scheduler may have stalled for legitimate reasons * preventing the watchdog task from executing - e.g. the scheduler * entering idle state. This should only be used for scheduler events. * Use touch_softlockup_watchdog() for everything else. */ notrace void touch_softlockup_watchdog_sched(void) { /* * Preemption can be enabled. It doesn't matter which CPU's watchdog * report period gets restarted here, so use the raw_ operation. */ raw_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT); } notrace void touch_softlockup_watchdog(void) { touch_softlockup_watchdog_sched(); wq_watchdog_touch(raw_smp_processor_id()); } EXPORT_SYMBOL(touch_softlockup_watchdog); void touch_all_softlockup_watchdogs(void) { int cpu; /* * watchdog_mutex cannpt be taken here, as this might be called * from (soft)interrupt context, so the access to * watchdog_allowed_cpumask might race with a concurrent update. * * The watchdog time stamp can race against a concurrent real * update as well, the only side effect might be a cycle delay for * the softlockup check. */ for_each_cpu(cpu, &watchdog_allowed_mask) { per_cpu(watchdog_report_ts, cpu) = SOFTLOCKUP_DELAY_REPORT; wq_watchdog_touch(cpu); } } void touch_softlockup_watchdog_sync(void) { __this_cpu_write(softlockup_touch_sync, true); __this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT); } static int is_softlockup(unsigned long touch_ts, unsigned long period_ts, unsigned long now) { if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) { /* * If period_ts has not been updated during a sample_period, then * in the subsequent few sample_periods, period_ts might also not * be updated, which could indicate a potential softlockup. In * this case, if we suspect the cause of the potential softlockup * might be interrupt storm, then we need to count the interrupts * to find which interrupt is storming. */ if (time_after_eq(now, period_ts + get_softlockup_thresh() / NUM_SAMPLE_PERIODS) && need_counting_irqs()) start_counting_irqs(); /* * A poorly behaving BPF scheduler can live-lock the system into * soft lockups. Tell sched_ext to try ejecting the BPF * scheduler when close to a soft lockup. */ if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4)) scx_softlockup(now - touch_ts); /* Warn about unreasonable delays. */ if (time_after(now, period_ts + get_softlockup_thresh())) return now - touch_ts; } return 0; } /* watchdog detector functions */ static DEFINE_PER_CPU(struct completion, softlockup_completion); static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); /* * The watchdog feed function - touches the timestamp. * * It only runs once every sample_period seconds (4 seconds by * default) to reset the softlockup timestamp. If this gets delayed * for more than 2*watchdog_thresh seconds then the debug-printout * triggers in watchdog_timer_fn(). */ static int softlockup_fn(void *data) { update_touch_ts(); stop_counting_irqs(); complete(this_cpu_ptr(&softlockup_completion)); return 0; } /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { unsigned long touch_ts, period_ts, now; struct pt_regs *regs = get_irq_regs(); int softlockup_all_cpu_backtrace; int duration, thresh_count; unsigned long flags; if (!watchdog_enabled) return HRTIMER_NORESTART; /* * pass the buddy check if a panic is in process */ if (panic_in_progress()) return HRTIMER_NORESTART; softlockup_all_cpu_backtrace = (softlockup_si_mask & SYS_INFO_ALL_BT) ? 1 : sysctl_softlockup_all_cpu_backtrace; watchdog_hardlockup_kick(); /* kick the softlockup detector */ if (completion_done(this_cpu_ptr(&softlockup_completion))) { reinit_completion(this_cpu_ptr(&softlockup_completion)); stop_one_cpu_nowait(smp_processor_id(), softlockup_fn, NULL, this_cpu_ptr(&softlockup_stop_work)); } /* .. and repeat */ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); /* * Read the current timestamp first. It might become invalid anytime * when a virtual machine is stopped by the host or when the watchog * is touched from NMI. */ now = get_timestamp(); /* * If a virtual machine is stopped by the host it can look to * the watchdog like a soft lockup. This function touches the watchdog. */ kvm_check_and_clear_guest_paused(); /* * The stored timestamp is comparable with @now only when not touched. * It might get touched anytime from NMI. Make sure that is_softlockup() * uses the same (valid) value. */ period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts)); update_cpustat(); /* Reset the interval when touched by known problematic code. */ if (period_ts == SOFTLOCKUP_DELAY_REPORT) { if (unlikely(__this_cpu_read(softlockup_touch_sync))) { /* * If the time stamp was touched atomically * make sure the scheduler tick is up to date. */ __this_cpu_write(softlockup_touch_sync, false); sched_clock_tick(); } update_report_ts(); return HRTIMER_RESTART; } /* Check for a softlockup. */ touch_ts = __this_cpu_read(watchdog_touch_ts); duration = is_softlockup(touch_ts, period_ts, now); if (unlikely(duration)) { #ifdef CONFIG_SYSFS ++softlockup_count; #endif /* * Prevent multiple soft-lockup reports if one cpu is already * engaged in dumping all cpu back traces. */ if (softlockup_all_cpu_backtrace) { if (test_and_set_bit_lock(0, &soft_lockup_nmi_warn)) return HRTIMER_RESTART; } /* Start period for the next softlockup warning. */ update_report_ts(); printk_cpu_sync_get_irqsave(flags); pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); report_cpu_status(); print_modules(); print_irqtrace_events(current); if (regs) show_regs(regs); else dump_stack(); printk_cpu_sync_put_irqrestore(flags); if (softlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(smp_processor_id()); if (!softlockup_panic) clear_bit_unlock(0, &soft_lockup_nmi_warn); } add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT); thresh_count = duration / get_softlockup_thresh(); if (softlockup_panic && thresh_count >= softlockup_panic) panic("softlockup: hung tasks"); } return HRTIMER_RESTART; } static void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); struct completion *done = this_cpu_ptr(&softlockup_completion); WARN_ON_ONCE(cpu != smp_processor_id()); init_completion(done); complete(done); /* * Start the timer first to prevent the hardlockup watchdog triggering * before the timer has a chance to fire. */ hrtimer_setup(hrtimer, watchdog_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED_HARD); /* Initialize timestamp */ update_touch_ts(); /* Enable the hardlockup detector */ if (watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED) watchdog_hardlockup_enable(cpu); } static void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); WARN_ON_ONCE(cpu != smp_processor_id()); /* * Disable the hardlockup detector first. That prevents that a large * delay between disabling the timer and disabling the hardlockup * detector causes a false positive. */ watchdog_hardlockup_disable(cpu); hrtimer_cancel(hrtimer); wait_for_completion(this_cpu_ptr(&softlockup_completion)); } static int softlockup_stop_fn(void *data) { watchdog_disable(smp_processor_id()); return 0; } static void softlockup_stop_all(void) { int cpu; if (!softlockup_initialized) return; for_each_cpu(cpu, &watchdog_allowed_mask) smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false); cpumask_clear(&watchdog_allowed_mask); } static int softlockup_start_fn(void *data) { watchdog_enable(smp_processor_id()); return 0; } static void softlockup_start_all(void) { int cpu; cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); for_each_cpu(cpu, &watchdog_allowed_mask) smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false); } int lockup_detector_online_cpu(unsigned int cpu) { if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) watchdog_enable(cpu); return 0; } int lockup_detector_offline_cpu(unsigned int cpu) { if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) watchdog_disable(cpu); return 0; } static void __lockup_detector_reconfigure(bool thresh_changed) { cpus_read_lock(); watchdog_hardlockup_stop(); softlockup_stop_all(); /* * To prevent watchdog_timer_fn from using the old interval and * the new watchdog_thresh at the same time, which could lead to * false softlockup reports, it is necessary to update the * watchdog_thresh after the softlockup is completed. */ if (thresh_changed) watchdog_thresh = READ_ONCE(watchdog_thresh_next); set_sample_period(); lockup_detector_update_enable(); if (watchdog_enabled && watchdog_thresh) softlockup_start_all(); watchdog_hardlockup_start(); cpus_read_unlock(); } void lockup_detector_reconfigure(void) { mutex_lock(&watchdog_mutex); __lockup_detector_reconfigure(false); mutex_unlock(&watchdog_mutex); } /* * Create the watchdog infrastructure and configure the detector(s). */ static __init void lockup_detector_setup(void) { /* * If sysctl is off and watchdog got disabled on the command line, * nothing to do here. */ lockup_detector_update_enable(); if (!IS_ENABLED(CONFIG_SYSCTL) && !(watchdog_enabled && watchdog_thresh)) return; mutex_lock(&watchdog_mutex); __lockup_detector_reconfigure(false); softlockup_initialized = true; mutex_unlock(&watchdog_mutex); } #else /* CONFIG_SOFTLOCKUP_DETECTOR */ static void __lockup_detector_reconfigure(bool thresh_changed) { cpus_read_lock(); watchdog_hardlockup_stop(); if (thresh_changed) watchdog_thresh = READ_ONCE(watchdog_thresh_next); lockup_detector_update_enable(); watchdog_hardlockup_start(); cpus_read_unlock(); } void lockup_detector_reconfigure(void) { __lockup_detector_reconfigure(false); } static inline void lockup_detector_setup(void) { __lockup_detector_reconfigure(false); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ /** * lockup_detector_soft_poweroff - Interface to stop lockup detector(s) * * Special interface for parisc. It prevents lockup detector warnings from * the default pm_poweroff() function which busy loops forever. */ void lockup_detector_soft_poweroff(void) { watchdog_enabled = 0; } #ifdef CONFIG_SYSCTL /* Propagate any changes to the watchdog infrastructure */ static void proc_watchdog_update(bool thresh_changed) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); __lockup_detector_reconfigure(thresh_changed); } /* * common function for watchdog, nmi_watchdog and soft_watchdog parameter * * caller | table->data points to | 'which' * -------------------|----------------------------------|------------------------------- * proc_watchdog | watchdog_user_enabled | WATCHDOG_HARDLOCKUP_ENABLED | * | | WATCHDOG_SOFTOCKUP_ENABLED * -------------------|----------------------------------|------------------------------- * proc_nmi_watchdog | watchdog_hardlockup_user_enabled | WATCHDOG_HARDLOCKUP_ENABLED * -------------------|----------------------------------|------------------------------- * proc_soft_watchdog | watchdog_softlockup_user_enabled | WATCHDOG_SOFTOCKUP_ENABLED */ static int proc_watchdog_common(int which, const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int err, old, *param = table->data; mutex_lock(&watchdog_mutex); old = *param; if (!write) { /* * On read synchronize the userspace interface. This is a * racy snapshot. */ *param = (watchdog_enabled & which) != 0; err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); *param = old; } else { err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!err && old != READ_ONCE(*param)) proc_watchdog_update(false); } mutex_unlock(&watchdog_mutex); return err; } /* * /proc/sys/kernel/watchdog */ static int proc_watchdog(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED | WATCHDOG_SOFTOCKUP_ENABLED, table, write, buffer, lenp, ppos); } /* * /proc/sys/kernel/nmi_watchdog */ static int proc_nmi_watchdog(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!watchdog_hardlockup_available && write) return -ENOTSUPP; return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED, table, write, buffer, lenp, ppos); } #ifdef CONFIG_SOFTLOCKUP_DETECTOR /* * /proc/sys/kernel/soft_watchdog */ static int proc_soft_watchdog(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(WATCHDOG_SOFTOCKUP_ENABLED, table, write, buffer, lenp, ppos); } #endif /* * /proc/sys/kernel/watchdog_thresh */ static int proc_watchdog_thresh(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int err, old; mutex_lock(&watchdog_mutex); watchdog_thresh_next = READ_ONCE(watchdog_thresh); old = watchdog_thresh_next; err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!err && write && old != READ_ONCE(watchdog_thresh_next)) proc_watchdog_update(true); mutex_unlock(&watchdog_mutex); return err; } /* * The cpumask is the mask of possible cpus that the watchdog can run * on, not the mask of cpus it is actually running on. This allows the * user to specify a mask that will include cpus that have not yet * been brought online, if desired. */ static int proc_watchdog_cpumask(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int err; mutex_lock(&watchdog_mutex); err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) proc_watchdog_update(false); mutex_unlock(&watchdog_mutex); return err; } static const int sixty = 60; static const struct ctl_table watchdog_sysctls[] = { { .procname = "watchdog", .data = &watchdog_user_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_watchdog, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "watchdog_thresh", .data = &watchdog_thresh_next, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_watchdog_thresh, .extra1 = SYSCTL_ZERO, .extra2 = (void *)&sixty, }, { .procname = "watchdog_cpumask", .data = &watchdog_cpumask_bits, .maxlen = NR_CPUS, .mode = 0644, .proc_handler = proc_watchdog_cpumask, }, #ifdef CONFIG_SOFTLOCKUP_DETECTOR { .procname = "soft_watchdog", .data = &watchdog_softlockup_user_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_soft_watchdog, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "softlockup_panic", .data = &softlockup_panic, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, { .procname = "softlockup_sys_info", .data = &softlockup_si_mask, .maxlen = sizeof(softlockup_si_mask), .mode = 0644, .proc_handler = sysctl_sys_info_handler, }, #ifdef CONFIG_SMP { .procname = "softlockup_all_cpu_backtrace", .data = &sysctl_softlockup_all_cpu_backtrace, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SMP */ #endif #ifdef CONFIG_HARDLOCKUP_DETECTOR { .procname = "hardlockup_panic", .data = &hardlockup_panic, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "hardlockup_sys_info", .data = &hardlockup_si_mask, .maxlen = sizeof(hardlockup_si_mask), .mode = 0644, .proc_handler = sysctl_sys_info_handler, }, #ifdef CONFIG_SMP { .procname = "hardlockup_all_cpu_backtrace", .data = &sysctl_hardlockup_all_cpu_backtrace, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SMP */ #endif { .procname = "nmi_watchdog", .data = &watchdog_hardlockup_user_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_nmi_watchdog, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; static void __init watchdog_sysctl_init(void) { register_sysctl_init("kernel", watchdog_sysctls); } #else #define watchdog_sysctl_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static void __init lockup_detector_delay_init(struct work_struct *work); static bool allow_lockup_detector_init_retry __initdata; static struct work_struct detector_work __initdata = __WORK_INITIALIZER(detector_work, lockup_detector_delay_init); static void __init lockup_detector_delay_init(struct work_struct *work) { int ret; ret = watchdog_hardlockup_probe(); if (ret) { if (ret == -ENODEV) pr_info("NMI not fully supported\n"); else pr_info("Delayed init of the lockup detector failed: %d\n", ret); pr_info("Hard watchdog permanently disabled\n"); return; } allow_lockup_detector_init_retry = false; watchdog_hardlockup_available = true; lockup_detector_setup(); } /* * lockup_detector_retry_init - retry init lockup detector if possible. * * Retry hardlockup detector init. It is useful when it requires some * functionality that has to be initialized later on a particular * platform. */ void __init lockup_detector_retry_init(void) { /* Must be called before late init calls */ if (!allow_lockup_detector_init_retry) return; schedule_work(&detector_work); } /* * Ensure that optional delayed hardlockup init is proceed before * the init code and memory is freed. */ static int __init lockup_detector_check(void) { /* Prevent any later retry. */ allow_lockup_detector_init_retry = false; /* Make sure no work is pending. */ flush_work(&detector_work); watchdog_sysctl_init(); return 0; } late_initcall_sync(lockup_detector_check); void __init lockup_detector_init(void) { if (tick_nohz_full_enabled()) pr_info("Disabling watchdog on nohz_full cores by default\n"); cpumask_copy(&watchdog_cpumask, housekeeping_cpumask(HK_TYPE_TIMER)); if (!watchdog_hardlockup_probe()) watchdog_hardlockup_available = true; else allow_lockup_detector_init_retry = true; lockup_detector_setup(); }
9 9 1 2 2 4 2 2 1141 1139 161 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2011, Intel Corporation. * * Description: Data Center Bridging netlink interface * Author: Lucy Liu <lucy.liu@intel.com> */ #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/slab.h> #include <net/netlink.h> #include <net/rtnetlink.h> #include <linux/dcbnl.h> #include <net/dcbevent.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <net/sock.h> /* Data Center Bridging (DCB) is a collection of Ethernet enhancements * intended to allow network traffic with differing requirements * (highly reliable, no drops vs. best effort vs. low latency) to operate * and co-exist on Ethernet. Current DCB features are: * * Enhanced Transmission Selection (aka Priority Grouping [PG]) - provides a * framework for assigning bandwidth guarantees to traffic classes. * * Priority-based Flow Control (PFC) - provides a flow control mechanism which * can work independently for each 802.1p priority. * * Congestion Notification - provides a mechanism for end-to-end congestion * control for protocols which do not have built-in congestion management. * * More information about the emerging standards for these Ethernet features * can be found at: http://www.ieee802.org/1/pages/dcbridges.html * * This file implements an rtnetlink interface to allow configuration of DCB * features for capable devices. */ /**************** DCB attribute policies *************************************/ /* DCB netlink attributes policy */ static const struct nla_policy dcbnl_rtnl_policy[DCB_ATTR_MAX + 1] = { [DCB_ATTR_IFNAME] = {.type = NLA_NUL_STRING, .len = IFNAMSIZ - 1}, [DCB_ATTR_STATE] = {.type = NLA_U8}, [DCB_ATTR_PFC_CFG] = {.type = NLA_NESTED}, [DCB_ATTR_PG_CFG] = {.type = NLA_NESTED}, [DCB_ATTR_SET_ALL] = {.type = NLA_U8}, [DCB_ATTR_PERM_HWADDR] = {.type = NLA_FLAG}, [DCB_ATTR_CAP] = {.type = NLA_NESTED}, [DCB_ATTR_PFC_STATE] = {.type = NLA_U8}, [DCB_ATTR_BCN] = {.type = NLA_NESTED}, [DCB_ATTR_APP] = {.type = NLA_NESTED}, [DCB_ATTR_IEEE] = {.type = NLA_NESTED}, [DCB_ATTR_DCBX] = {.type = NLA_U8}, [DCB_ATTR_FEATCFG] = {.type = NLA_NESTED}, }; /* DCB priority flow control to User Priority nested attributes */ static const struct nla_policy dcbnl_pfc_up_nest[DCB_PFC_UP_ATTR_MAX + 1] = { [DCB_PFC_UP_ATTR_0] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_1] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_2] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_3] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_4] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_5] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_6] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_7] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_ALL] = {.type = NLA_FLAG}, }; /* DCB priority grouping nested attributes */ static const struct nla_policy dcbnl_pg_nest[DCB_PG_ATTR_MAX + 1] = { [DCB_PG_ATTR_TC_0] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_1] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_2] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_3] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_4] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_5] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_6] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_7] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_ALL] = {.type = NLA_NESTED}, [DCB_PG_ATTR_BW_ID_0] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_1] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_2] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_3] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_4] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_5] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_6] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_7] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_ALL] = {.type = NLA_FLAG}, }; /* DCB traffic class nested attributes. */ static const struct nla_policy dcbnl_tc_param_nest[DCB_TC_ATTR_PARAM_MAX + 1] = { [DCB_TC_ATTR_PARAM_PGID] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_UP_MAPPING] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_STRICT_PRIO] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_BW_PCT] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_ALL] = {.type = NLA_FLAG}, }; /* DCB capabilities nested attributes. */ static const struct nla_policy dcbnl_cap_nest[DCB_CAP_ATTR_MAX + 1] = { [DCB_CAP_ATTR_ALL] = {.type = NLA_FLAG}, [DCB_CAP_ATTR_PG] = {.type = NLA_U8}, [DCB_CAP_ATTR_PFC] = {.type = NLA_U8}, [DCB_CAP_ATTR_UP2TC] = {.type = NLA_U8}, [DCB_CAP_ATTR_PG_TCS] = {.type = NLA_U8}, [DCB_CAP_ATTR_PFC_TCS] = {.type = NLA_U8}, [DCB_CAP_ATTR_GSP] = {.type = NLA_U8}, [DCB_CAP_ATTR_BCN] = {.type = NLA_U8}, [DCB_CAP_ATTR_DCBX] = {.type = NLA_U8}, }; /* DCB capabilities nested attributes. */ static const struct nla_policy dcbnl_numtcs_nest[DCB_NUMTCS_ATTR_MAX + 1] = { [DCB_NUMTCS_ATTR_ALL] = {.type = NLA_FLAG}, [DCB_NUMTCS_ATTR_PG] = {.type = NLA_U8}, [DCB_NUMTCS_ATTR_PFC] = {.type = NLA_U8}, }; /* DCB BCN nested attributes. */ static const struct nla_policy dcbnl_bcn_nest[DCB_BCN_ATTR_MAX + 1] = { [DCB_BCN_ATTR_RP_0] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_1] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_2] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_3] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_4] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_5] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_6] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_7] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_ALL] = {.type = NLA_FLAG}, [DCB_BCN_ATTR_BCNA_0] = {.type = NLA_U32}, [DCB_BCN_ATTR_BCNA_1] = {.type = NLA_U32}, [DCB_BCN_ATTR_ALPHA] = {.type = NLA_U32}, [DCB_BCN_ATTR_BETA] = {.type = NLA_U32}, [DCB_BCN_ATTR_GD] = {.type = NLA_U32}, [DCB_BCN_ATTR_GI] = {.type = NLA_U32}, [DCB_BCN_ATTR_TMAX] = {.type = NLA_U32}, [DCB_BCN_ATTR_TD] = {.type = NLA_U32}, [DCB_BCN_ATTR_RMIN] = {.type = NLA_U32}, [DCB_BCN_ATTR_W] = {.type = NLA_U32}, [DCB_BCN_ATTR_RD] = {.type = NLA_U32}, [DCB_BCN_ATTR_RU] = {.type = NLA_U32}, [DCB_BCN_ATTR_WRTT] = {.type = NLA_U32}, [DCB_BCN_ATTR_RI] = {.type = NLA_U32}, [DCB_BCN_ATTR_C] = {.type = NLA_U32}, [DCB_BCN_ATTR_ALL] = {.type = NLA_FLAG}, }; /* DCB APP nested attributes. */ static const struct nla_policy dcbnl_app_nest[DCB_APP_ATTR_MAX + 1] = { [DCB_APP_ATTR_IDTYPE] = {.type = NLA_U8}, [DCB_APP_ATTR_ID] = {.type = NLA_U16}, [DCB_APP_ATTR_PRIORITY] = {.type = NLA_U8}, }; /* IEEE 802.1Qaz nested attributes. */ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = { [DCB_ATTR_IEEE_ETS] = {.len = sizeof(struct ieee_ets)}, [DCB_ATTR_IEEE_PFC] = {.len = sizeof(struct ieee_pfc)}, [DCB_ATTR_IEEE_APP_TABLE] = {.type = NLA_NESTED}, [DCB_ATTR_IEEE_MAXRATE] = {.len = sizeof(struct ieee_maxrate)}, [DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)}, [DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)}, [DCB_ATTR_DCB_BUFFER] = {.len = sizeof(struct dcbnl_buffer)}, [DCB_ATTR_DCB_APP_TRUST_TABLE] = {.type = NLA_NESTED}, }; /* DCB number of traffic classes nested attributes. */ static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = { [DCB_FEATCFG_ATTR_ALL] = {.type = NLA_FLAG}, [DCB_FEATCFG_ATTR_PG] = {.type = NLA_U8}, [DCB_FEATCFG_ATTR_PFC] = {.type = NLA_U8}, [DCB_FEATCFG_ATTR_APP] = {.type = NLA_U8}, }; static LIST_HEAD(dcb_app_list); static LIST_HEAD(dcb_rewr_list); static DEFINE_SPINLOCK(dcb_lock); static enum ieee_attrs_app dcbnl_app_attr_type_get(u8 selector) { switch (selector) { case IEEE_8021QAZ_APP_SEL_ETHERTYPE: case IEEE_8021QAZ_APP_SEL_STREAM: case IEEE_8021QAZ_APP_SEL_DGRAM: case IEEE_8021QAZ_APP_SEL_ANY: case IEEE_8021QAZ_APP_SEL_DSCP: return DCB_ATTR_IEEE_APP; case DCB_APP_SEL_PCP: return DCB_ATTR_DCB_APP; default: return DCB_ATTR_IEEE_APP_UNSPEC; } } static bool dcbnl_app_attr_type_validate(enum ieee_attrs_app type) { switch (type) { case DCB_ATTR_IEEE_APP: case DCB_ATTR_DCB_APP: return true; default: return false; } } static bool dcbnl_app_selector_validate(enum ieee_attrs_app type, u8 selector) { return dcbnl_app_attr_type_get(selector) == type; } static struct sk_buff *dcbnl_newmsg(int type, u8 cmd, u32 port, u32 seq, u32 flags, struct nlmsghdr **nlhp) { struct sk_buff *skb; struct dcbmsg *dcb; struct nlmsghdr *nlh; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return NULL; nlh = nlmsg_put(skb, port, seq, type, sizeof(*dcb), flags); BUG_ON(!nlh); dcb = nlmsg_data(nlh); dcb->dcb_family = AF_UNSPEC; dcb->cmd = cmd; dcb->dcb_pad = 0; if (nlhp) *nlhp = nlh; return skb; } static int dcbnl_getstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { /* if (!tb[DCB_ATTR_STATE] || !netdev->dcbnl_ops->getstate) */ if (!netdev->dcbnl_ops->getstate) return -EOPNOTSUPP; return nla_put_u8(skb, DCB_ATTR_STATE, netdev->dcbnl_ops->getstate(netdev)); } static int dcbnl_getpfccfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1], *nest; u8 value; int ret; int i; int getall = 0; if (!tb[DCB_ATTR_PFC_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->getpfccfg) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX, tb[DCB_ATTR_PFC_CFG], dcbnl_pfc_up_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_PFC_CFG); if (!nest) return -EMSGSIZE; if (data[DCB_PFC_UP_ATTR_ALL]) getall = 1; for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { if (!getall && !data[i]) continue; netdev->dcbnl_ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0, &value); ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); return ret; } } nla_nest_end(skb, nest); return 0; } static int dcbnl_getperm_hwaddr(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 perm_addr[MAX_ADDR_LEN]; if (!netdev->dcbnl_ops->getpermhwaddr) return -EOPNOTSUPP; memset(perm_addr, 0, sizeof(perm_addr)); netdev->dcbnl_ops->getpermhwaddr(netdev, perm_addr); return nla_put(skb, DCB_ATTR_PERM_HWADDR, sizeof(perm_addr), perm_addr); } static int dcbnl_getcap(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_CAP_ATTR_MAX + 1], *nest; u8 value; int ret; int i; int getall = 0; if (!tb[DCB_ATTR_CAP]) return -EINVAL; if (!netdev->dcbnl_ops->getcap) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_CAP_ATTR_MAX, tb[DCB_ATTR_CAP], dcbnl_cap_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_CAP); if (!nest) return -EMSGSIZE; if (data[DCB_CAP_ATTR_ALL]) getall = 1; for (i = DCB_CAP_ATTR_ALL+1; i <= DCB_CAP_ATTR_MAX; i++) { if (!getall && !data[i]) continue; if (!netdev->dcbnl_ops->getcap(netdev, i, &value)) { ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); return ret; } } } nla_nest_end(skb, nest); return 0; } static int dcbnl_getnumtcs(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1], *nest; u8 value; int ret; int i; int getall = 0; if (!tb[DCB_ATTR_NUMTCS]) return -EINVAL; if (!netdev->dcbnl_ops->getnumtcs) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS], dcbnl_numtcs_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_NUMTCS); if (!nest) return -EMSGSIZE; if (data[DCB_NUMTCS_ATTR_ALL]) getall = 1; for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) { if (!getall && !data[i]) continue; ret = netdev->dcbnl_ops->getnumtcs(netdev, i, &value); if (!ret) { ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); return ret; } } else return -EINVAL; } nla_nest_end(skb, nest); return 0; } static int dcbnl_setnumtcs(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1]; int ret; u8 value; int i; if (!tb[DCB_ATTR_NUMTCS]) return -EINVAL; if (!netdev->dcbnl_ops->setnumtcs) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS], dcbnl_numtcs_nest, NULL); if (ret) return ret; for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) { if (data[i] == NULL) continue; value = nla_get_u8(data[i]); ret = netdev->dcbnl_ops->setnumtcs(netdev, i, value); if (ret) break; } return nla_put_u8(skb, DCB_ATTR_NUMTCS, !!ret); } static int dcbnl_getpfcstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { if (!netdev->dcbnl_ops->getpfcstate) return -EOPNOTSUPP; return nla_put_u8(skb, DCB_ATTR_PFC_STATE, netdev->dcbnl_ops->getpfcstate(netdev)); } static int dcbnl_setpfcstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 value; if (!tb[DCB_ATTR_PFC_STATE]) return -EINVAL; if (!netdev->dcbnl_ops->setpfcstate) return -EOPNOTSUPP; value = nla_get_u8(tb[DCB_ATTR_PFC_STATE]); netdev->dcbnl_ops->setpfcstate(netdev, value); return nla_put_u8(skb, DCB_ATTR_PFC_STATE, 0); } static int dcbnl_getapp(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *app_nest; struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1]; u16 id; u8 up, idtype; int ret; if (!tb[DCB_ATTR_APP]) return -EINVAL; ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP], dcbnl_app_nest, NULL); if (ret) return ret; /* all must be non-null */ if ((!app_tb[DCB_APP_ATTR_IDTYPE]) || (!app_tb[DCB_APP_ATTR_ID])) return -EINVAL; /* either by eth type or by socket number */ idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]); if ((idtype != DCB_APP_IDTYPE_ETHTYPE) && (idtype != DCB_APP_IDTYPE_PORTNUM)) return -EINVAL; id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]); if (netdev->dcbnl_ops->getapp) { ret = netdev->dcbnl_ops->getapp(netdev, idtype, id); if (ret < 0) return ret; else up = ret; } else { struct dcb_app app = { .selector = idtype, .protocol = id, }; up = dcb_getapp(netdev, &app); } app_nest = nla_nest_start_noflag(skb, DCB_ATTR_APP); if (!app_nest) return -EMSGSIZE; ret = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE, idtype); if (ret) goto out_cancel; ret = nla_put_u16(skb, DCB_APP_ATTR_ID, id); if (ret) goto out_cancel; ret = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY, up); if (ret) goto out_cancel; nla_nest_end(skb, app_nest); return 0; out_cancel: nla_nest_cancel(skb, app_nest); return ret; } static int dcbnl_setapp(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { int ret; u16 id; u8 up, idtype; struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1]; if (!tb[DCB_ATTR_APP]) return -EINVAL; ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP], dcbnl_app_nest, NULL); if (ret) return ret; /* all must be non-null */ if ((!app_tb[DCB_APP_ATTR_IDTYPE]) || (!app_tb[DCB_APP_ATTR_ID]) || (!app_tb[DCB_APP_ATTR_PRIORITY])) return -EINVAL; /* either by eth type or by socket number */ idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]); if ((idtype != DCB_APP_IDTYPE_ETHTYPE) && (idtype != DCB_APP_IDTYPE_PORTNUM)) return -EINVAL; id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]); up = nla_get_u8(app_tb[DCB_APP_ATTR_PRIORITY]); if (netdev->dcbnl_ops->setapp) { ret = netdev->dcbnl_ops->setapp(netdev, idtype, id, up); if (ret < 0) return ret; } else { struct dcb_app app; app.selector = idtype; app.protocol = id; app.priority = up; ret = dcb_setapp(netdev, &app); } ret = nla_put_u8(skb, DCB_ATTR_APP, ret); dcbnl_cee_notify(netdev, RTM_SETDCB, DCB_CMD_SAPP, seq, 0); return ret; } static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, struct nlattr **tb, struct sk_buff *skb, int dir) { struct nlattr *pg_nest, *param_nest, *data; struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1]; struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1]; u8 prio, pgid, tc_pct, up_map; int ret; int getall = 0; int i; if (!tb[DCB_ATTR_PG_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->getpgtccfgtx || !netdev->dcbnl_ops->getpgtccfgrx || !netdev->dcbnl_ops->getpgbwgcfgtx || !netdev->dcbnl_ops->getpgbwgcfgrx) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX, tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest, NULL); if (ret) return ret; pg_nest = nla_nest_start_noflag(skb, DCB_ATTR_PG_CFG); if (!pg_nest) return -EMSGSIZE; if (pg_tb[DCB_PG_ATTR_TC_ALL]) getall = 1; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { if (!getall && !pg_tb[i]) continue; if (pg_tb[DCB_PG_ATTR_TC_ALL]) data = pg_tb[DCB_PG_ATTR_TC_ALL]; else data = pg_tb[i]; ret = nla_parse_nested_deprecated(param_tb, DCB_TC_ATTR_PARAM_MAX, data, dcbnl_tc_param_nest, NULL); if (ret) goto err_pg; param_nest = nla_nest_start_noflag(skb, i); if (!param_nest) goto err_pg; pgid = DCB_ATTR_VALUE_UNDEFINED; prio = DCB_ATTR_VALUE_UNDEFINED; tc_pct = DCB_ATTR_VALUE_UNDEFINED; up_map = DCB_ATTR_VALUE_UNDEFINED; if (dir) { /* Rx */ netdev->dcbnl_ops->getpgtccfgrx(netdev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); } else { /* Tx */ netdev->dcbnl_ops->getpgtccfgtx(netdev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); } if (param_tb[DCB_TC_ATTR_PARAM_PGID] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_PGID, pgid); if (ret) goto err_param; } if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_UP_MAPPING, up_map); if (ret) goto err_param; } if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_STRICT_PRIO, prio); if (ret) goto err_param; } if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_BW_PCT, tc_pct); if (ret) goto err_param; } nla_nest_end(skb, param_nest); } if (pg_tb[DCB_PG_ATTR_BW_ID_ALL]) getall = 1; else getall = 0; for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { if (!getall && !pg_tb[i]) continue; tc_pct = DCB_ATTR_VALUE_UNDEFINED; if (dir) { /* Rx */ netdev->dcbnl_ops->getpgbwgcfgrx(netdev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); } else { /* Tx */ netdev->dcbnl_ops->getpgbwgcfgtx(netdev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); } ret = nla_put_u8(skb, i, tc_pct); if (ret) goto err_pg; } nla_nest_end(skb, pg_nest); return 0; err_param: nla_nest_cancel(skb, param_nest); err_pg: nla_nest_cancel(skb, pg_nest); return -EMSGSIZE; } static int dcbnl_pgtx_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_getcfg(netdev, nlh, tb, skb, 0); } static int dcbnl_pgrx_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_getcfg(netdev, nlh, tb, skb, 1); } static int dcbnl_setstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 value; if (!tb[DCB_ATTR_STATE]) return -EINVAL; if (!netdev->dcbnl_ops->setstate) return -EOPNOTSUPP; value = nla_get_u8(tb[DCB_ATTR_STATE]); return nla_put_u8(skb, DCB_ATTR_STATE, netdev->dcbnl_ops->setstate(netdev, value)); } static int dcbnl_setpfccfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1]; int i; int ret; u8 value; if (!tb[DCB_ATTR_PFC_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->setpfccfg) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX, tb[DCB_ATTR_PFC_CFG], dcbnl_pfc_up_nest, NULL); if (ret) return ret; for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { if (data[i] == NULL) continue; value = nla_get_u8(data[i]); netdev->dcbnl_ops->setpfccfg(netdev, data[i]->nla_type - DCB_PFC_UP_ATTR_0, value); } return nla_put_u8(skb, DCB_ATTR_PFC_CFG, 0); } static int dcbnl_setall(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { int ret; if (!tb[DCB_ATTR_SET_ALL]) return -EINVAL; if (!netdev->dcbnl_ops->setall) return -EOPNOTSUPP; ret = nla_put_u8(skb, DCB_ATTR_SET_ALL, netdev->dcbnl_ops->setall(netdev)); dcbnl_cee_notify(netdev, RTM_SETDCB, DCB_CMD_SET_ALL, seq, 0); return ret; } static int __dcbnl_pg_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb, int dir) { struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1]; struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1]; int ret; int i; u8 pgid; u8 up_map; u8 prio; u8 tc_pct; if (!tb[DCB_ATTR_PG_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->setpgtccfgtx || !netdev->dcbnl_ops->setpgtccfgrx || !netdev->dcbnl_ops->setpgbwgcfgtx || !netdev->dcbnl_ops->setpgbwgcfgrx) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX, tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest, NULL); if (ret) return ret; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { if (!pg_tb[i]) continue; ret = nla_parse_nested_deprecated(param_tb, DCB_TC_ATTR_PARAM_MAX, pg_tb[i], dcbnl_tc_param_nest, NULL); if (ret) return ret; pgid = DCB_ATTR_VALUE_UNDEFINED; prio = DCB_ATTR_VALUE_UNDEFINED; tc_pct = DCB_ATTR_VALUE_UNDEFINED; up_map = DCB_ATTR_VALUE_UNDEFINED; if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO]) prio = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO]); if (param_tb[DCB_TC_ATTR_PARAM_PGID]) pgid = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_PGID]); if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT]) tc_pct = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_BW_PCT]); if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING]) up_map = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING]); /* dir: Tx = 0, Rx = 1 */ if (dir) { /* Rx */ netdev->dcbnl_ops->setpgtccfgrx(netdev, i - DCB_PG_ATTR_TC_0, prio, pgid, tc_pct, up_map); } else { /* Tx */ netdev->dcbnl_ops->setpgtccfgtx(netdev, i - DCB_PG_ATTR_TC_0, prio, pgid, tc_pct, up_map); } } for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { if (!pg_tb[i]) continue; tc_pct = nla_get_u8(pg_tb[i]); /* dir: Tx = 0, Rx = 1 */ if (dir) { /* Rx */ netdev->dcbnl_ops->setpgbwgcfgrx(netdev, i - DCB_PG_ATTR_BW_ID_0, tc_pct); } else { /* Tx */ netdev->dcbnl_ops->setpgbwgcfgtx(netdev, i - DCB_PG_ATTR_BW_ID_0, tc_pct); } } return nla_put_u8(skb, DCB_ATTR_PG_CFG, 0); } static int dcbnl_pgtx_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_setcfg(netdev, nlh, seq, tb, skb, 0); } static int dcbnl_pgrx_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_setcfg(netdev, nlh, seq, tb, skb, 1); } static int dcbnl_bcn_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *bcn_nest; struct nlattr *bcn_tb[DCB_BCN_ATTR_MAX + 1]; u8 value_byte; u32 value_integer; int ret; bool getall = false; int i; if (!tb[DCB_ATTR_BCN]) return -EINVAL; if (!netdev->dcbnl_ops->getbcnrp || !netdev->dcbnl_ops->getbcncfg) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(bcn_tb, DCB_BCN_ATTR_MAX, tb[DCB_ATTR_BCN], dcbnl_bcn_nest, NULL); if (ret) return ret; bcn_nest = nla_nest_start_noflag(skb, DCB_ATTR_BCN); if (!bcn_nest) return -EMSGSIZE; if (bcn_tb[DCB_BCN_ATTR_ALL]) getall = true; for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) { if (!getall && !bcn_tb[i]) continue; netdev->dcbnl_ops->getbcnrp(netdev, i - DCB_BCN_ATTR_RP_0, &value_byte); ret = nla_put_u8(skb, i, value_byte); if (ret) goto err_bcn; } for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) { if (!getall && !bcn_tb[i]) continue; netdev->dcbnl_ops->getbcncfg(netdev, i, &value_integer); ret = nla_put_u32(skb, i, value_integer); if (ret) goto err_bcn; } nla_nest_end(skb, bcn_nest); return 0; err_bcn: nla_nest_cancel(skb, bcn_nest); return ret; } static int dcbnl_bcn_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_BCN_ATTR_MAX + 1]; int i; int ret; u8 value_byte; u32 value_int; if (!tb[DCB_ATTR_BCN]) return -EINVAL; if (!netdev->dcbnl_ops->setbcncfg || !netdev->dcbnl_ops->setbcnrp) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_BCN_ATTR_MAX, tb[DCB_ATTR_BCN], dcbnl_bcn_nest, NULL); if (ret) return ret; for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) { if (data[i] == NULL) continue; value_byte = nla_get_u8(data[i]); netdev->dcbnl_ops->setbcnrp(netdev, data[i]->nla_type - DCB_BCN_ATTR_RP_0, value_byte); } for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) { if (data[i] == NULL) continue; value_int = nla_get_u32(data[i]); netdev->dcbnl_ops->setbcncfg(netdev, i, value_int); } return nla_put_u8(skb, DCB_ATTR_BCN, 0); } static int dcbnl_build_peer_app(struct net_device *netdev, struct sk_buff* skb, int app_nested_type, int app_info_type, int app_entry_type) { struct dcb_peer_app_info info; struct dcb_app *table = NULL; const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; u16 app_count; int err; /** * retrieve the peer app configuration form the driver. If the driver * handlers fail exit without doing anything */ err = ops->peer_getappinfo(netdev, &info, &app_count); if (!err && app_count) { table = kmalloc_objs(struct dcb_app, app_count); if (!table) return -ENOMEM; err = ops->peer_getapptable(netdev, table); } if (!err) { u16 i; struct nlattr *app; /** * build the message, from here on the only possible failure * is due to the skb size */ err = -EMSGSIZE; app = nla_nest_start_noflag(skb, app_nested_type); if (!app) goto nla_put_failure; if (app_info_type && nla_put(skb, app_info_type, sizeof(info), &info)) goto nla_put_failure; for (i = 0; i < app_count; i++) { if (nla_put(skb, app_entry_type, sizeof(struct dcb_app), &table[i])) goto nla_put_failure; } nla_nest_end(skb, app); } err = 0; nla_put_failure: kfree(table); return err; } static int dcbnl_getapptrust(struct net_device *netdev, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; enum ieee_attrs_app type; struct nlattr *apptrust; int nselectors, err, i; u8 *selectors; selectors = kzalloc(IEEE_8021QAZ_APP_SEL_MAX + 1, GFP_KERNEL); if (!selectors) return -ENOMEM; err = ops->dcbnl_getapptrust(netdev, selectors, &nselectors); if (err) { err = 0; goto out; } apptrust = nla_nest_start(skb, DCB_ATTR_DCB_APP_TRUST_TABLE); if (!apptrust) { err = -EMSGSIZE; goto out; } for (i = 0; i < nselectors; i++) { type = dcbnl_app_attr_type_get(selectors[i]); err = nla_put_u8(skb, type, selectors[i]); if (err) { nla_nest_cancel(skb, apptrust); goto out; } } nla_nest_end(skb, apptrust); out: kfree(selectors); return err; } /* Set or delete APP table or rewrite table entries. The APP struct is validated * and the appropriate callback function is called. */ static int dcbnl_app_table_setdel(struct nlattr *attr, struct net_device *netdev, int (*setdel)(struct net_device *dev, struct dcb_app *app)) { struct dcb_app *app_data; enum ieee_attrs_app type; struct nlattr *attr_itr; int rem, err; nla_for_each_nested(attr_itr, attr, rem) { type = nla_type(attr_itr); if (!dcbnl_app_attr_type_validate(type)) continue; if (nla_len(attr_itr) < sizeof(struct dcb_app)) return -ERANGE; app_data = nla_data(attr_itr); if (!dcbnl_app_selector_validate(type, app_data->selector)) return -EINVAL; err = setdel(netdev, app_data); if (err) return err; } return 0; } /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. */ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; struct nlattr *ieee, *app, *rewr; struct dcb_app_type *itr; int dcbx; int err; if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name)) return -EMSGSIZE; ieee = nla_nest_start_noflag(skb, DCB_ATTR_IEEE); if (!ieee) return -EMSGSIZE; if (ops->ieee_getets) { struct ieee_ets ets; memset(&ets, 0, sizeof(ets)); err = ops->ieee_getets(netdev, &ets); if (!err && nla_put(skb, DCB_ATTR_IEEE_ETS, sizeof(ets), &ets)) return -EMSGSIZE; } if (ops->ieee_getmaxrate) { struct ieee_maxrate maxrate; memset(&maxrate, 0, sizeof(maxrate)); err = ops->ieee_getmaxrate(netdev, &maxrate); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_MAXRATE, sizeof(maxrate), &maxrate); if (err) return -EMSGSIZE; } } if (ops->ieee_getqcn) { struct ieee_qcn qcn; memset(&qcn, 0, sizeof(qcn)); err = ops->ieee_getqcn(netdev, &qcn); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_QCN, sizeof(qcn), &qcn); if (err) return -EMSGSIZE; } } if (ops->ieee_getqcnstats) { struct ieee_qcn_stats qcn_stats; memset(&qcn_stats, 0, sizeof(qcn_stats)); err = ops->ieee_getqcnstats(netdev, &qcn_stats); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_QCN_STATS, sizeof(qcn_stats), &qcn_stats); if (err) return -EMSGSIZE; } } if (ops->ieee_getpfc) { struct ieee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); err = ops->ieee_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_IEEE_PFC, sizeof(pfc), &pfc)) return -EMSGSIZE; } if (ops->dcbnl_getbuffer) { struct dcbnl_buffer buffer; memset(&buffer, 0, sizeof(buffer)); err = ops->dcbnl_getbuffer(netdev, &buffer); if (!err && nla_put(skb, DCB_ATTR_DCB_BUFFER, sizeof(buffer), &buffer)) return -EMSGSIZE; } app = nla_nest_start_noflag(skb, DCB_ATTR_IEEE_APP_TABLE); if (!app) return -EMSGSIZE; spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == netdev->ifindex) { enum ieee_attrs_app type = dcbnl_app_attr_type_get(itr->app.selector); err = nla_put(skb, type, sizeof(itr->app), &itr->app); if (err) { spin_unlock_bh(&dcb_lock); return -EMSGSIZE; } } } if (netdev->dcbnl_ops->getdcbx) dcbx = netdev->dcbnl_ops->getdcbx(netdev); else dcbx = -EOPNOTSUPP; spin_unlock_bh(&dcb_lock); nla_nest_end(skb, app); rewr = nla_nest_start(skb, DCB_ATTR_DCB_REWR_TABLE); if (!rewr) return -EMSGSIZE; spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->ifindex == netdev->ifindex) { enum ieee_attrs_app type = dcbnl_app_attr_type_get(itr->app.selector); err = nla_put(skb, type, sizeof(itr->app), &itr->app); if (err) { spin_unlock_bh(&dcb_lock); nla_nest_cancel(skb, rewr); return -EMSGSIZE; } } } spin_unlock_bh(&dcb_lock); nla_nest_end(skb, rewr); if (ops->dcbnl_getapptrust) { err = dcbnl_getapptrust(netdev, skb); if (err) return err; } /* get peer info if available */ if (ops->ieee_peer_getets) { struct ieee_ets ets; memset(&ets, 0, sizeof(ets)); err = ops->ieee_peer_getets(netdev, &ets); if (!err && nla_put(skb, DCB_ATTR_IEEE_PEER_ETS, sizeof(ets), &ets)) return -EMSGSIZE; } if (ops->ieee_peer_getpfc) { struct ieee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); err = ops->ieee_peer_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_IEEE_PEER_PFC, sizeof(pfc), &pfc)) return -EMSGSIZE; } if (ops->peer_getappinfo && ops->peer_getapptable) { err = dcbnl_build_peer_app(netdev, skb, DCB_ATTR_IEEE_PEER_APP, DCB_ATTR_IEEE_APP_UNSPEC, DCB_ATTR_IEEE_APP); if (err) return -EMSGSIZE; } nla_nest_end(skb, ieee); if (dcbx >= 0) { err = nla_put_u8(skb, DCB_ATTR_DCBX, dcbx); if (err) return -EMSGSIZE; } return 0; } static int dcbnl_cee_pg_fill(struct sk_buff *skb, struct net_device *dev, int dir) { u8 pgid, up_map, prio, tc_pct; const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops; int i = dir ? DCB_ATTR_CEE_TX_PG : DCB_ATTR_CEE_RX_PG; struct nlattr *pg = nla_nest_start_noflag(skb, i); if (!pg) return -EMSGSIZE; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { struct nlattr *tc_nest = nla_nest_start_noflag(skb, i); if (!tc_nest) return -EMSGSIZE; pgid = DCB_ATTR_VALUE_UNDEFINED; prio = DCB_ATTR_VALUE_UNDEFINED; tc_pct = DCB_ATTR_VALUE_UNDEFINED; up_map = DCB_ATTR_VALUE_UNDEFINED; if (!dir) ops->getpgtccfgrx(dev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); else ops->getpgtccfgtx(dev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); if (nla_put_u8(skb, DCB_TC_ATTR_PARAM_PGID, pgid) || nla_put_u8(skb, DCB_TC_ATTR_PARAM_UP_MAPPING, up_map) || nla_put_u8(skb, DCB_TC_ATTR_PARAM_STRICT_PRIO, prio) || nla_put_u8(skb, DCB_TC_ATTR_PARAM_BW_PCT, tc_pct)) return -EMSGSIZE; nla_nest_end(skb, tc_nest); } for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { tc_pct = DCB_ATTR_VALUE_UNDEFINED; if (!dir) ops->getpgbwgcfgrx(dev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); else ops->getpgbwgcfgtx(dev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); if (nla_put_u8(skb, i, tc_pct)) return -EMSGSIZE; } nla_nest_end(skb, pg); return 0; } static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) { struct nlattr *cee, *app; struct dcb_app_type *itr; const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; int dcbx, i, err = -EMSGSIZE; u8 value; if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name)) goto nla_put_failure; cee = nla_nest_start_noflag(skb, DCB_ATTR_CEE); if (!cee) goto nla_put_failure; /* local pg */ if (ops->getpgtccfgtx && ops->getpgbwgcfgtx) { err = dcbnl_cee_pg_fill(skb, netdev, 1); if (err) goto nla_put_failure; } if (ops->getpgtccfgrx && ops->getpgbwgcfgrx) { err = dcbnl_cee_pg_fill(skb, netdev, 0); if (err) goto nla_put_failure; } /* local pfc */ if (ops->getpfccfg) { struct nlattr *pfc_nest = nla_nest_start_noflag(skb, DCB_ATTR_CEE_PFC); if (!pfc_nest) goto nla_put_failure; for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0, &value); if (nla_put_u8(skb, i, value)) goto nla_put_failure; } nla_nest_end(skb, pfc_nest); } /* local app */ spin_lock_bh(&dcb_lock); app = nla_nest_start_noflag(skb, DCB_ATTR_CEE_APP_TABLE); if (!app) goto dcb_unlock; list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == netdev->ifindex) { struct nlattr *app_nest = nla_nest_start_noflag(skb, DCB_ATTR_APP); if (!app_nest) goto dcb_unlock; err = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE, itr->app.selector); if (err) goto dcb_unlock; err = nla_put_u16(skb, DCB_APP_ATTR_ID, itr->app.protocol); if (err) goto dcb_unlock; err = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY, itr->app.priority); if (err) goto dcb_unlock; nla_nest_end(skb, app_nest); } } nla_nest_end(skb, app); if (netdev->dcbnl_ops->getdcbx) dcbx = netdev->dcbnl_ops->getdcbx(netdev); else dcbx = -EOPNOTSUPP; spin_unlock_bh(&dcb_lock); /* features flags */ if (ops->getfeatcfg) { struct nlattr *feat = nla_nest_start_noflag(skb, DCB_ATTR_CEE_FEAT); if (!feat) goto nla_put_failure; for (i = DCB_FEATCFG_ATTR_ALL + 1; i <= DCB_FEATCFG_ATTR_MAX; i++) if (!ops->getfeatcfg(netdev, i, &value) && nla_put_u8(skb, i, value)) goto nla_put_failure; nla_nest_end(skb, feat); } /* peer info if available */ if (ops->cee_peer_getpg) { struct cee_pg pg; memset(&pg, 0, sizeof(pg)); err = ops->cee_peer_getpg(netdev, &pg); if (!err && nla_put(skb, DCB_ATTR_CEE_PEER_PG, sizeof(pg), &pg)) goto nla_put_failure; } if (ops->cee_peer_getpfc) { struct cee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); err = ops->cee_peer_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_CEE_PEER_PFC, sizeof(pfc), &pfc)) goto nla_put_failure; } if (ops->peer_getappinfo && ops->peer_getapptable) { err = dcbnl_build_peer_app(netdev, skb, DCB_ATTR_CEE_PEER_APP_TABLE, DCB_ATTR_CEE_PEER_APP_INFO, DCB_ATTR_CEE_PEER_APP); if (err) goto nla_put_failure; } nla_nest_end(skb, cee); /* DCBX state */ if (dcbx >= 0) { err = nla_put_u8(skb, DCB_ATTR_DCBX, dcbx); if (err) goto nla_put_failure; } return 0; dcb_unlock: spin_unlock_bh(&dcb_lock); nla_put_failure: err = -EMSGSIZE; return err; } static int dcbnl_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 portid, int dcbx_ver) { struct net *net = dev_net(dev); struct sk_buff *skb; struct nlmsghdr *nlh; const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops; int err; if (!ops) return -EOPNOTSUPP; skb = dcbnl_newmsg(event, cmd, portid, seq, 0, &nlh); if (!skb) return -ENOMEM; if (dcbx_ver == DCB_CAP_DCBX_VER_IEEE) err = dcbnl_ieee_fill(skb, dev); else err = dcbnl_cee_fill(skb, dev); if (err < 0) { /* Report error to broadcast listeners */ nlmsg_free(skb); rtnl_set_sk_err(net, RTNLGRP_DCB, err); } else { /* End nlmsg and notify broadcast listeners */ nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_DCB, NULL, GFP_KERNEL); } return err; } int dcbnl_ieee_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 portid) { return dcbnl_notify(dev, event, cmd, seq, portid, DCB_CAP_DCBX_VER_IEEE); } EXPORT_SYMBOL(dcbnl_ieee_notify); int dcbnl_cee_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 portid) { return dcbnl_notify(dev, event, cmd, seq, portid, DCB_CAP_DCBX_VER_CEE); } EXPORT_SYMBOL(dcbnl_cee_notify); /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb SET commands. * If any requested operation can not be completed * the entire msg is aborted and error value is returned. * No attempt is made to reconcile the case where only part of the * cmd can be completed. */ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1]; int prio; int err; if (!ops) return -EOPNOTSUPP; if (!tb[DCB_ATTR_IEEE]) return -EINVAL; err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX, tb[DCB_ATTR_IEEE], dcbnl_ieee_policy, NULL); if (err) return err; if (ieee[DCB_ATTR_IEEE_ETS] && ops->ieee_setets) { struct ieee_ets *ets = nla_data(ieee[DCB_ATTR_IEEE_ETS]); err = ops->ieee_setets(netdev, ets); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_MAXRATE] && ops->ieee_setmaxrate) { struct ieee_maxrate *maxrate = nla_data(ieee[DCB_ATTR_IEEE_MAXRATE]); err = ops->ieee_setmaxrate(netdev, maxrate); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_QCN] && ops->ieee_setqcn) { struct ieee_qcn *qcn = nla_data(ieee[DCB_ATTR_IEEE_QCN]); err = ops->ieee_setqcn(netdev, qcn); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) { struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]); err = ops->ieee_setpfc(netdev, pfc); if (err) goto err; } if (ieee[DCB_ATTR_DCB_BUFFER] && ops->dcbnl_setbuffer) { struct dcbnl_buffer *buffer = nla_data(ieee[DCB_ATTR_DCB_BUFFER]); for (prio = 0; prio < ARRAY_SIZE(buffer->prio2buffer); prio++) { if (buffer->prio2buffer[prio] >= DCBX_MAX_BUFFERS) { err = -EINVAL; goto err; } } err = ops->dcbnl_setbuffer(netdev, buffer); if (err) goto err; } if (ieee[DCB_ATTR_DCB_REWR_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE], netdev, ops->dcbnl_setrewr ?: dcb_setrewr); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE], netdev, ops->ieee_setapp ?: dcb_ieee_setapp); if (err) goto err; } if (ieee[DCB_ATTR_DCB_APP_TRUST_TABLE]) { u8 selectors[IEEE_8021QAZ_APP_SEL_MAX + 1] = {0}; struct nlattr *attr; int nselectors = 0; int rem; if (!ops->dcbnl_setapptrust) { err = -EOPNOTSUPP; goto err; } nla_for_each_nested(attr, ieee[DCB_ATTR_DCB_APP_TRUST_TABLE], rem) { enum ieee_attrs_app type = nla_type(attr); u8 selector; int i; if (!dcbnl_app_attr_type_validate(type) || nla_len(attr) != 1 || nselectors >= sizeof(selectors)) { err = -EINVAL; goto err; } selector = nla_get_u8(attr); if (!dcbnl_app_selector_validate(type, selector)) { err = -EINVAL; goto err; } /* Duplicate selector ? */ for (i = 0; i < nselectors; i++) { if (selectors[i] == selector) { err = -EINVAL; goto err; } } selectors[nselectors++] = selector; } err = ops->dcbnl_setapptrust(netdev, selectors, nselectors); if (err) goto err; } err: err = nla_put_u8(skb, DCB_ATTR_IEEE, err); dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_SET, seq, 0); return err; } static int dcbnl_ieee_get(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; if (!ops) return -EOPNOTSUPP; return dcbnl_ieee_fill(skb, netdev); } static int dcbnl_ieee_del(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1]; int err; if (!ops) return -EOPNOTSUPP; if (!tb[DCB_ATTR_IEEE]) return -EINVAL; err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX, tb[DCB_ATTR_IEEE], dcbnl_ieee_policy, NULL); if (err) return err; if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE], netdev, ops->ieee_delapp ?: dcb_ieee_delapp); if (err) goto err; } if (ieee[DCB_ATTR_DCB_REWR_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE], netdev, ops->dcbnl_delrewr ?: dcb_delrewr); if (err) goto err; } err: err = nla_put_u8(skb, DCB_ATTR_IEEE, err); dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_DEL, seq, 0); return err; } /* DCBX configuration */ static int dcbnl_getdcbx(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { if (!netdev->dcbnl_ops->getdcbx) return -EOPNOTSUPP; return nla_put_u8(skb, DCB_ATTR_DCBX, netdev->dcbnl_ops->getdcbx(netdev)); } static int dcbnl_setdcbx(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 value; if (!netdev->dcbnl_ops->setdcbx) return -EOPNOTSUPP; if (!tb[DCB_ATTR_DCBX]) return -EINVAL; value = nla_get_u8(tb[DCB_ATTR_DCBX]); return nla_put_u8(skb, DCB_ATTR_DCBX, netdev->dcbnl_ops->setdcbx(netdev, value)); } static int dcbnl_getfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1], *nest; u8 value; int ret, i; int getall = 0; if (!netdev->dcbnl_ops->getfeatcfg) return -EOPNOTSUPP; if (!tb[DCB_ATTR_FEATCFG]) return -EINVAL; ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG], dcbnl_featcfg_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_FEATCFG); if (!nest) return -EMSGSIZE; if (data[DCB_FEATCFG_ATTR_ALL]) getall = 1; for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) { if (!getall && !data[i]) continue; ret = netdev->dcbnl_ops->getfeatcfg(netdev, i, &value); if (!ret) ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); goto nla_put_failure; } } nla_nest_end(skb, nest); nla_put_failure: return ret; } static int dcbnl_setfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1]; int ret, i; u8 value; if (!netdev->dcbnl_ops->setfeatcfg) return -ENOTSUPP; if (!tb[DCB_ATTR_FEATCFG]) return -EINVAL; ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG], dcbnl_featcfg_nest, NULL); if (ret) goto err; for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) { if (data[i] == NULL) continue; value = nla_get_u8(data[i]); ret = netdev->dcbnl_ops->setfeatcfg(netdev, i, value); if (ret) goto err; } err: ret = nla_put_u8(skb, DCB_ATTR_FEATCFG, ret); return ret; } /* Handle CEE DCBX GET commands. */ static int dcbnl_cee_get(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; if (!ops) return -EOPNOTSUPP; return dcbnl_cee_fill(skb, netdev); } struct reply_func { /* reply netlink message type */ int type; /* function to fill message contents */ int (*cb)(struct net_device *, struct nlmsghdr *, u32, struct nlattr **, struct sk_buff *); }; static const struct reply_func reply_funcs[DCB_CMD_MAX+1] = { [DCB_CMD_GSTATE] = { RTM_GETDCB, dcbnl_getstate }, [DCB_CMD_SSTATE] = { RTM_SETDCB, dcbnl_setstate }, [DCB_CMD_PFC_GCFG] = { RTM_GETDCB, dcbnl_getpfccfg }, [DCB_CMD_PFC_SCFG] = { RTM_SETDCB, dcbnl_setpfccfg }, [DCB_CMD_GPERM_HWADDR] = { RTM_GETDCB, dcbnl_getperm_hwaddr }, [DCB_CMD_GCAP] = { RTM_GETDCB, dcbnl_getcap }, [DCB_CMD_GNUMTCS] = { RTM_GETDCB, dcbnl_getnumtcs }, [DCB_CMD_SNUMTCS] = { RTM_SETDCB, dcbnl_setnumtcs }, [DCB_CMD_PFC_GSTATE] = { RTM_GETDCB, dcbnl_getpfcstate }, [DCB_CMD_PFC_SSTATE] = { RTM_SETDCB, dcbnl_setpfcstate }, [DCB_CMD_GAPP] = { RTM_GETDCB, dcbnl_getapp }, [DCB_CMD_SAPP] = { RTM_SETDCB, dcbnl_setapp }, [DCB_CMD_PGTX_GCFG] = { RTM_GETDCB, dcbnl_pgtx_getcfg }, [DCB_CMD_PGTX_SCFG] = { RTM_SETDCB, dcbnl_pgtx_setcfg }, [DCB_CMD_PGRX_GCFG] = { RTM_GETDCB, dcbnl_pgrx_getcfg }, [DCB_CMD_PGRX_SCFG] = { RTM_SETDCB, dcbnl_pgrx_setcfg }, [DCB_CMD_SET_ALL] = { RTM_SETDCB, dcbnl_setall }, [DCB_CMD_BCN_GCFG] = { RTM_GETDCB, dcbnl_bcn_getcfg }, [DCB_CMD_BCN_SCFG] = { RTM_SETDCB, dcbnl_bcn_setcfg }, [DCB_CMD_IEEE_GET] = { RTM_GETDCB, dcbnl_ieee_get }, [DCB_CMD_IEEE_SET] = { RTM_SETDCB, dcbnl_ieee_set }, [DCB_CMD_IEEE_DEL] = { RTM_SETDCB, dcbnl_ieee_del }, [DCB_CMD_GDCBX] = { RTM_GETDCB, dcbnl_getdcbx }, [DCB_CMD_SDCBX] = { RTM_SETDCB, dcbnl_setdcbx }, [DCB_CMD_GFEATCFG] = { RTM_GETDCB, dcbnl_getfeatcfg }, [DCB_CMD_SFEATCFG] = { RTM_SETDCB, dcbnl_setfeatcfg }, [DCB_CMD_CEE_GET] = { RTM_GETDCB, dcbnl_cee_get }, }; static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net_device *netdev; struct dcbmsg *dcb = nlmsg_data(nlh); struct nlattr *tb[DCB_ATTR_MAX + 1]; u32 portid = NETLINK_CB(skb).portid; int ret = -EINVAL; struct sk_buff *reply_skb; struct nlmsghdr *reply_nlh = NULL; const struct reply_func *fn; if ((nlh->nlmsg_type == RTM_SETDCB) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nlmsg_parse_deprecated(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX, dcbnl_rtnl_policy, extack); if (ret < 0) return ret; if (dcb->cmd > DCB_CMD_MAX) return -EINVAL; /* check if a reply function has been defined for the command */ fn = &reply_funcs[dcb->cmd]; if (!fn->cb) return -EOPNOTSUPP; if (fn->type == RTM_SETDCB && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!tb[DCB_ATTR_IFNAME]) return -EINVAL; netdev = __dev_get_by_name(net, nla_data(tb[DCB_ATTR_IFNAME])); if (!netdev) return -ENODEV; if (!netdev->dcbnl_ops) return -EOPNOTSUPP; reply_skb = dcbnl_newmsg(fn->type, dcb->cmd, portid, nlh->nlmsg_seq, nlh->nlmsg_flags, &reply_nlh); if (!reply_skb) return -ENOMEM; ret = fn->cb(netdev, nlh, nlh->nlmsg_seq, tb, reply_skb); if (ret < 0) { nlmsg_free(reply_skb); goto out; } nlmsg_end(reply_skb, reply_nlh); ret = rtnl_unicast(reply_skb, net, portid); out: return ret; } static struct dcb_app_type *dcb_rewr_lookup(const struct dcb_app *app, int ifindex, int proto) { struct dcb_app_type *itr; list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->app.selector == app->selector && itr->app.priority == app->priority && itr->ifindex == ifindex && ((proto == -1) || itr->app.protocol == proto)) return itr; } return NULL; } static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app, int ifindex, int prio) { struct dcb_app_type *itr; list_for_each_entry(itr, &dcb_app_list, list) { if (itr->app.selector == app->selector && itr->app.protocol == app->protocol && itr->ifindex == ifindex && ((prio == -1) || itr->app.priority == prio)) return itr; } return NULL; } static int dcb_app_add(struct list_head *list, const struct dcb_app *app, int ifindex) { struct dcb_app_type *entry; entry = kmalloc_obj(*entry, GFP_ATOMIC); if (!entry) return -ENOMEM; memcpy(&entry->app, app, sizeof(*app)); entry->ifindex = ifindex; list_add(&entry->list, list); return 0; } /** * dcb_getapp - retrieve the DCBX application user priority * @dev: network interface * @app: application to get user priority of * * On success returns a non-zero 802.1p user priority bitmap * otherwise returns 0 as the invalid user priority bitmap to * indicate an error. */ u8 dcb_getapp(struct net_device *dev, struct dcb_app *app) { struct dcb_app_type *itr; u8 prio = 0; spin_lock_bh(&dcb_lock); itr = dcb_app_lookup(app, dev->ifindex, -1); if (itr) prio = itr->app.priority; spin_unlock_bh(&dcb_lock); return prio; } EXPORT_SYMBOL(dcb_getapp); /** * dcb_setapp - add CEE dcb application data to app list * @dev: network interface * @new: application data to add * * Priority 0 is an invalid priority in CEE spec. This routine * removes applications from the app list if the priority is * set to zero. Priority is expected to be 8-bit 802.1p user priority bitmap */ int dcb_setapp(struct net_device *dev, struct dcb_app *new) { struct dcb_app_type *itr; struct dcb_app_type event; int err = 0; event.ifindex = dev->ifindex; memcpy(&event.app, new, sizeof(event.app)); if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); spin_lock_bh(&dcb_lock); /* Search for existing match and replace */ itr = dcb_app_lookup(new, dev->ifindex, -1); if (itr) { if (new->priority) itr->app.priority = new->priority; else { list_del(&itr->list); kfree(itr); } goto out; } /* App type does not exist add new application type */ if (new->priority) err = dcb_app_add(&dcb_app_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; } EXPORT_SYMBOL(dcb_setapp); /** * dcb_ieee_getapp_mask - retrieve the IEEE DCB application priority * @dev: network interface * @app: where to store the retrieve application data * * Helper routine which on success returns a non-zero 802.1Qaz user * priority bitmap otherwise returns 0 to indicate the dcb_app was * not found in APP list. */ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app) { struct dcb_app_type *itr; u8 prio = 0; spin_lock_bh(&dcb_lock); itr = dcb_app_lookup(app, dev->ifindex, -1); if (itr) prio |= 1 << itr->app.priority; spin_unlock_bh(&dcb_lock); return prio; } EXPORT_SYMBOL(dcb_ieee_getapp_mask); /* Get protocol value from rewrite entry. */ u16 dcb_getrewr(struct net_device *dev, struct dcb_app *app) { struct dcb_app_type *itr; u16 proto = 0; spin_lock_bh(&dcb_lock); itr = dcb_rewr_lookup(app, dev->ifindex, -1); if (itr) proto = itr->app.protocol; spin_unlock_bh(&dcb_lock); return proto; } EXPORT_SYMBOL(dcb_getrewr); /* Add rewrite entry to the rewrite list. */ int dcb_setrewr(struct net_device *dev, struct dcb_app *new) { int err; spin_lock_bh(&dcb_lock); /* Search for existing match and abort if found. */ if (dcb_rewr_lookup(new, dev->ifindex, new->protocol)) { err = -EEXIST; goto out; } err = dcb_app_add(&dcb_rewr_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); return err; } EXPORT_SYMBOL(dcb_setrewr); /* Delete rewrite entry from the rewrite list. */ int dcb_delrewr(struct net_device *dev, struct dcb_app *del) { struct dcb_app_type *itr; int err = -ENOENT; spin_lock_bh(&dcb_lock); /* Search for existing match and remove it. */ itr = dcb_rewr_lookup(del, dev->ifindex, del->protocol); if (itr) { list_del(&itr->list); kfree(itr); err = 0; } spin_unlock_bh(&dcb_lock); return err; } EXPORT_SYMBOL(dcb_delrewr); /** * dcb_ieee_setapp - add IEEE dcb application data to app list * @dev: network interface * @new: application data to add * * This adds Application data to the list. Multiple application * entries may exists for the same selector and protocol as long * as the priorities are different. Priority is expected to be a * 3-bit unsigned integer */ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new) { struct dcb_app_type event; int err = 0; event.ifindex = dev->ifindex; memcpy(&event.app, new, sizeof(event.app)); if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); spin_lock_bh(&dcb_lock); /* Search for existing match and abort if found */ if (dcb_app_lookup(new, dev->ifindex, new->priority)) { err = -EEXIST; goto out; } err = dcb_app_add(&dcb_app_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; } EXPORT_SYMBOL(dcb_ieee_setapp); /** * dcb_ieee_delapp - delete IEEE dcb application data from list * @dev: network interface * @del: application data to delete * * This removes a matching APP data from the APP list */ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) { struct dcb_app_type *itr; struct dcb_app_type event; int err = -ENOENT; event.ifindex = dev->ifindex; memcpy(&event.app, del, sizeof(event.app)); if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); spin_lock_bh(&dcb_lock); /* Search for existing match and remove it. */ if ((itr = dcb_app_lookup(del, dev->ifindex, del->priority))) { list_del(&itr->list); kfree(itr); err = 0; } spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; } EXPORT_SYMBOL(dcb_ieee_delapp); /* dcb_getrewr_prio_pcp_mask_map - For a given device, find mapping from * priorities to the PCP and DEI values assigned to that priority. */ void dcb_getrewr_prio_pcp_mask_map(const struct net_device *dev, struct dcb_rewr_prio_pcp_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 prio; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->ifindex == ifindex && itr->app.selector == DCB_APP_SEL_PCP && itr->app.protocol < 16 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) { prio = itr->app.priority; p_map->map[prio] |= 1 << itr->app.protocol; } } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_getrewr_prio_pcp_mask_map); /* dcb_getrewr_prio_dscp_mask_map - For a given device, find mapping from * priorities to the DSCP values assigned to that priority. */ void dcb_getrewr_prio_dscp_mask_map(const struct net_device *dev, struct dcb_ieee_app_prio_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 prio; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && itr->app.protocol < 64 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) { prio = itr->app.priority; p_map->map[prio] |= 1ULL << itr->app.protocol; } } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_getrewr_prio_dscp_mask_map); /* * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from * priorities to the DSCP values assigned to that priority. Initialize p_map * such that each map element holds a bit mask of DSCP values configured for * that priority by APP entries. */ void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev, struct dcb_ieee_app_prio_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 prio; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && itr->app.protocol < 64 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) { prio = itr->app.priority; p_map->map[prio] |= 1ULL << itr->app.protocol; } } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_ieee_getapp_prio_dscp_mask_map); /* * dcb_ieee_getapp_dscp_prio_mask_map - For a given device, find mapping from * DSCP values to the priorities assigned to that DSCP value. Initialize p_map * such that each map element holds a bit mask of priorities configured for a * given DSCP value by APP entries. */ void dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev, struct dcb_ieee_app_dscp_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && itr->app.protocol < 64 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) p_map->map[itr->app.protocol] |= 1 << itr->app.priority; } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_ieee_getapp_dscp_prio_mask_map); /* * Per 802.1Q-2014, the selector value of 1 is used for matching on Ethernet * type, with valid PID values >= 1536. A special meaning is then assigned to * protocol value of 0: "default priority. For use when priority is not * otherwise specified". * * dcb_ieee_getapp_default_prio_mask - For a given device, find all APP entries * of the form {$PRIO, ETHERTYPE, 0} and construct a bit mask of all default * priorities set by these entries. */ u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 mask = 0; spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE && itr->app.protocol == 0 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) mask |= 1 << itr->app.priority; } spin_unlock_bh(&dcb_lock); return mask; } EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask); static void dcbnl_flush_dev(struct net_device *dev) { struct dcb_app_type *itr, *tmp; spin_lock_bh(&dcb_lock); list_for_each_entry_safe(itr, tmp, &dcb_app_list, list) { if (itr->ifindex == dev->ifindex) { list_del(&itr->list); kfree(itr); } } spin_unlock_bh(&dcb_lock); } static int dcbnl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_UNREGISTER: if (!dev->dcbnl_ops) return NOTIFY_DONE; dcbnl_flush_dev(dev); return NOTIFY_OK; default: return NOTIFY_DONE; } } static struct notifier_block dcbnl_nb __read_mostly = { .notifier_call = dcbnl_netdevice_event, }; static const struct rtnl_msg_handler dcbnl_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_GETDCB, .doit = dcb_doit}, {.msgtype = RTM_SETDCB, .doit = dcb_doit}, }; static int __init dcbnl_init(void) { int err; err = register_netdevice_notifier(&dcbnl_nb); if (err) return err; rtnl_register_many(dcbnl_rtnl_msg_handlers); return 0; } device_initcall(dcbnl_init);
16 16 16 16 16 16 137 137 137 137 137 137 16 23 8 16 16 16 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 // SPDX-License-Identifier: GPL-2.0-or-later /* * VMA-specific functions. */ #include "vma_internal.h" #include "vma.h" struct mmap_state { struct mm_struct *mm; struct vma_iterator *vmi; unsigned long addr; unsigned long end; pgoff_t pgoff; unsigned long pglen; union { vm_flags_t vm_flags; vma_flags_t vma_flags; }; struct file *file; pgprot_t page_prot; /* User-defined fields, perhaps updated by .mmap_prepare(). */ const struct vm_operations_struct *vm_ops; void *vm_private_data; unsigned long charged; struct vm_area_struct *prev; struct vm_area_struct *next; /* Unmapping state. */ struct vma_munmap_struct vms; struct ma_state mas_detach; struct maple_tree mt_detach; /* Determine if we can check KSM flags early in mmap() logic. */ bool check_ksm_early :1; /* If we map new, hold the file rmap lock on mapping. */ bool hold_file_rmap_lock :1; /* If .mmap_prepare changed the file, we don't need to pin. */ bool file_doesnt_need_get :1; }; #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \ struct mmap_state name = { \ .mm = mm_, \ .vmi = vmi_, \ .addr = addr_, \ .end = (addr_) + (len_), \ .pgoff = pgoff_, \ .pglen = PHYS_PFN(len_), \ .vm_flags = vm_flags_, \ .file = file_, \ .page_prot = vm_get_page_prot(vm_flags_), \ } #define VMG_MMAP_STATE(name, map_, vma_) \ struct vma_merge_struct name = { \ .mm = (map_)->mm, \ .vmi = (map_)->vmi, \ .start = (map_)->addr, \ .end = (map_)->end, \ .vm_flags = (map_)->vm_flags, \ .pgoff = (map_)->pgoff, \ .file = (map_)->file, \ .prev = (map_)->prev, \ .middle = vma_, \ .next = (vma_) ? NULL : (map_)->next, \ .state = VMA_MERGE_START, \ } /* Was this VMA ever forked from a parent, i.e. maybe contains CoW mappings? */ static bool vma_is_fork_child(struct vm_area_struct *vma) { /* * The list_is_singular() test is to avoid merging VMA cloned from * parents. This can improve scalability caused by the anon_vma root * lock. */ return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain); } static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; if (!mpol_equal(vmg->policy, vma_policy(vma))) return false; if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_IGNORE_MERGE) return false; if (vma->vm_file != vmg->file) return false; if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx)) return false; if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name)) return false; return true; } static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev; struct vm_area_struct *src = vmg->middle; /* existing merge case. */ struct anon_vma *tgt_anon = tgt->anon_vma; struct anon_vma *src_anon = vmg->anon_vma; /* * We _can_ have !src, vmg->anon_vma via copy_vma(). In this instance we * will remove the existing VMA's anon_vma's so there's no scalability * concerns. */ VM_WARN_ON(src && src_anon != src->anon_vma); /* Case 1 - we will dup_anon_vma() from src into tgt. */ if (!tgt_anon && src_anon) { struct vm_area_struct *copied_from = vmg->copied_from; if (vma_is_fork_child(src)) return false; if (vma_is_fork_child(copied_from)) return false; return true; } /* Case 2 - we will simply use tgt's anon_vma. */ if (tgt_anon && !src_anon) return !vma_is_fork_child(tgt); /* Case 3 - the anon_vma's are already shared. */ return src_anon == tgt_anon; } /* * init_multi_vma_prep() - Initializer for struct vma_prepare * @vp: The vma_prepare struct * @vma: The vma that will be altered once locked * @vmg: The merge state that will be used to determine adjustment and VMA * removal. */ static void init_multi_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma, struct vma_merge_struct *vmg) { struct vm_area_struct *adjust; struct vm_area_struct **remove = &vp->remove; memset(vp, 0, sizeof(struct vma_prepare)); vp->vma = vma; vp->anon_vma = vma->anon_vma; if (vmg && vmg->__remove_middle) { *remove = vmg->middle; remove = &vp->remove2; } if (vmg && vmg->__remove_next) *remove = vmg->next; if (vmg && vmg->__adjust_middle_start) adjust = vmg->middle; else if (vmg && vmg->__adjust_next_start) adjust = vmg->next; else adjust = NULL; vp->adj_next = adjust; if (!vp->anon_vma && adjust) vp->anon_vma = adjust->anon_vma; VM_WARN_ON(vp->anon_vma && adjust && adjust->anon_vma && vp->anon_vma != adjust->anon_vma); vp->file = vma->vm_file; if (vp->file) vp->mapping = vma->vm_file->f_mapping; if (vmg && vmg->skip_vma_uprobe) vp->skip_vma_uprobe = true; } /* * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) * in front of (at a lower virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. * * We don't check here for the merged mmap wrapping around the end of pagecache * indices (16TB on ia32) because do_mmap() does not permit mmap's which * wrap, nor mmaps which cover the final page at index -1UL. * * We assume the vma may be removed as part of the merge. */ static bool can_vma_merge_before(struct vma_merge_struct *vmg) { pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); if (is_mergeable_vma(vmg, /* merge_next = */ true) && is_mergeable_anon_vma(vmg, /* merge_next = */ true)) { if (vmg->next->vm_pgoff == vmg->pgoff + pglen) return true; } return false; } /* * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) * beyond (at a higher virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. * * We assume that vma is not removed as part of the merge. */ static bool can_vma_merge_after(struct vma_merge_struct *vmg) { if (is_mergeable_vma(vmg, /* merge_next = */ false) && is_mergeable_anon_vma(vmg, /* merge_next = */ false)) { if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff) return true; } return false; } static void __vma_link_file(struct vm_area_struct *vma, struct address_space *mapping) { if (vma_is_shared_maywrite(vma)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* * Requires inode->i_mapping->i_mmap_rwsem */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct address_space *mapping) { if (vma_is_shared_maywrite(vma)) mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* * vma has some anon_vma assigned, and is already inserted on that * anon_vma's interval trees. * * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the * vma must be removed from the anon_vma's interval trees using * anon_vma_interval_tree_pre_update_vma(). * * After the update, the vma will be reinserted using * anon_vma_interval_tree_post_update_vma(). * * The entire update must be protected by exclusive mmap_lock and by * the root anon_vma's mutex. */ static void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) { struct anon_vma_chain *avc; list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); } static void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) { struct anon_vma_chain *avc; list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); } /* * vma_prepare() - Helper function for handling locking VMAs prior to altering * @vp: The initialized vma_prepare struct */ static void vma_prepare(struct vma_prepare *vp) { if (vp->file) { uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); if (vp->adj_next) uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, vp->adj_next->vm_end); i_mmap_lock_write(vp->mapping); if (vp->insert && vp->insert->vm_file) { /* * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. */ __vma_link_file(vp->insert, vp->insert->vm_file->f_mapping); } } if (vp->anon_vma) { anon_vma_lock_write(vp->anon_vma); anon_vma_interval_tree_pre_update_vma(vp->vma); if (vp->adj_next) anon_vma_interval_tree_pre_update_vma(vp->adj_next); } if (vp->file) { flush_dcache_mmap_lock(vp->mapping); vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); if (vp->adj_next) vma_interval_tree_remove(vp->adj_next, &vp->mapping->i_mmap); } } /* * vma_complete- Helper function for handling the unlocking after altering VMAs, * or for inserting a VMA. * * @vp: The vma_prepare struct * @vmi: The vma iterator * @mm: The mm_struct */ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, struct mm_struct *mm) { if (vp->file) { if (vp->adj_next) vma_interval_tree_insert(vp->adj_next, &vp->mapping->i_mmap); vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); flush_dcache_mmap_unlock(vp->mapping); } if (vp->remove && vp->file) { __remove_shared_vm_struct(vp->remove, vp->mapping); if (vp->remove2) __remove_shared_vm_struct(vp->remove2, vp->mapping); } else if (vp->insert) { /* * split_vma has split insert from vma, and needs * us to insert it before dropping the locks * (it may either follow vma or precede it). */ vma_iter_store_new(vmi, vp->insert); mm->map_count++; } if (vp->anon_vma) { anon_vma_interval_tree_post_update_vma(vp->vma); if (vp->adj_next) anon_vma_interval_tree_post_update_vma(vp->adj_next); anon_vma_unlock_write(vp->anon_vma); } if (vp->file) { i_mmap_unlock_write(vp->mapping); if (!vp->skip_vma_uprobe) { uprobe_mmap(vp->vma); if (vp->adj_next) uprobe_mmap(vp->adj_next); } } if (vp->remove) { again: vma_mark_detached(vp->remove); if (vp->file) { uprobe_munmap(vp->remove, vp->remove->vm_start, vp->remove->vm_end); fput(vp->file); } if (vp->remove->anon_vma) unlink_anon_vmas(vp->remove); mm->map_count--; mpol_put(vma_policy(vp->remove)); if (!vp->remove2) WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); vm_area_free(vp->remove); /* * In mprotect's case 6 (see comments on vma_merge), * we are removing both mid and next vmas */ if (vp->remove2) { vp->remove = vp->remove2; vp->remove2 = NULL; goto again; } } if (vp->insert && vp->file) uprobe_mmap(vp->insert); } /* * init_vma_prep() - Initializer wrapper for vma_prepare struct * @vp: The vma_prepare struct * @vma: The vma that will be altered once locked */ static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma) { init_multi_vma_prep(vp, vma, NULL); } /* * Can the proposed VMA be merged with the left (previous) VMA taking into * account the start position of the proposed range. */ static bool can_vma_merge_left(struct vma_merge_struct *vmg) { return vmg->prev && vmg->prev->vm_end == vmg->start && can_vma_merge_after(vmg); } /* * Can the proposed VMA be merged with the right (next) VMA taking into * account the end position of the proposed range. * * In addition, if we can merge with the left VMA, ensure that left and right * anon_vma's are also compatible. */ static bool can_vma_merge_right(struct vma_merge_struct *vmg, bool can_merge_left) { struct vm_area_struct *next = vmg->next; struct vm_area_struct *prev; if (!next || vmg->end != next->vm_start || !can_vma_merge_before(vmg)) return false; if (!can_merge_left) return true; /* * If we can merge with prev (left) and next (right), indicating that * each VMA's anon_vma is compatible with the proposed anon_vma, this * does not mean prev and next are compatible with EACH OTHER. * * We therefore check this in addition to mergeability to either side. */ prev = vmg->prev; return !prev->anon_vma || !next->anon_vma || prev->anon_vma == next->anon_vma; } /* * Close a vm structure and free it. */ void remove_vma(struct vm_area_struct *vma) { might_sleep(); vma_close(vma); if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); vm_area_free(vma); } /* * Get rid of page table information in the indicated region. * * Called with the mm semaphore held. */ void unmap_region(struct unmap_desc *unmap) { struct mm_struct *mm = unmap->first->vm_mm; struct mmu_gather tlb; tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); unmap_vmas(&tlb, unmap); mas_set(unmap->mas, unmap->tree_reset); free_pgtables(&tlb, unmap); tlb_finish_mmu(&tlb); } /* * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. * VMA Iterator will point to the original VMA. */ static __must_check int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vma_prepare vp; struct vm_area_struct *new; int err; WARN_ON(vma->vm_start >= addr); WARN_ON(vma->vm_end <= addr); if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr); if (err) return err; } new = vm_area_dup(vma); if (!new) return -ENOMEM; if (new_below) { new->vm_end = addr; } else { new->vm_start = addr; new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } err = -ENOMEM; vma_iter_config(vmi, new->vm_start, new->vm_end); if (vma_iter_prealloc(vmi, new)) goto out_free_vma; err = vma_dup_policy(vma, new); if (err) goto out_free_vmi; err = anon_vma_clone(new, vma, VMA_OP_SPLIT); if (err) goto out_free_mpol; if (new->vm_file) get_file(new->vm_file); if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); vma_start_write(vma); vma_start_write(new); init_vma_prep(&vp, vma); vp.insert = new; vma_prepare(&vp); /* * Get rid of huge pages and shared page tables straddling the split * boundary. */ vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL); if (is_vm_hugetlb_page(vma)) hugetlb_split(vma, addr); if (new_below) { vma->vm_start = addr; vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; } else { vma->vm_end = addr; } /* vma_complete stores the new vma */ vma_complete(&vp, vmi, vma->vm_mm); validate_mm(vma->vm_mm); /* Success. */ if (new_below) vma_next(vmi); else vma_prev(vmi); return 0; out_free_mpol: mpol_put(vma_policy(new)); out_free_vmi: vma_iter_free(vmi); out_free_vma: vm_area_free(new); return err; } /* * Split a vma into two pieces at address 'addr', a new vma is allocated * either for the first part or the tail. */ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { if (vma->vm_mm->map_count >= sysctl_max_map_count) return -ENOMEM; return __split_vma(vmi, vma, addr, new_below); } /* * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the * instance that the destination VMA has no anon_vma but the source does. * * @dst: The destination VMA * @src: The source VMA * @dup: Pointer to the destination VMA when successful. * * Returns: 0 on success. */ static int dup_anon_vma(struct vm_area_struct *dst, struct vm_area_struct *src, struct vm_area_struct **dup) { /* * There are three cases to consider for correctly propagating * anon_vma's on merge. * * The first is trivial - neither VMA has anon_vma, we need not do * anything. * * The second where both have anon_vma is also a no-op, as they must * then be the same, so there is simply nothing to copy. * * Here we cover the third - if the destination VMA has no anon_vma, * that is it is unfaulted, we need to ensure that the newly merged * range is referenced by the anon_vma's of the source. */ if (src->anon_vma && !dst->anon_vma) { int ret; vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; ret = anon_vma_clone(dst, src, VMA_OP_MERGE_UNFAULTED); if (ret) return ret; *dup = dst; } return 0; } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); mt_validate(&mm->mm_mt); for_each_vma(vmi, vma) { #ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; #endif unsigned long vmi_start, vmi_end; bool warn = 0; vmi_start = vma_iter_addr(&vmi); vmi_end = vma_iter_end(&vmi); if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) warn = 1; if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) warn = 1; if (warn) { pr_emerg("issue in %s\n", current->comm); dump_stack(); dump_vma(vma); pr_emerg("tree range: %px start %lx end %lx\n", vma, vmi_start, vmi_end - 1); vma_iter_dump_tree(&vmi); } #ifdef CONFIG_DEBUG_VM_RB if (anon_vma) { anon_vma_lock_read(anon_vma); list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_verify(avc); anon_vma_unlock_read(anon_vma); } #endif /* Check for a infinite loop */ if (++i > mm->map_count + 10) { i = -1; break; } } if (i != mm->map_count) { pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); bug = 1; } VM_BUG_ON_MM(bug, mm); } #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ /* * Based on the vmg flag indicating whether we need to adjust the vm_start field * for the middle or next VMA, we calculate what the range of the newly adjusted * VMA ought to be, and set the VMA's range accordingly. */ static void vmg_adjust_set_range(struct vma_merge_struct *vmg) { struct vm_area_struct *adjust; pgoff_t pgoff; if (vmg->__adjust_middle_start) { adjust = vmg->middle; pgoff = adjust->vm_pgoff + PHYS_PFN(vmg->end - adjust->vm_start); } else if (vmg->__adjust_next_start) { adjust = vmg->next; pgoff = adjust->vm_pgoff - PHYS_PFN(adjust->vm_start - vmg->end); } else { return; } vma_set_range(adjust, vmg->end, adjust->vm_end, pgoff); } /* * Actually perform the VMA merge operation. * * IMPORTANT: We guarantee that, should vmg->give_up_on_oom is set, to not * modify any VMAs or cause inconsistent state should an OOM condition arise. * * Returns 0 on success, or an error value on failure. */ static int commit_merge(struct vma_merge_struct *vmg) { struct vm_area_struct *vma; struct vma_prepare vp; if (vmg->__adjust_next_start) { /* We manipulate middle and adjust next, which is the target. */ vma = vmg->middle; vma_iter_config(vmg->vmi, vmg->end, vmg->next->vm_end); } else { vma = vmg->target; /* Note: vma iterator must be pointing to 'start'. */ vma_iter_config(vmg->vmi, vmg->start, vmg->end); } init_multi_vma_prep(&vp, vma, vmg); /* * If vmg->give_up_on_oom is set, we're safe, because we don't actually * manipulate any VMAs until we succeed at preallocation. * * Past this point, we will not return an error. */ if (vma_iter_prealloc(vmg->vmi, vma)) return -ENOMEM; vma_prepare(&vp); /* * THP pages may need to do additional splits if we increase * middle->vm_start. */ vma_adjust_trans_huge(vma, vmg->start, vmg->end, vmg->__adjust_middle_start ? vmg->middle : NULL); vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff); vmg_adjust_set_range(vmg); vma_iter_store_overwrite(vmg->vmi, vmg->target); vma_complete(&vp, vmg->vmi, vma->vm_mm); return 0; } /* We can only remove VMAs when merging if they do not have a close hook. */ static bool can_merge_remove_vma(struct vm_area_struct *vma) { return !vma->vm_ops || !vma->vm_ops->close; } /* * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its * attributes modified. * * @vmg: Describes the modifications being made to a VMA and associated * metadata. * * When the attributes of a range within a VMA change, then it might be possible * for immediately adjacent VMAs to be merged into that VMA due to having * identical properties. * * This function checks for the existence of any such mergeable VMAs and updates * the maple tree describing the @vmg->middle->vm_mm address space to account * for this, as well as any VMAs shrunk/expanded/deleted as a result of this * merge. * * As part of this operation, if a merge occurs, the @vmg object will have its * vma, start, end, and pgoff fields modified to execute the merge. Subsequent * calls to this function should reset these fields. * * Returns: The merged VMA if merge succeeds, or NULL otherwise. * * ASSUMPTIONS: * - The caller must assign the VMA to be modified to @vmg->middle. * - The caller must have set @vmg->prev to the previous VMA, if there is one. * - The caller must not set @vmg->next, as we determine this. * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - vmi must be positioned within [@vmg->middle->vm_start, @vmg->middle->vm_end). */ static __must_check struct vm_area_struct *vma_merge_existing_range( struct vma_merge_struct *vmg) { vm_flags_t sticky_flags = vmg->vm_flags & VM_STICKY; struct vm_area_struct *middle = vmg->middle; struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next; struct vm_area_struct *anon_dup = NULL; unsigned long start = vmg->start; unsigned long end = vmg->end; bool left_side = middle && start == middle->vm_start; bool right_side = middle && end == middle->vm_end; int err = 0; bool merge_left, merge_right, merge_both; mmap_assert_write_locked(vmg->mm); VM_WARN_ON_VMG(!middle, vmg); /* We are modifying a VMA, so caller must specify. */ VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */ VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg); VM_WARN_ON_VMG(start >= end, vmg); /* * If middle == prev, then we are offset into a VMA. Otherwise, if we are * not, we must span a portion of the VMA. */ VM_WARN_ON_VMG(middle && ((middle != prev && vmg->start != middle->vm_start) || vmg->end > middle->vm_end), vmg); /* The vmi must be positioned within vmg->middle. */ VM_WARN_ON_VMG(middle && !(vma_iter_addr(vmg->vmi) >= middle->vm_start && vma_iter_addr(vmg->vmi) < middle->vm_end), vmg); /* An existing merge can never be used by the mremap() logic. */ VM_WARN_ON_VMG(vmg->copied_from, vmg); vmg->state = VMA_MERGE_NOMERGE; /* * If a special mapping or if the range being modified is neither at the * furthermost left or right side of the VMA, then we have no chance of * merging and should abort. */ if (vmg->vm_flags & VM_SPECIAL || (!left_side && !right_side)) return NULL; if (left_side) merge_left = can_vma_merge_left(vmg); else merge_left = false; if (right_side) { next = vmg->next = vma_iter_next_range(vmg->vmi); vma_iter_prev_range(vmg->vmi); merge_right = can_vma_merge_right(vmg, merge_left); } else { merge_right = false; next = NULL; } if (merge_left) /* If merging prev, position iterator there. */ vma_prev(vmg->vmi); else if (!merge_right) /* If we have nothing to merge, abort. */ return NULL; merge_both = merge_left && merge_right; /* If we span the entire VMA, a merge implies it will be deleted. */ vmg->__remove_middle = left_side && right_side; /* * If we need to remove middle in its entirety but are unable to do so, * we have no sensible recourse but to abort the merge. */ if (vmg->__remove_middle && !can_merge_remove_vma(middle)) return NULL; /* * If we merge both VMAs, then next is also deleted. This implies * merge_will_delete_vma also. */ vmg->__remove_next = merge_both; /* * If we cannot delete next, then we can reduce the operation to merging * prev and middle (thereby deleting middle). */ if (vmg->__remove_next && !can_merge_remove_vma(next)) { vmg->__remove_next = false; merge_right = false; merge_both = false; } /* No matter what happens, we will be adjusting middle. */ vma_start_write(middle); if (merge_right) { vma_start_write(next); vmg->target = next; sticky_flags |= (next->vm_flags & VM_STICKY); } if (merge_left) { vma_start_write(prev); vmg->target = prev; sticky_flags |= (prev->vm_flags & VM_STICKY); } if (merge_both) { /* * |<-------------------->| * |-------********-------| * prev middle next * extend delete delete */ vmg->start = prev->vm_start; vmg->end = next->vm_end; vmg->pgoff = prev->vm_pgoff; /* * We already ensured anon_vma compatibility above, so now it's * simply a case of, if prev has no anon_vma object, which of * next or middle contains the anon_vma we must duplicate. */ err = dup_anon_vma(prev, next->anon_vma ? next : middle, &anon_dup); } else if (merge_left) { /* * |<------------>| OR * |<----------------->| * |-------************* * prev middle * extend shrink/delete */ vmg->start = prev->vm_start; vmg->pgoff = prev->vm_pgoff; if (!vmg->__remove_middle) vmg->__adjust_middle_start = true; err = dup_anon_vma(prev, middle, &anon_dup); } else { /* merge_right */ /* * |<------------->| OR * |<----------------->| * *************-------| * middle next * shrink/delete extend */ pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); VM_WARN_ON_VMG(!merge_right, vmg); /* If we are offset into a VMA, then prev must be middle. */ VM_WARN_ON_VMG(vmg->start > middle->vm_start && prev && middle != prev, vmg); if (vmg->__remove_middle) { vmg->end = next->vm_end; vmg->pgoff = next->vm_pgoff - pglen; } else { /* We shrink middle and expand next. */ vmg->__adjust_next_start = true; vmg->start = middle->vm_start; vmg->end = start; vmg->pgoff = middle->vm_pgoff; } err = dup_anon_vma(next, middle, &anon_dup); } if (err || commit_merge(vmg)) goto abort; vm_flags_set(vmg->target, sticky_flags); khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; abort: vma_iter_set(vmg->vmi, start); vma_iter_load(vmg->vmi); if (anon_dup) unlink_anon_vmas(anon_dup); /* * This means we have failed to clone anon_vma's correctly, but no * actual changes to VMAs have occurred, so no harm no foul - if the * user doesn't want this reported and instead just wants to give up on * the merge, allow it. */ if (!vmg->give_up_on_oom) vmg->state = VMA_MERGE_ERROR_NOMEM; return NULL; } /* * vma_merge_new_range - Attempt to merge a new VMA into address space * * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end * (exclusive), which we try to merge with any adjacent VMAs if possible. * * We are about to add a VMA to the address space starting at @vmg->start and * ending at @vmg->end. There are three different possible scenarios: * * 1. There is a VMA with identical properties immediately adjacent to the * proposed new VMA [@vmg->start, @vmg->end) either before or after it - * EXPAND that VMA: * * Proposed: |-----| or |-----| * Existing: |----| |----| * * 2. There are VMAs with identical properties immediately adjacent to the * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - * EXPAND the former and REMOVE the latter: * * Proposed: |-----| * Existing: |----| |----| * * 3. There are no VMAs immediately adjacent to the proposed new VMA or those * VMAs do not have identical attributes - NO MERGE POSSIBLE. * * In instances where we can merge, this function returns the expanded VMA which * will have its range adjusted accordingly and the underlying maple tree also * adjusted. * * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer * to the VMA we expanded. * * This function adjusts @vmg to provide @vmg->next if not already specified, * and adjusts [@vmg->start, @vmg->end) to span the expanded range. * * ASSUMPTIONS: * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - The caller must have determined that [@vmg->start, @vmg->end) is empty, other than VMAs that will be unmapped should the operation succeed. * - The caller must have specified the previous vma in @vmg->prev. * - The caller must have specified the next vma in @vmg->next. * - The caller must have positioned the vmi at or before the gap. */ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) { struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next = vmg->next; unsigned long end = vmg->end; bool can_merge_left, can_merge_right; mmap_assert_write_locked(vmg->mm); VM_WARN_ON_VMG(vmg->middle, vmg); VM_WARN_ON_VMG(vmg->target, vmg); /* vmi must point at or before the gap. */ VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg); vmg->state = VMA_MERGE_NOMERGE; /* Special VMAs are unmergeable, also if no prev/next. */ if ((vmg->vm_flags & VM_SPECIAL) || (!prev && !next)) return NULL; can_merge_left = can_vma_merge_left(vmg); can_merge_right = !vmg->just_expand && can_vma_merge_right(vmg, can_merge_left); /* If we can merge with the next VMA, adjust vmg accordingly. */ if (can_merge_right) { vmg->end = next->vm_end; vmg->target = next; } /* If we can merge with the previous VMA, adjust vmg accordingly. */ if (can_merge_left) { vmg->start = prev->vm_start; vmg->target = prev; vmg->pgoff = prev->vm_pgoff; /* * If this merge would result in removal of the next VMA but we * are not permitted to do so, reduce the operation to merging * prev and vma. */ if (can_merge_right && !can_merge_remove_vma(next)) vmg->end = end; /* In expand-only case we are already positioned at prev. */ if (!vmg->just_expand) { /* Equivalent to going to the previous range. */ vma_prev(vmg->vmi); } } /* * Now try to expand adjacent VMA(s). This takes care of removing the * following VMA if we have VMAs on both sides. */ if (vmg->target && !vma_expand(vmg)) { khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; } return NULL; } /* * vma_merge_copied_range - Attempt to merge a VMA that is being copied by * mremap() * * @vmg: Describes the VMA we are adding, in the copied-to range @vmg->start to * @vmg->end (exclusive), which we try to merge with any adjacent VMAs if * possible. * * vmg->prev, next, start, end, pgoff should all be relative to the COPIED TO * range, i.e. the target range for the VMA. * * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer * to the VMA we expanded. * * ASSUMPTIONS: Same as vma_merge_new_range(), except vmg->middle must contain * the copied-from VMA. */ static struct vm_area_struct *vma_merge_copied_range(struct vma_merge_struct *vmg) { /* We must have a copied-from VMA. */ VM_WARN_ON_VMG(!vmg->middle, vmg); vmg->copied_from = vmg->middle; vmg->middle = NULL; return vma_merge_new_range(vmg); } /* * vma_expand - Expand an existing VMA * * @vmg: Describes a VMA expansion operation. * * Expand @vma to vmg->start and vmg->end. Can expand off the start and end. * Will expand over vmg->next if it's different from vmg->target and vmg->end == * vmg->next->vm_end. Checking if the vmg->target can expand and merge with * vmg->next needs to be handled by the caller. * * Returns: 0 on success. * * ASSUMPTIONS: * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - The caller must have set @vmg->target and @vmg->next. */ int vma_expand(struct vma_merge_struct *vmg) { struct vm_area_struct *anon_dup = NULL; struct vm_area_struct *target = vmg->target; struct vm_area_struct *next = vmg->next; bool remove_next = false; vm_flags_t sticky_flags; int ret = 0; mmap_assert_write_locked(vmg->mm); vma_start_write(target); if (next && target != next && vmg->end == next->vm_end) remove_next = true; /* We must have a target. */ VM_WARN_ON_VMG(!target, vmg); /* This should have already been checked by this point. */ VM_WARN_ON_VMG(remove_next && !can_merge_remove_vma(next), vmg); /* Not merging but overwriting any part of next is not handled. */ VM_WARN_ON_VMG(next && !remove_next && next != target && vmg->end > next->vm_start, vmg); /* Only handles expanding. */ VM_WARN_ON_VMG(target->vm_start < vmg->start || target->vm_end > vmg->end, vmg); sticky_flags = vmg->vm_flags & VM_STICKY; sticky_flags |= target->vm_flags & VM_STICKY; if (remove_next) sticky_flags |= next->vm_flags & VM_STICKY; /* * If we are removing the next VMA or copying from a VMA * (e.g. mremap()'ing), we must propagate anon_vma state. * * Note that, by convention, callers ignore OOM for this case, so * we don't need to account for vmg->give_up_on_mm here. */ if (remove_next) ret = dup_anon_vma(target, next, &anon_dup); if (!ret && vmg->copied_from) ret = dup_anon_vma(target, vmg->copied_from, &anon_dup); if (ret) return ret; if (remove_next) { vma_start_write(next); vmg->__remove_next = true; } if (commit_merge(vmg)) goto nomem; vm_flags_set(target, sticky_flags); return 0; nomem: if (anon_dup) unlink_anon_vmas(anon_dup); /* * If the user requests that we just give upon OOM, we are safe to do so * here, as commit merge provides this contract to us. Nothing has been * changed - no harm no foul, just don't report it. */ if (!vmg->give_up_on_oom) vmg->state = VMA_MERGE_ERROR_NOMEM; return -ENOMEM; } /* * vma_shrink() - Reduce an existing VMAs memory area * @vmi: The vma iterator * @vma: The VMA to modify * @start: The new start * @end: The new end * * Returns: 0 on success, -ENOMEM otherwise */ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { struct vma_prepare vp; WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); if (vma->vm_start < start) vma_iter_config(vmi, vma->vm_start, start); else vma_iter_config(vmi, end, vma->vm_end); if (vma_iter_prealloc(vmi, NULL)) return -ENOMEM; vma_start_write(vma); init_vma_prep(&vp, vma); vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, NULL); vma_iter_clear(vmi); vma_set_range(vma, start, end, pgoff); vma_complete(&vp, vmi, vma->vm_mm); validate_mm(vma->vm_mm); return 0; } static inline void vms_clear_ptes(struct vma_munmap_struct *vms, struct ma_state *mas_detach, bool mm_wr_locked) { struct unmap_desc unmap = { .mas = mas_detach, .first = vms->vma, /* start and end may be different if there is no prev or next vma. */ .pg_start = vms->unmap_start, .pg_end = vms->unmap_end, .vma_start = vms->start, .vma_end = vms->end, /* * The tree limits and reset differ from the normal case since it's a * side-tree */ .tree_reset = 1, .tree_end = vms->vma_count, /* * We can free page tables without write-locking mmap_lock because VMAs * were isolated before we downgraded mmap_lock. */ .mm_wr_locked = mm_wr_locked, }; if (!vms->clear_ptes) /* Nothing to do */ return; mas_set(mas_detach, 1); unmap_region(&unmap); vms->clear_ptes = false; } static void vms_clean_up_area(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *vma; if (!vms->nr_pages) return; vms_clear_ptes(vms, mas_detach, true); mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) vma_close(vma); } /* * vms_complete_munmap_vmas() - Finish the munmap() operation * @vms: The vma munmap struct * @mas_detach: The maple state of the detached vmas * * This updates the mm_struct, unmaps the region, frees the resources * used for the munmap() and may downgrade the lock - if requested. Everything * needed to be done once the vma maple tree is updated. */ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *vma; struct mm_struct *mm; mm = current->mm; mm->map_count -= vms->vma_count; mm->locked_vm -= vms->locked_vm; if (vms->unlock) mmap_write_downgrade(mm); if (!vms->nr_pages) return; vms_clear_ptes(vms, mas_detach, !vms->unlock); /* Update high watermark before we lower total_vm */ update_hiwater_vm(mm); /* Stat accounting */ WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages); /* Paranoid bookkeeping */ VM_WARN_ON(vms->exec_vm > mm->exec_vm); VM_WARN_ON(vms->stack_vm > mm->stack_vm); VM_WARN_ON(vms->data_vm > mm->data_vm); mm->exec_vm -= vms->exec_vm; mm->stack_vm -= vms->stack_vm; mm->data_vm -= vms->data_vm; /* Remove and clean up vmas */ mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) remove_vma(vma); vm_unacct_memory(vms->nr_accounted); validate_mm(mm); if (vms->unlock) mmap_read_unlock(mm); __mt_destroy(mas_detach->tree); } /* * reattach_vmas() - Undo any munmap work and free resources * @mas_detach: The maple state with the detached maple tree * * Reattach any detached vmas and free up the maple tree used to track the vmas. */ static void reattach_vmas(struct ma_state *mas_detach) { struct vm_area_struct *vma; mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) vma_mark_attached(vma); __mt_destroy(mas_detach->tree); } /* * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree * for removal at a later date. Handles splitting first and last if necessary * and marking the vmas as isolated. * * @vms: The vma munmap struct * @mas_detach: The maple state tracking the detached tree * * Return: 0 on success, error otherwise */ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *next = NULL; int error; /* * If we need to split any vma, do it now to save pain later. * Does it split the first one? */ if (vms->start > vms->vma->vm_start) { /* * Make sure that map_count on return from munmap() will * not exceed its limit; but let map_count go just above * its limit temporarily, to help free resources as expected. */ if (vms->end < vms->vma->vm_end && vms->vma->vm_mm->map_count >= sysctl_max_map_count) { error = -ENOMEM; goto map_count_exceeded; } /* Don't bother splitting the VMA if we can't unmap it anyway */ if (vma_is_sealed(vms->vma)) { error = -EPERM; goto start_split_failed; } error = __split_vma(vms->vmi, vms->vma, vms->start, 1); if (error) goto start_split_failed; } vms->prev = vma_prev(vms->vmi); if (vms->prev) vms->unmap_start = vms->prev->vm_end; /* * Detach a range of VMAs from the mm. Using next as a temp variable as * it is always overwritten. */ for_each_vma_range(*(vms->vmi), next, vms->end) { long nrpages; if (vma_is_sealed(next)) { error = -EPERM; goto modify_vma_failed; } /* Does it split the end? */ if (next->vm_end > vms->end) { error = __split_vma(vms->vmi, next, vms->end, 0); if (error) goto end_split_failed; } vma_start_write(next); mas_set(mas_detach, vms->vma_count++); error = mas_store_gfp(mas_detach, next, GFP_KERNEL); if (error) goto munmap_gather_failed; vma_mark_detached(next); nrpages = vma_pages(next); vms->nr_pages += nrpages; if (next->vm_flags & VM_LOCKED) vms->locked_vm += nrpages; if (next->vm_flags & VM_ACCOUNT) vms->nr_accounted += nrpages; if (is_exec_mapping(next->vm_flags)) vms->exec_vm += nrpages; else if (is_stack_mapping(next->vm_flags)) vms->stack_vm += nrpages; else if (is_data_mapping(next->vm_flags)) vms->data_vm += nrpages; if (vms->uf) { /* * If userfaultfd_unmap_prep returns an error the vmas * will remain split, but userland will get a * highly unexpected error anyway. This is no * different than the case where the first of the two * __split_vma fails, but we don't undo the first * split, despite we could. This is unlikely enough * failure that it's not worth optimizing it for. */ error = userfaultfd_unmap_prep(next, vms->start, vms->end, vms->uf); if (error) goto userfaultfd_error; } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE BUG_ON(next->vm_start < vms->start); BUG_ON(next->vm_start > vms->end); #endif } vms->next = vma_next(vms->vmi); if (vms->next) vms->unmap_end = vms->next->vm_start; #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) /* Make sure no VMAs are about to be lost. */ { MA_STATE(test, mas_detach->tree, 0, 0); struct vm_area_struct *vma_mas, *vma_test; int test_count = 0; vma_iter_set(vms->vmi, vms->start); rcu_read_lock(); vma_test = mas_find(&test, vms->vma_count - 1); for_each_vma_range(*(vms->vmi), vma_mas, vms->end) { BUG_ON(vma_mas != vma_test); test_count++; vma_test = mas_next(&test, vms->vma_count - 1); } rcu_read_unlock(); BUG_ON(vms->vma_count != test_count); } #endif while (vma_iter_addr(vms->vmi) > vms->start) vma_iter_prev_range(vms->vmi); vms->clear_ptes = true; return 0; userfaultfd_error: munmap_gather_failed: end_split_failed: modify_vma_failed: reattach_vmas(mas_detach); start_split_failed: map_count_exceeded: return error; } /* * init_vma_munmap() - Initializer wrapper for vma_munmap_struct * @vms: The vma munmap struct * @vmi: The vma iterator * @vma: The first vm_area_struct to munmap * @start: The aligned start address to munmap * @end: The aligned end address to munmap * @uf: The userfaultfd list_head * @unlock: Unlock after the operation. Only unlocked on success */ static void init_vma_munmap(struct vma_munmap_struct *vms, struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf, bool unlock) { vms->vmi = vmi; vms->vma = vma; if (vma) { vms->start = start; vms->end = end; } else { vms->start = vms->end = 0; } vms->unlock = unlock; vms->uf = uf; vms->vma_count = 0; vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0; vms->exec_vm = vms->stack_vm = vms->data_vm = 0; vms->unmap_start = FIRST_USER_ADDRESS; vms->unmap_end = USER_PGTABLES_CEILING; vms->clear_ptes = false; } /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator * @vma: The starting vm_area_struct * @mm: The mm_struct * @start: The aligned start address to munmap. * @end: The aligned end address to munmap. * @uf: The userfaultfd list_head * @unlock: Set to true to drop the mmap_lock. unlocking only happens on * success. * * Return: 0 on success and drops the lock if so directed, error and leaves the * lock held otherwise. */ int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool unlock) { struct maple_tree mt_detach; MA_STATE(mas_detach, &mt_detach, 0, 0); mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_on_stack(mt_detach); struct vma_munmap_struct vms; int error; init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock); error = vms_gather_munmap_vmas(&vms, &mas_detach); if (error) goto gather_failed; error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); if (error) goto clear_tree_failed; /* Point of no return */ vms_complete_munmap_vmas(&vms, &mas_detach); return 0; clear_tree_failed: reattach_vmas(&mas_detach); gather_failed: validate_mm(mm); return error; } /* * do_vmi_munmap() - munmap a given range. * @vmi: The vma iterator * @mm: The mm_struct * @start: The start address to munmap * @len: The length of the range to munmap * @uf: The userfaultfd list_head * @unlock: set to true if the user wants to drop the mmap_lock on success * * This function takes a @mas that is either pointing to the previous VMA or set * to MA_START and sets it up to remove the mapping(s). The @len will be * aligned. * * Return: 0 on success and drops the lock if so directed, error and leaves the * lock held otherwise. */ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock) { unsigned long end; struct vm_area_struct *vma; if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; end = start + PAGE_ALIGN(len); if (end == start) return -EINVAL; /* Find the first overlapping VMA */ vma = vma_find(vmi, end); if (!vma) { if (unlock) mmap_write_unlock(mm); return 0; } return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); } /* * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd * context and anonymous VMA name within the range [start, end). * * As a result, we might be able to merge the newly modified VMA range with an * adjacent VMA with identical properties. * * If no merge is possible and the range does not span the entirety of the VMA, * we then need to split the VMA to accommodate the change. * * The function returns either the merged VMA, the original VMA if a split was * required instead, or an error if the split failed. */ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) { struct vm_area_struct *vma = vmg->middle; unsigned long start = vmg->start; unsigned long end = vmg->end; struct vm_area_struct *merged; /* First, try to merge. */ merged = vma_merge_existing_range(vmg); if (merged) return merged; if (vmg_nomem(vmg)) return ERR_PTR(-ENOMEM); /* * Split can fail for reasons other than OOM, so if the user requests * this it's probably a mistake. */ VM_WARN_ON(vmg->give_up_on_oom && (vma->vm_start != start || vma->vm_end != end)); /* Split any preceding portion of the VMA. */ if (vma->vm_start < start) { int err = split_vma(vmg->vmi, vma, start, 1); if (err) return ERR_PTR(err); } /* Split any trailing portion of the VMA. */ if (vma->vm_end > end) { int err = split_vma(vmg->vmi, vma, end, 0); if (err) return ERR_PTR(err); } return vma; } struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, vm_flags_t *vm_flags_ptr) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); const vm_flags_t vm_flags = *vm_flags_ptr; struct vm_area_struct *ret; vmg.vm_flags = vm_flags; ret = vma_modify(&vmg); if (IS_ERR(ret)) return ret; /* * For a merge to succeed, the flags must match those * requested. However, sticky flags may have been retained, so propagate * them to the caller. */ if (vmg.state == VMA_MERGE_SUCCESS) *vm_flags_ptr = ret->vm_flags; return ret; } struct vm_area_struct *vma_modify_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct anon_vma_name *new_name) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.anon_name = new_name; return vma_modify(&vmg); } struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct mempolicy *new_pol) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.policy = new_pol; return vma_modify(&vmg); } struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, vm_flags_t vm_flags, struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.vm_flags = vm_flags; vmg.uffd_ctx = new_ctx; if (give_up_on_oom) vmg.give_up_on_oom = true; return vma_modify(&vmg); } /* * Expand vma by delta bytes, potentially merging with an immediately adjacent * VMA with identical properties. */ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long delta) { VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta); vmg.next = vma_iter_next_rewind(vmi, NULL); vmg.middle = NULL; /* We use the VMA to populate VMG fields only. */ return vma_merge_new_range(&vmg); } void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) { vb->count = 0; } static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb) { struct address_space *mapping; int i; mapping = vb->vmas[0]->vm_file->f_mapping; i_mmap_lock_write(mapping); for (i = 0; i < vb->count; i++) { VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping); __remove_shared_vm_struct(vb->vmas[i], mapping); } i_mmap_unlock_write(mapping); unlink_file_vma_batch_init(vb); } void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, struct vm_area_struct *vma) { if (vma->vm_file == NULL) return; if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) || vb->count == ARRAY_SIZE(vb->vmas)) unlink_file_vma_batch_process(vb); vb->vmas[vb->count] = vma; vb->count++; } void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) { if (vb->count > 0) unlink_file_vma_batch_process(vb); } static void vma_link_file(struct vm_area_struct *vma, bool hold_rmap_lock) { struct file *file = vma->vm_file; struct address_space *mapping; if (file) { mapping = file->f_mapping; i_mmap_lock_write(mapping); __vma_link_file(vma, mapping); if (!hold_rmap_lock) i_mmap_unlock_write(mapping); } } static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { VMA_ITERATOR(vmi, mm, 0); vma_iter_config(&vmi, vma->vm_start, vma->vm_end); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; vma_start_write(vma); vma_iter_store_new(&vmi, vma); vma_link_file(vma, /* hold_rmap_lock= */false); mm->map_count++; validate_mm(mm); return 0; } /* * Copy the vma structure to a new location in the same mm, * prior to moving page table entries, to effect an mremap move. */ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks) { struct vm_area_struct *vma = *vmap; unsigned long vma_start = vma->vm_start; struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; bool faulted_in_anon_vma = true; VMA_ITERATOR(vmi, mm, addr); VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len); /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. */ if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { pgoff = addr >> PAGE_SHIFT; faulted_in_anon_vma = false; } /* * If the VMA we are copying might contain a uprobe PTE, ensure * that we do not establish one upon merge. Otherwise, when mremap() * moves page tables, it will orphan the newly created PTE. */ if (vma->vm_file) vmg.skip_vma_uprobe = true; new_vma = find_vma_prev(mm, addr, &vmg.prev); if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ vmg.pgoff = pgoff; vmg.next = vma_iter_next_rewind(&vmi, NULL); new_vma = vma_merge_copied_range(&vmg); if (new_vma) { /* * Source vma may have been merged into new_vma */ if (unlikely(vma_start >= new_vma->vm_start && vma_start < new_vma->vm_end)) { /* * The only way we can get a vma_merge with * self during an mremap is if the vma hasn't * been faulted in yet and we were allowed to * reset the dst vma->vm_pgoff to the * destination address of the mremap to allow * the merge to happen. mremap must change the * vm_pgoff linearity between src and dst vmas * (in turn preventing a vma_merge) to be * safe. It is only safe to keep the vm_pgoff * linear if there are no pages mapped yet. */ VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); *vmap = vma = new_vma; } *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { new_vma = vm_area_dup(vma); if (!new_vma) goto out; vma_set_range(new_vma, addr, addr + len, pgoff); if (vma_dup_policy(vma, new_vma)) goto out_free_vma; if (anon_vma_clone(new_vma, vma, VMA_OP_REMAP)) goto out_free_mempol; if (new_vma->vm_file) get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); if (vma_link(mm, new_vma)) goto out_vma_link; *need_rmap_locks = false; } return new_vma; out_vma_link: fixup_hugetlb_reservations(new_vma); vma_close(new_vma); if (new_vma->vm_file) fput(new_vma->vm_file); unlink_anon_vmas(new_vma); out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: vm_area_free(new_vma); out: return NULL; } /* * Rough compatibility check to quickly see if it's even worth looking * at sharing an anon_vma. * * They need to have the same vm_file, and the flags can only differ * in things that mprotect may change. * * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that * we can merge the two vma's. For example, we refuse to merge a vma if * there is a vm_ops->close() function, because that indicates that the * driver is doing some kind of reference counting. But that doesn't * really matter for the anon_vma sharing case. */ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) { return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_IGNORE_MERGE)) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } /* * Do some basic sanity checking to see if we can re-use the anon_vma * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be * the same as 'old', the other will be the new one that is trying * to share the anon_vma. * * NOTE! This runs with mmap_lock held for reading, so it is possible that * the anon_vma of 'old' is concurrently in the process of being set up * by another page fault trying to merge _that_. But that's ok: if it * is being set up, that automatically means that it will be a singleton * acceptable for merging, so we can do all of this optimistically. But * we do that READ_ONCE() to make sure that we never re-load the pointer. * * IOW: that the "list_is_singular()" test on the anon_vma_chain only * matters for the 'stable anon_vma' case (ie the thing we want to avoid * is to return an anon_vma that is "complex" due to having gone through * a fork). * * We also make sure that the two vma's are compatible (adjacent, * and with the same memory policies). That's all stable, even with just * a read lock on the mmap_lock. */ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) { if (anon_vma_compatible(a, b)) { struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); if (anon_vma && list_is_singular(&old->anon_vma_chain)) return anon_vma; } return NULL; } /* * find_mergeable_anon_vma is used by anon_vma_prepare, to check * neighbouring vmas for a suitable anon_vma, before it goes off * to allocate a new anon_vma. It checks because a repetitive * sequence of mprotects and faults may otherwise lead to distinct * anon_vmas being allocated, preventing vma merge in subsequent * mprotect. */ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = NULL; struct vm_area_struct *prev, *next; VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); /* Try next first. */ next = vma_iter_load(&vmi); if (next) { anon_vma = reusable_anon_vma(next, vma, next); if (anon_vma) return anon_vma; } prev = vma_prev(&vmi); VM_BUG_ON_VMA(prev != vma, vma); prev = vma_prev(&vmi); /* Try prev next. */ if (prev) anon_vma = reusable_anon_vma(prev, prev, vma); /* * We might reach here with anon_vma == NULL if we can't find * any reusable anon_vma. * There's no absolute need to look only at touching neighbours: * we could search further afield for "compatible" anon_vmas. * But it would probably just be a waste of time searching, * or lead to too many vmas hanging off the same anon_vma. * We're trying to allow mprotect remerging later on, * not trying to minimize memory used for anon_vmas. */ return anon_vma; } static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) { return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); } static bool vma_is_shared_writable(struct vm_area_struct *vma) { return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == (VM_WRITE | VM_SHARED); } static bool vma_fs_can_writeback(struct vm_area_struct *vma) { /* No managed pages to writeback. */ if (vma->vm_flags & VM_PFNMAP) return false; return vma->vm_file && vma->vm_file->f_mapping && mapping_can_writeback(vma->vm_file->f_mapping); } /* * Does this VMA require the underlying folios to have their dirty state * tracked? */ bool vma_needs_dirty_tracking(struct vm_area_struct *vma) { /* Only shared, writable VMAs require dirty tracking. */ if (!vma_is_shared_writable(vma)) return false; /* Does the filesystem need to be notified? */ if (vm_ops_needs_writenotify(vma->vm_ops)) return true; /* * Even if the filesystem doesn't indicate a need for writenotify, if it * can writeback, dirty tracking is still required. */ return vma_fs_can_writeback(vma); } /* * Some shared mappings will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot * to the private version (using protection_map[] without the * VM_SHARED bit). */ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) { /* If it was private or non-writable, the write bit is already clear */ if (!vma_is_shared_writable(vma)) return false; /* The backer wishes to know when pages are first written to? */ if (vm_ops_needs_writenotify(vma->vm_ops)) return true; /* The open routine did something to the protections that pgprot_modify * won't preserve? */ if (pgprot_val(vm_page_prot) != pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) return false; /* * Do we need to track softdirty? hugetlb does not support softdirty * tracking yet. */ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) return true; /* Do we need write faults for uffd-wp tracking? */ if (userfaultfd_wp(vma)) return true; /* Can the mapping track the dirty pages? */ return vma_fs_can_writeback(vma); } static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); /* * We can safely modify head.next after taking the * anon_vma->root->rwsem. If some other vma in this mm shares * the same anon_vma we won't take it again. * * No need of atomic instructions here, head.next * can't change from under us thanks to the * anon_vma->root->rwsem. */ if (__test_and_set_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); } } static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) { if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { /* * AS_MM_ALL_LOCKS can't change from under us because * we hold the mm_all_locks_mutex. * * Operations on ->flags have to be atomic because * even if AS_MM_ALL_LOCKS is stable thanks to the * mm_all_locks_mutex, there may be other cpus * changing other bitflags in parallel to us. */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); } } /* * This operation locks against the VM for all pte/vma/mm related * operations that could ever happen on a certain mm. This includes * vmtruncate, try_to_unmap, and all page faults. * * The caller must take the mmap_lock in write mode before calling * mm_take_all_locks(). The caller isn't allowed to release the * mmap_lock until mm_drop_all_locks() returns. * * mmap_lock in write mode is required in order to block all operations * that could modify pagetables and free pages without need of * altering the vma layout. It's also needed in write mode to avoid new * anon_vmas to be associated with existing vmas. * * A single task can't take more than one mm_take_all_locks() in a row * or it would deadlock. * * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in * mapping->flags avoid to take the same lock twice, if more than one * vma in this mm is backed by the same anon_vma or address_space. * * We take locks in following order, accordingly to comment at beginning * of mm/rmap.c: * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for * hugetlb mapping); * - all vmas marked locked * - all i_mmap_rwsem locks; * - all anon_vma->rwseml * * We can take all locks within these types randomly because the VM code * doesn't nest them and we protected from parallel mm_take_all_locks() by * mm_all_locks_mutex. * * mm_take_all_locks() and mm_drop_all_locks are expensive operations * that may have to take thousand of locks. * * mm_take_all_locks() can fail if it's interrupted by signals. */ int mm_take_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; VMA_ITERATOR(vmi, mm, 0); mmap_assert_write_locked(mm); mutex_lock(&mm_all_locks_mutex); /* * vma_start_write() does not have a complement in mm_drop_all_locks() * because vma_start_write() is always asymmetrical; it marks a VMA as * being written to until mmap_write_unlock() or mmap_write_downgrade() * is reached. */ for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; vma_start_write(vma); } vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && is_vm_hugetlb_page(vma)) vm_lock_mapping(mm, vma->vm_file->f_mapping); } vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && !is_vm_hugetlb_page(vma)) vm_lock_mapping(mm, vma->vm_file->f_mapping); } vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_lock_anon_vma(mm, avc->anon_vma); } return 0; out_unlock: mm_drop_all_locks(mm); return -EINTR; } static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. * * We must however clear the bitflag before unlocking * the vma so the users using the anon_vma->rb_root will * never see our bitflag. * * No need of atomic instructions here, head.next * can't change from under us until we release the * anon_vma->root->rwsem. */ if (!__test_and_clear_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); anon_vma_unlock_write(anon_vma); } } static void vm_unlock_mapping(struct address_space *mapping) { if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { /* * AS_MM_ALL_LOCKS can't change to 0 from under us * because we hold the mm_all_locks_mutex. */ i_mmap_unlock_write(mapping); if (!test_and_clear_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); } } /* * The mmap_lock cannot be released by the caller until * mm_drop_all_locks() returns. */ void mm_drop_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; VMA_ITERATOR(vmi, mm, 0); mmap_assert_write_locked(mm); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); for_each_vma(vmi, vma) { if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_unlock_anon_vma(avc->anon_vma); if (vma->vm_file && vma->vm_file->f_mapping) vm_unlock_mapping(vma->vm_file->f_mapping); } mutex_unlock(&mm_all_locks_mutex); } /* * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. */ static bool accountable_mapping(struct file *file, vm_flags_t vm_flags) { /* * hugetlb has its own accounting separate from the core VM * VM_HUGETLB may not be set yet so we cannot check for that flag. */ if (file && is_file_hugepages(file)) return false; return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } /* * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap() * operation. * @vms: The vma unmap structure * @mas_detach: The maple state with the detached maple tree * * Reattach any detached vmas, free up the maple tree used to track the vmas. * If that's not possible because the ptes are cleared (and vm_ops->closed() may * have been called), then a NULL is written over the vmas and the vmas are * removed (munmap() completed). */ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct ma_state *mas = &vms->vmi->mas; if (!vms->nr_pages) return; if (vms->clear_ptes) return reattach_vmas(mas_detach); /* * Aborting cannot just call the vm_ops open() because they are often * not symmetrical and state data has been lost. Resort to the old * failure method of leaving a gap where the MAP_FIXED mapping failed. */ mas_set_range(mas, vms->start, vms->end - 1); mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL); /* Clean up the insertion of the unfortunate gap */ vms_complete_munmap_vmas(vms, mas_detach); } static void update_ksm_flags(struct mmap_state *map) { map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags); } static void set_desc_from_map(struct vm_area_desc *desc, const struct mmap_state *map) { desc->start = map->addr; desc->end = map->end; desc->pgoff = map->pgoff; desc->vm_file = map->file; desc->vma_flags = map->vma_flags; desc->page_prot = map->page_prot; } /* * __mmap_setup() - Prepare to gather any overlapping VMAs that need to be * unmapped once the map operation is completed, check limits, account mapping * and clean up any pre-existing VMAs. * * As a result it sets up the @map and @desc objects. * * @map: Mapping state. * @desc: VMA descriptor * @uf: Userfaultfd context list. * * Returns: 0 on success, error code otherwise. */ static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, struct list_head *uf) { int error; struct vma_iterator *vmi = map->vmi; struct vma_munmap_struct *vms = &map->vms; /* Find the first overlapping VMA and initialise unmap state. */ vms->vma = vma_find(vmi, map->end); init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf, /* unlock = */ false); /* OK, we have overlapping VMAs - prepare to unmap them. */ if (vms->vma) { mt_init_flags(&map->mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_on_stack(map->mt_detach); mas_init(&map->mas_detach, &map->mt_detach, /* addr = */ 0); /* Prepare to unmap any existing mapping in the area */ error = vms_gather_munmap_vmas(vms, &map->mas_detach); if (error) { /* On error VMAs will already have been reattached. */ vms->nr_pages = 0; return error; } map->next = vms->next; map->prev = vms->prev; } else { map->next = vma_iter_next_rewind(vmi, &map->prev); } /* Check against address space limit. */ if (!may_expand_vm(map->mm, map->vm_flags, map->pglen - vms->nr_pages)) return -ENOMEM; /* Private writable mapping: check memory availability. */ if (accountable_mapping(map->file, map->vm_flags)) { map->charged = map->pglen; map->charged -= vms->nr_accounted; if (map->charged) { error = security_vm_enough_memory_mm(map->mm, map->charged); if (error) return error; } vms->nr_accounted = 0; map->vm_flags |= VM_ACCOUNT; } /* * Clear PTEs while the vma is still in the tree so that rmap * cannot race with the freeing later in the truncate scenario. * This is also needed for mmap_file(), which is why vm_ops * close function is called. */ vms_clean_up_area(vms, &map->mas_detach); set_desc_from_map(desc, map); return 0; } static int __mmap_new_file_vma(struct mmap_state *map, struct vm_area_struct *vma) { struct vma_iterator *vmi = map->vmi; int error; vma->vm_file = map->file; if (!map->file_doesnt_need_get) get_file(map->file); if (!map->file->f_op->mmap) return 0; error = mmap_file(vma->vm_file, vma); if (error) { UNMAP_STATE(unmap, vmi, vma, vma->vm_start, vma->vm_end, map->prev, map->next); fput(vma->vm_file); vma->vm_file = NULL; vma_iter_set(vmi, vma->vm_end); /* Undo any partial mapping done by a device driver. */ unmap_region(&unmap); return error; } /* Drivers cannot alter the address of the VMA. */ WARN_ON_ONCE(map->addr != vma->vm_start); /* * Drivers should not permit writability when previously it was * disallowed. */ VM_WARN_ON_ONCE(map->vm_flags != vma->vm_flags && !(map->vm_flags & VM_MAYWRITE) && (vma->vm_flags & VM_MAYWRITE)); map->file = vma->vm_file; map->vm_flags = vma->vm_flags; return 0; } /* * __mmap_new_vma() - Allocate a new VMA for the region, as merging was not * possible. * * @map: Mapping state. * @vmap: Output pointer for the new VMA. * * Returns: Zero on success, or an error. */ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) { struct vma_iterator *vmi = map->vmi; int error = 0; struct vm_area_struct *vma; /* * Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ vma = vm_area_alloc(map->mm); if (!vma) return -ENOMEM; vma_iter_config(vmi, map->addr, map->end); vma_set_range(vma, map->addr, map->end, map->pgoff); vm_flags_init(vma, map->vm_flags); vma->vm_page_prot = map->page_prot; if (vma_iter_prealloc(vmi, vma)) { error = -ENOMEM; goto free_vma; } if (map->file) error = __mmap_new_file_vma(map, vma); else if (map->vm_flags & VM_SHARED) error = shmem_zero_setup(vma); else vma_set_anonymous(vma); if (error) goto free_iter_vma; if (!map->check_ksm_early) { update_ksm_flags(map); vm_flags_init(vma, map->vm_flags); } #ifdef CONFIG_SPARC64 /* TODO: Fix SPARC ADI! */ WARN_ON_ONCE(!arch_validate_flags(map->vm_flags)); #endif /* Lock the VMA since it is modified after insertion into VMA tree */ vma_start_write(vma); vma_iter_store_new(vmi, vma); map->mm->map_count++; vma_link_file(vma, map->hold_file_rmap_lock); /* * vma_merge_new_range() calls khugepaged_enter_vma() too, the below * call covers the non-merge case. */ if (!vma_is_anonymous(vma)) khugepaged_enter_vma(vma, map->vm_flags); *vmap = vma; return 0; free_iter_vma: vma_iter_free(vmi); free_vma: vm_area_free(vma); return error; } /* * __mmap_complete() - Unmap any VMAs we overlap, account memory mapping * statistics, handle locking and finalise the VMA. * * @map: Mapping state. * @vma: Merged or newly allocated VMA for the mmap()'d region. */ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) { struct mm_struct *mm = map->mm; vm_flags_t vm_flags = vma->vm_flags; perf_event_mmap(vma); /* Unmap any existing mapping in the area. */ vms_complete_munmap_vmas(&map->vms, &map->mas_detach); vm_stat_account(mm, vma->vm_flags, map->pglen); if (vm_flags & VM_LOCKED) { if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(mm)) vm_flags_clear(vma, VM_LOCKED_MASK); else mm->locked_vm += map->pglen; } if (vma->vm_file) uprobe_mmap(vma); /* * New (or expanded) vma always get soft dirty status. * Otherwise user-space soft-dirty page tracker won't * be able to distinguish situation when vma area unmapped, * then new mapped in-place (which must be aimed as * a completely new data area). */ if (pgtable_supports_soft_dirty()) vm_flags_set(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); } static void call_action_prepare(struct mmap_state *map, struct vm_area_desc *desc) { struct mmap_action *action = &desc->action; mmap_action_prepare(action, desc); if (action->hide_from_rmap_until_complete) map->hold_file_rmap_lock = true; } /* * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that * specifies it. * * This is called prior to any merge attempt, and updates whitelisted fields * that are permitted to be updated by the caller. * * All but user-defined fields will be pre-populated with original values. * * Returns 0 on success, or an error code otherwise. */ static int call_mmap_prepare(struct mmap_state *map, struct vm_area_desc *desc) { int err; /* Invoke the hook. */ err = vfs_mmap_prepare(map->file, desc); if (err) return err; call_action_prepare(map, desc); /* Update fields permitted to be changed. */ map->pgoff = desc->pgoff; if (desc->vm_file != map->file) { map->file_doesnt_need_get = true; map->file = desc->vm_file; } map->vma_flags = desc->vma_flags; map->page_prot = desc->page_prot; /* User-defined fields. */ map->vm_ops = desc->vm_ops; map->vm_private_data = desc->private_data; return 0; } static void set_vma_user_defined_fields(struct vm_area_struct *vma, struct mmap_state *map) { if (map->vm_ops) vma->vm_ops = map->vm_ops; vma->vm_private_data = map->vm_private_data; } /* * Are we guaranteed no driver can change state such as to preclude KSM merging? * If so, let's set the KSM mergeable flag early so we don't break VMA merging. */ static bool can_set_ksm_flags_early(struct mmap_state *map) { struct file *file = map->file; /* Anonymous mappings have no driver which can change them. */ if (!file) return true; /* * If .mmap_prepare() is specified, then the driver will have already * manipulated state prior to updating KSM flags. So no need to worry * about mmap callbacks modifying VMA flags after the KSM flag has been * updated here, which could otherwise affect KSM eligibility. */ if (file->f_op->mmap_prepare) return true; /* shmem is safe. */ if (shmem_file(file)) return true; /* Any other .mmap callback is not safe. */ return false; } static int call_action_complete(struct mmap_state *map, struct vm_area_desc *desc, struct vm_area_struct *vma) { struct mmap_action *action = &desc->action; int ret; ret = mmap_action_complete(action, vma); /* If we held the file rmap we need to release it. */ if (map->hold_file_rmap_lock) { struct file *file = vma->vm_file; i_mmap_unlock_write(file->f_mapping); } return ret; } static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; bool have_mmap_prepare = file && file->f_op->mmap_prepare; VMA_ITERATOR(vmi, mm, addr); MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); struct vm_area_desc desc = { .mm = mm, .file = file, .action = { .type = MMAP_NOTHING, /* Default to no further action. */ }, }; bool allocated_new = false; int error; map.check_ksm_early = can_set_ksm_flags_early(&map); error = __mmap_setup(&map, &desc, uf); if (!error && have_mmap_prepare) error = call_mmap_prepare(&map, &desc); if (error) goto abort_munmap; if (map.check_ksm_early) update_ksm_flags(&map); /* Attempt to merge with adjacent VMAs... */ if (map.prev || map.next) { VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL); vma = vma_merge_new_range(&vmg); } /* ...but if we can't, allocate a new VMA. */ if (!vma) { error = __mmap_new_vma(&map, &vma); if (error) goto unacct_error; allocated_new = true; } if (have_mmap_prepare) set_vma_user_defined_fields(vma, &map); __mmap_complete(&map, vma); if (have_mmap_prepare && allocated_new) { error = call_action_complete(&map, &desc, vma); if (error) return error; } return addr; /* Accounting was done by __mmap_setup(). */ unacct_error: if (map.charged) vm_unacct_memory(map.charged); abort_munmap: vms_abort_munmap_vmas(&map.vms, &map.mas_detach); return error; } /** * mmap_region() - Actually perform the userland mapping of a VMA into * current->mm with known, aligned and overflow-checked @addr and @len, and * correctly determined VMA flags @vm_flags and page offset @pgoff. * * This is an internal memory management function, and should not be used * directly. * * The caller must write-lock current->mm->mmap_lock. * * @file: If a file-backed mapping, a pointer to the struct file describing the * file to be mapped, otherwise NULL. * @addr: The page-aligned address at which to perform the mapping. * @len: The page-aligned, non-zero, length of the mapping. * @vm_flags: The VMA flags which should be applied to the mapping. * @pgoff: If @file is specified, the page offset into the file, if not then * the virtual page offset in memory of the anonymous mapping. * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap * events. * * Returns: Either an error, or the address at which the requested mapping has * been performed. */ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { unsigned long ret; bool writable_file_mapping = false; mmap_assert_write_locked(current->mm); /* Check to see if MDWE is applicable. */ if (map_deny_write_exec(vm_flags, vm_flags)) return -EACCES; /* Allow architectures to sanity-check the vm_flags. */ if (!arch_validate_flags(vm_flags)) return -EINVAL; /* Map writable and ensure this isn't a sealed memfd. */ if (file && is_shared_maywrite_vm_flags(vm_flags)) { int error = mapping_map_writable(file->f_mapping); if (error) return error; writable_file_mapping = true; } ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); /* Clear our write mapping regardless of error. */ if (writable_file_mapping) mapping_unmap_writable(file->f_mapping); validate_mm(current->mm); return ret; } /* * do_brk_flags() - Increase the brk vma if the flags match. * @vmi: The vma iterator * @addr: The start address * @len: The length of the increase * @vma: The vma, * @vm_flags: The VMA Flags * * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags * do not match then create a new anonymous VMA. Eventually we may be able to * do some brk-specific accounting here. */ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; /* * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. */ vm_flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; vm_flags = ksm_vma_flags(mm, NULL, vm_flags); if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > sysctl_max_map_count) return -ENOMEM; if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) return -ENOMEM; /* * Expand the existing vma if possible; Note that singular lists do not * occur after forking, so the expand will only happen on new VMAs. */ if (vma && vma->vm_end == addr) { VMG_STATE(vmg, mm, vmi, addr, addr + len, vm_flags, PHYS_PFN(addr)); vmg.prev = vma; /* vmi is positioned at prev, which this mode expects. */ vmg.just_expand = true; if (vma_merge_new_range(&vmg)) goto out; else if (vmg_nomem(&vmg)) goto unacct_fail; } if (vma) vma_iter_next_range(vmi); /* create a vma struct for an anonymous mapping */ vma = vm_area_alloc(mm); if (!vma) goto unacct_fail; vma_set_anonymous(vma); vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); vm_flags_init(vma, vm_flags); vma->vm_page_prot = vm_get_page_prot(vm_flags); vma_start_write(vma); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; mm->map_count++; validate_mm(mm); out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; mm->data_vm += len >> PAGE_SHIFT; if (vm_flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); if (pgtable_supports_soft_dirty()) vm_flags_set(vma, VM_SOFTDIRTY); return 0; mas_store_fail: vm_area_free(vma); unacct_fail: vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; } /** * unmapped_area() - Find an area between the low_limit and the high_limit with * the correct alignment and offset, all from @info. Note: current->mm is used * for the search. * * @info: The unmapped area information including the range [low_limit - * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ unsigned long unmapped_area(struct vm_unmapped_area_info *info) { unsigned long length, gap; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; VMA_ITERATOR(vmi, current->mm, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask + info->start_gap; if (length < info->length) return -ENOMEM; low_limit = info->low_limit; if (low_limit < mmap_min_addr) low_limit = mmap_min_addr; high_limit = info->high_limit; retry: if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length)) return -ENOMEM; /* * Adjust for the gap first so it doesn't interfere with the later * alignment. The first step is the minimum needed to fulfill the start * gap, the next step is the minimum to align that. It is the minimum * needed to fulfill both. */ gap = vma_iter_addr(&vmi) + info->start_gap; gap += (info->align_offset - gap) & info->align_mask; tmp = vma_next(&vmi); if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ if (vm_start_gap(tmp) < gap + length - 1) { low_limit = tmp->vm_end; vma_iter_reset(&vmi); goto retry; } } else { tmp = vma_prev(&vmi); if (tmp && vm_end_gap(tmp) > gap) { low_limit = vm_end_gap(tmp); vma_iter_reset(&vmi); goto retry; } } return gap; } /** * unmapped_area_topdown() - Find an area between the low_limit and the * high_limit with the correct alignment and offset at the highest available * address, all from @info. Note: current->mm is used for the search. * * @info: The unmapped area information including the range [low_limit - * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { unsigned long length, gap, gap_end; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; VMA_ITERATOR(vmi, current->mm, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask + info->start_gap; if (length < info->length) return -ENOMEM; low_limit = info->low_limit; if (low_limit < mmap_min_addr) low_limit = mmap_min_addr; high_limit = info->high_limit; retry: if (vma_iter_area_highest(&vmi, low_limit, high_limit, length)) return -ENOMEM; gap = vma_iter_end(&vmi) - info->length; gap -= (gap - info->align_offset) & info->align_mask; gap_end = vma_iter_end(&vmi); tmp = vma_next(&vmi); if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ if (vm_start_gap(tmp) < gap_end) { high_limit = vm_start_gap(tmp); vma_iter_reset(&vmi); goto retry; } } else { tmp = vma_prev(&vmi); if (tmp && vm_end_gap(tmp) > gap) { high_limit = tmp->vm_start; vma_iter_reset(&vmi); goto retry; } } return gap; } /* * Verify that the stack growth is acceptable and * update accounting. This is shared with both the * grow-up and grow-down cases. */ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) { struct mm_struct *mm = vma->vm_mm; unsigned long new_start; /* address space limit tests */ if (!may_expand_vm(mm, vma->vm_flags, grow)) return -ENOMEM; /* Stack limit test */ if (size > rlimit(RLIMIT_STACK)) return -ENOMEM; /* mlock limit tests */ if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, grow << PAGE_SHIFT)) return -ENOMEM; /* Check to ensure the stack will not grow into a hugetlb-only region */ new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : vma->vm_end - size; if (is_hugepage_only_range(vma->vm_mm, new_start, size)) return -EFAULT; /* * Overcommit.. This must be the final test, as it will * update security statistics. */ if (security_vm_enough_memory_mm(mm, grow)) return -ENOMEM; return 0; } #if defined(CONFIG_STACK_GROWSUP) /* * PA-RISC uses this for its stack. * vma is the last one with address > vma->vm_end. Have to extend vma. */ int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next; unsigned long gap_addr; int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; mmap_assert_write_locked(mm); /* Guard against exceeding limits of the address space. */ address &= PAGE_MASK; if (address >= (TASK_SIZE & PAGE_MASK)) return -ENOMEM; address += PAGE_SIZE; /* Enforce stack_guard_gap */ gap_addr = address + stack_guard_gap; /* Guard against overflow */ if (gap_addr < address || gap_addr > TASK_SIZE) gap_addr = TASK_SIZE; next = find_vma_intersection(mm, vma->vm_end, gap_addr); if (next && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ } if (next) vma_iter_prev_range_limit(&vmi, address); vma_iter_config(&vmi, vma->vm_start, address); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) { vma_iter_free(&vmi); return -ENOMEM; } /* Lock the VMA before expanding to prevent concurrent page faults */ vma_start_write(vma); /* We update the anon VMA tree. */ anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address > vma->vm_end) { unsigned long size, grow; size = address - vma->vm_start; grow = (address - vma->vm_end) >> PAGE_SHIFT; error = -ENOMEM; if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; /* Overwrite old entry in mtree. */ vma_iter_store_overwrite(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } } anon_vma_unlock_write(vma->anon_vma); vma_iter_free(&vmi); validate_mm(mm); return error; } #endif /* CONFIG_STACK_GROWSUP */ /* * vma is the first one with address < vma->vm_start. Have to extend vma. * mmap_lock held for writing. */ int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *prev; int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); if (!(vma->vm_flags & VM_GROWSDOWN)) return -EFAULT; mmap_assert_write_locked(mm); address &= PAGE_MASK; if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) return -EPERM; /* Enforce stack_guard_gap */ prev = vma_prev(&vmi); /* Check that both stack segments have the same anon_vma? */ if (prev) { if (!(prev->vm_flags & VM_GROWSDOWN) && vma_is_accessible(prev) && (address - prev->vm_end < stack_guard_gap)) return -ENOMEM; } if (prev) vma_iter_next_range_limit(&vmi, vma->vm_start); vma_iter_config(&vmi, address, vma->vm_end); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) { vma_iter_free(&vmi); return -ENOMEM; } /* Lock the VMA before expanding to prevent concurrent page faults */ vma_start_write(vma); /* We update the anon VMA tree. */ anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address < vma->vm_start) { unsigned long size, grow; size = vma->vm_end - address; grow = (vma->vm_start - address) >> PAGE_SHIFT; error = -ENOMEM; if (grow <= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; /* Overwrite old entry in mtree. */ vma_iter_store_overwrite(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } } anon_vma_unlock_write(vma->anon_vma); vma_iter_free(&vmi); validate_mm(mm); return error; } int __vm_munmap(unsigned long start, size_t len, bool unlock) { int ret; struct mm_struct *mm = current->mm; LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, start); if (mmap_write_lock_killable(mm)) return -EINTR; ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock); if (ret || !unlock) mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); return ret; } /* Insert vm structure into process list sorted by address * and into the inode's i_mmap tree. If vm_file is non-NULL * then i_mmap_rwsem is taken here. */ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { unsigned long charged = vma_pages(vma); if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; /* * The vm_pgoff of a purely anonymous vma should be irrelevant * until its first write fault, when page's anon_vma and index * are set. But now set the vm_pgoff it will almost certainly * end up with (unless mremap moves it elsewhere before that * first wfault), so /proc/pid/maps tells a consistent story. * * By setting it to reflect the virtual start address of the * vma, merges and splits can happen in a seamless way, just * using the existing file pgoff checks and manipulations. * Similarly in do_mmap and in do_brk_flags. */ if (vma_is_anonymous(vma)) { BUG_ON(vma->anon_vma); vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } if (vma_link(mm, vma)) { if (vma->vm_flags & VM_ACCOUNT) vm_unacct_memory(charged); return -ENOMEM; } return 0; }
80 40 40 5 6 17 63 64 64 64 44 44 43 19 19 19 19 19 10 10 10 10 5 5 5 4 4 4 2 2 2 3 3 3 40 40 40 40 40 20 20 20 20 20 20 20 40 40 40 40 39 48 49 49 49 49 5 5 50 50 51 51 2 2 2 10 10 10 43 44 44 2 2 49 49 49 49 49 49 16 16 16 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 /* SPDX-License-Identifier: GPL-2.0 */ /* * Portions of this file * Copyright(c) 2016 Intel Deutschland GmbH * Copyright (C) 2018-2019, 2021-2025 Intel Corporation */ #ifndef __MAC80211_DRIVER_OPS #define __MAC80211_DRIVER_OPS #include <linux/fips.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "trace.h" #define check_sdata_in_driver(sdata) ({ \ WARN_ONCE(!sdata->local->reconfig_failure && \ !(sdata->flags & IEEE80211_SDATA_IN_DRIVER), \ "%s: Failed check-sdata-in-driver check, flags: 0x%x\n", \ sdata->dev ? sdata->dev->name : sdata->name, sdata->flags); \ !!(sdata->flags & IEEE80211_SDATA_IN_DRIVER); \ }) static inline struct ieee80211_sub_if_data * get_bss_sdata(struct ieee80211_sub_if_data *sdata) { if (sdata && sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); return sdata; } static inline void drv_tx(struct ieee80211_local *local, struct ieee80211_tx_control *control, struct sk_buff *skb) { local->ops->tx(&local->hw, control, skb); } static inline void drv_sync_rx_queues(struct ieee80211_local *local, struct sta_info *sta) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (local->ops->sync_rx_queues) { trace_drv_sync_rx_queues(local, sta->sdata, &sta->sta); local->ops->sync_rx_queues(&local->hw); trace_drv_return_void(local); } } static inline void drv_get_et_strings(struct ieee80211_sub_if_data *sdata, u32 sset, u8 *data) { struct ieee80211_local *local = sdata->local; if (local->ops->get_et_strings) { trace_drv_get_et_strings(local, sset); local->ops->get_et_strings(&local->hw, &sdata->vif, sset, data); trace_drv_return_void(local); } } static inline void drv_get_et_stats(struct ieee80211_sub_if_data *sdata, struct ethtool_stats *stats, u64 *data) { struct ieee80211_local *local = sdata->local; if (local->ops->get_et_stats) { trace_drv_get_et_stats(local); local->ops->get_et_stats(&local->hw, &sdata->vif, stats, data); trace_drv_return_void(local); } } static inline int drv_get_et_sset_count(struct ieee80211_sub_if_data *sdata, int sset) { struct ieee80211_local *local = sdata->local; int rv = 0; if (local->ops->get_et_sset_count) { trace_drv_get_et_sset_count(local, sset); rv = local->ops->get_et_sset_count(&local->hw, &sdata->vif, sset); trace_drv_return_int(local, rv); } return rv; } int drv_start(struct ieee80211_local *local); void drv_stop(struct ieee80211_local *local, bool suspend); #ifdef CONFIG_PM static inline int drv_suspend(struct ieee80211_local *local, struct cfg80211_wowlan *wowlan) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_suspend(local); ret = local->ops->suspend(&local->hw, wowlan); trace_drv_return_int(local, ret); return ret; } static inline int drv_resume(struct ieee80211_local *local) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_resume(local); ret = local->ops->resume(&local->hw); trace_drv_return_int(local, ret); return ret; } static inline void drv_set_wakeup(struct ieee80211_local *local, bool enabled) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!local->ops->set_wakeup) return; trace_drv_set_wakeup(local, enabled); local->ops->set_wakeup(&local->hw, enabled); trace_drv_return_void(local); } #endif int drv_add_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); int drv_change_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type, bool p2p); void drv_remove_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); static inline int drv_config(struct ieee80211_local *local, int radio_idx, u32 changed) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_config(local, radio_idx, changed); ret = local->ops->config(&local->hw, radio_idx, changed); trace_drv_return_int(local, ret); return ret; } static inline void drv_vif_cfg_changed(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 changed) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_vif_cfg_changed(local, sdata, changed); if (local->ops->vif_cfg_changed) local->ops->vif_cfg_changed(&local->hw, &sdata->vif, changed); else if (local->ops->bss_info_changed) local->ops->bss_info_changed(&local->hw, &sdata->vif, &sdata->vif.bss_conf, changed); trace_drv_return_void(local); } void drv_link_info_changed(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *info, int link_id, u64 changed); static inline u64 drv_prepare_multicast(struct ieee80211_local *local, struct netdev_hw_addr_list *mc_list) { u64 ret = 0; trace_drv_prepare_multicast(local, mc_list->count); if (local->ops->prepare_multicast) ret = local->ops->prepare_multicast(&local->hw, mc_list); trace_drv_return_u64(local, ret); return ret; } static inline void drv_configure_filter(struct ieee80211_local *local, unsigned int changed_flags, unsigned int *total_flags, u64 multicast) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_configure_filter(local, changed_flags, total_flags, multicast); local->ops->configure_filter(&local->hw, changed_flags, total_flags, multicast); trace_drv_return_void(local); } static inline void drv_config_iface_filter(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, unsigned int filter_flags, unsigned int changed_flags) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_config_iface_filter(local, sdata, filter_flags, changed_flags); if (local->ops->config_iface_filter) local->ops->config_iface_filter(&local->hw, &sdata->vif, filter_flags, changed_flags); trace_drv_return_void(local); } static inline int drv_set_tim(struct ieee80211_local *local, struct ieee80211_sta *sta, bool set) { int ret = 0; trace_drv_set_tim(local, sta, set); if (local->ops->set_tim) ret = local->ops->set_tim(&local->hw, sta, set); trace_drv_return_int(local, ret); return ret; } int drv_set_key(struct ieee80211_local *local, enum set_key_cmd cmd, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_key_conf *key); static inline void drv_update_tkip_key(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_key_conf *conf, struct sta_info *sta, u32 iv32, u16 *phase1key) { struct ieee80211_sta *ista = NULL; if (sta) ista = &sta->sta; sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_update_tkip_key(local, sdata, conf, ista, iv32); if (local->ops->update_tkip_key) local->ops->update_tkip_key(&local->hw, &sdata->vif, conf, ista, iv32, phase1key); trace_drv_return_void(local); } static inline int drv_hw_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_scan_request *req) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_hw_scan(local, sdata); ret = local->ops->hw_scan(&local->hw, &sdata->vif, req); trace_drv_return_int(local, ret); return ret; } static inline void drv_cancel_hw_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_cancel_hw_scan(local, sdata); local->ops->cancel_hw_scan(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline int drv_sched_scan_start(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req, struct ieee80211_scan_ies *ies) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sched_scan_start(local, sdata); ret = local->ops->sched_scan_start(&local->hw, &sdata->vif, req, ies); trace_drv_return_int(local, ret); return ret; } static inline int drv_sched_scan_stop(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sched_scan_stop(local, sdata); ret = local->ops->sched_scan_stop(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); return ret; } static inline void drv_sw_scan_start(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const u8 *mac_addr) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_sw_scan_start(local, sdata, mac_addr); if (local->ops->sw_scan_start) local->ops->sw_scan_start(&local->hw, &sdata->vif, mac_addr); trace_drv_return_void(local); } static inline void drv_sw_scan_complete(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_sw_scan_complete(local, sdata); if (local->ops->sw_scan_complete) local->ops->sw_scan_complete(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline int drv_get_stats(struct ieee80211_local *local, struct ieee80211_low_level_stats *stats) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (local->ops->get_stats) ret = local->ops->get_stats(&local->hw, stats); trace_drv_get_stats(local, stats, ret); return ret; } static inline void drv_get_key_seq(struct ieee80211_local *local, struct ieee80211_key *key, struct ieee80211_key_seq *seq) { if (local->ops->get_key_seq) local->ops->get_key_seq(&local->hw, &key->conf, seq); trace_drv_get_key_seq(local, &key->conf); } static inline int drv_set_frag_threshold(struct ieee80211_local *local, int radio_idx, u32 value) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_set_frag_threshold(local, radio_idx, value); if (local->ops->set_frag_threshold) ret = local->ops->set_frag_threshold(&local->hw, radio_idx, value); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_rts_threshold(struct ieee80211_local *local, int radio_idx, u32 value) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_set_rts_threshold(local, radio_idx, value); if (local->ops->set_rts_threshold) ret = local->ops->set_rts_threshold(&local->hw, radio_idx, value); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_coverage_class(struct ieee80211_local *local, int radio_idx, s16 value) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_set_coverage_class(local, radio_idx, value); if (local->ops->set_coverage_class) local->ops->set_coverage_class(&local->hw, radio_idx, value); else ret = -EOPNOTSUPP; trace_drv_return_int(local, ret); return ret; } static inline void drv_sta_notify(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum sta_notify_cmd cmd, struct ieee80211_sta *sta) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_notify(local, sdata, cmd, sta); if (local->ops->sta_notify) local->ops->sta_notify(&local->hw, &sdata->vif, cmd, sta); trace_drv_return_void(local); } static inline int drv_sta_add(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sta_add(local, sdata, sta); if (local->ops->sta_add) ret = local->ops->sta_add(&local->hw, &sdata->vif, sta); trace_drv_return_int(local, ret); return ret; } static inline void drv_sta_remove(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_remove(local, sdata, sta); if (local->ops->sta_remove) local->ops->sta_remove(&local->hw, &sdata->vif, sta); trace_drv_return_void(local); } #ifdef CONFIG_MAC80211_DEBUGFS static inline void drv_vif_add_debugfs(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); if (sdata->vif.type == NL80211_IFTYPE_MONITOR || WARN_ON(!sdata->vif.debugfs_dir)) return; sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; if (local->ops->vif_add_debugfs) local->ops->vif_add_debugfs(&local->hw, &sdata->vif); } static inline void drv_link_add_debugfs(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf, struct dentry *dir) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; if (local->ops->link_add_debugfs) local->ops->link_add_debugfs(&local->hw, &sdata->vif, link_conf, dir); } static inline void drv_sta_add_debugfs(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct dentry *dir) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; if (local->ops->sta_add_debugfs) local->ops->sta_add_debugfs(&local->hw, &sdata->vif, sta, dir); } static inline void drv_link_sta_add_debugfs(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_link_sta *link_sta, struct dentry *dir) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; if (local->ops->link_sta_add_debugfs) local->ops->link_sta_add_debugfs(&local->hw, &sdata->vif, link_sta, dir); } #else static inline void drv_vif_add_debugfs(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); } #endif static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_pre_rcu_remove(local, sdata, &sta->sta); if (local->ops->sta_pre_rcu_remove) local->ops->sta_pre_rcu_remove(&local->hw, &sdata->vif, &sta->sta); trace_drv_return_void(local); } __must_check int drv_sta_state(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta, enum ieee80211_sta_state old_state, enum ieee80211_sta_state new_state); __must_check int drv_sta_set_txpwr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta); void drv_link_sta_rc_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_link_sta *link_sta, u32 changed); static inline void drv_sta_rate_tbl_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_rate_tbl_update(local, sdata, sta); if (local->ops->sta_rate_tbl_update) local->ops->sta_rate_tbl_update(&local->hw, &sdata->vif, sta); trace_drv_return_void(local); } static inline void drv_sta_statistics(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct station_info *sinfo) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_statistics(local, sdata, sta); if (local->ops->sta_statistics) local->ops->sta_statistics(&local->hw, &sdata->vif, sta, sinfo); trace_drv_return_void(local); } static inline void drv_link_sta_statistics(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_link_sta *link_sta, struct link_station_info *link_sinfo) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_link_sta_statistics(local, sdata, link_sta); if (local->ops->link_sta_statistics) local->ops->link_sta_statistics(&local->hw, &sdata->vif, link_sta, link_sinfo); trace_drv_return_void(local); } int drv_conf_tx(struct ieee80211_local *local, struct ieee80211_link_data *link, u16 ac, const struct ieee80211_tx_queue_params *params); u64 drv_get_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void drv_set_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 tsf); void drv_offset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, s64 offset); void drv_reset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); static inline int drv_tx_last_beacon(struct ieee80211_local *local) { int ret = 0; /* default unsupported op for less congestion */ might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_tx_last_beacon(local); if (local->ops->tx_last_beacon) ret = local->ops->tx_last_beacon(&local->hw); trace_drv_return_int(local, ret); return ret; } int drv_ampdu_action(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_ampdu_params *params); static inline int drv_get_survey(struct ieee80211_local *local, int idx, struct survey_info *survey) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_get_survey(local, idx, survey); if (local->ops->get_survey) ret = local->ops->get_survey(&local->hw, idx, survey); trace_drv_return_int(local, ret); return ret; } static inline void drv_rfkill_poll(struct ieee80211_local *local) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (local->ops->rfkill_poll) local->ops->rfkill_poll(&local->hw); } static inline void drv_flush(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u32 queues, bool drop) { struct ieee80211_vif *vif; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); vif = sdata ? &sdata->vif : NULL; if (sdata && !check_sdata_in_driver(sdata)) return; trace_drv_flush(local, queues, drop); if (local->ops->flush) local->ops->flush(&local->hw, vif, queues, drop); trace_drv_return_void(local); } static inline void drv_flush_sta(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (sdata && !check_sdata_in_driver(sdata)) return; if (!sta->uploaded) return; trace_drv_flush_sta(local, sdata, &sta->sta); if (local->ops->flush_sta) local->ops->flush_sta(&local->hw, &sdata->vif, &sta->sta); trace_drv_return_void(local); } static inline void drv_channel_switch(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_channel_switch(local, sdata, ch_switch); local->ops->channel_switch(&local->hw, &sdata->vif, ch_switch); trace_drv_return_void(local); } static inline int drv_set_antenna(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (local->ops->set_antenna) ret = local->ops->set_antenna(&local->hw, -1, tx_ant, rx_ant); trace_drv_set_antenna(local, tx_ant, rx_ant, ret); return ret; } static inline int drv_get_antenna(struct ieee80211_local *local, int radio_idx, u32 *tx_ant, u32 *rx_ant) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (local->ops->get_antenna) ret = local->ops->get_antenna(&local->hw, radio_idx, tx_ant, rx_ant); trace_drv_get_antenna(local, radio_idx, *tx_ant, *rx_ant, ret); return ret; } static inline int drv_remain_on_channel(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *chan, unsigned int duration, enum ieee80211_roc_type type) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_remain_on_channel(local, sdata, chan, duration, type); ret = local->ops->remain_on_channel(&local->hw, &sdata->vif, chan, duration, type); trace_drv_return_int(local, ret); return ret; } static inline int drv_cancel_remain_on_channel(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_cancel_remain_on_channel(local, sdata); ret = local->ops->cancel_remain_on_channel(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_ringparam(struct ieee80211_local *local, u32 tx, u32 rx) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_set_ringparam(local, tx, rx); if (local->ops->set_ringparam) ret = local->ops->set_ringparam(&local->hw, tx, rx); trace_drv_return_int(local, ret); return ret; } static inline void drv_get_ringparam(struct ieee80211_local *local, u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_get_ringparam(local, tx, tx_max, rx, rx_max); if (local->ops->get_ringparam) local->ops->get_ringparam(&local->hw, tx, tx_max, rx, rx_max); trace_drv_return_void(local); } static inline bool drv_tx_frames_pending(struct ieee80211_local *local) { bool ret = false; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_tx_frames_pending(local); if (local->ops->tx_frames_pending) ret = local->ops->tx_frames_pending(&local->hw); trace_drv_return_bool(local, ret); return ret; } static inline int drv_set_bitrate_mask(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const struct cfg80211_bitrate_mask *mask) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_set_bitrate_mask(local, sdata, mask); if (local->ops->set_bitrate_mask) ret = local->ops->set_bitrate_mask(&local->hw, &sdata->vif, mask); trace_drv_return_int(local, ret); return ret; } static inline void drv_set_rekey_data(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_gtk_rekey_data *data) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; if (fips_enabled) return; trace_drv_set_rekey_data(local, sdata, data); if (local->ops->set_rekey_data) local->ops->set_rekey_data(&local->hw, &sdata->vif, data); trace_drv_return_void(local); } static inline void drv_event_callback(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const struct ieee80211_event *event) { trace_drv_event_callback(local, sdata, event); if (local->ops->event_callback) local->ops->event_callback(&local->hw, &sdata->vif, event); trace_drv_return_void(local); } static inline void drv_release_buffered_frames(struct ieee80211_local *local, struct sta_info *sta, u16 tids, int num_frames, enum ieee80211_frame_release_type reason, bool more_data) { trace_drv_release_buffered_frames(local, &sta->sta, tids, num_frames, reason, more_data); if (local->ops->release_buffered_frames) local->ops->release_buffered_frames(&local->hw, &sta->sta, tids, num_frames, reason, more_data); trace_drv_return_void(local); } static inline void drv_allow_buffered_frames(struct ieee80211_local *local, struct sta_info *sta, u16 tids, int num_frames, enum ieee80211_frame_release_type reason, bool more_data) { trace_drv_allow_buffered_frames(local, &sta->sta, tids, num_frames, reason, more_data); if (local->ops->allow_buffered_frames) local->ops->allow_buffered_frames(&local->hw, &sta->sta, tids, num_frames, reason, more_data); trace_drv_return_void(local); } static inline void drv_mgd_prepare_tx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_prep_tx_info *info) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); info->link_id = info->link_id < 0 ? 0 : info->link_id; trace_drv_mgd_prepare_tx(local, sdata, info->duration, info->subtype, info->success); if (local->ops->mgd_prepare_tx) local->ops->mgd_prepare_tx(&local->hw, &sdata->vif, info); trace_drv_return_void(local); } static inline void drv_mgd_complete_tx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_prep_tx_info *info) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); info->link_id = info->link_id < 0 ? 0 : info->link_id; trace_drv_mgd_complete_tx(local, sdata, info->duration, info->subtype, info->success); if (local->ops->mgd_complete_tx) local->ops->mgd_complete_tx(&local->hw, &sdata->vif, info); trace_drv_return_void(local); } static inline void drv_mgd_protect_tdls_discover(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, int link_id) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); link_id = link_id > 0 ? link_id : 0; trace_drv_mgd_protect_tdls_discover(local, sdata); if (local->ops->mgd_protect_tdls_discover) local->ops->mgd_protect_tdls_discover(&local->hw, &sdata->vif, link_id); trace_drv_return_void(local); } static inline int drv_add_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_add_chanctx(local, ctx); if (local->ops->add_chanctx) ret = local->ops->add_chanctx(&local->hw, &ctx->conf); trace_drv_return_int(local, ret); if (!ret) ctx->driver_present = true; return ret; } static inline void drv_remove_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(!ctx->driver_present)) return; trace_drv_remove_chanctx(local, ctx); if (local->ops->remove_chanctx) local->ops->remove_chanctx(&local->hw, &ctx->conf); trace_drv_return_void(local); ctx->driver_present = false; } static inline void drv_change_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, u32 changed) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_change_chanctx(local, ctx, changed); if (local->ops->change_chanctx) { WARN_ON_ONCE(!ctx->driver_present); local->ops->change_chanctx(&local->hw, &ctx->conf, changed); } trace_drv_return_void(local); } int drv_assign_vif_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx *ctx); void drv_unassign_vif_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx *ctx); int drv_switch_vif_chanctx(struct ieee80211_local *local, struct ieee80211_vif_chanctx_switch *vifs, int n_vifs, enum ieee80211_chanctx_switch_mode mode); static inline int drv_start_ap(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_start_ap(local, sdata, link_conf); if (local->ops->start_ap) ret = local->ops->start_ap(&local->hw, &sdata->vif, link_conf); trace_drv_return_int(local, ret); return ret; } static inline void drv_stop_ap(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_stop_ap(local, sdata, link_conf); if (local->ops->stop_ap) local->ops->stop_ap(&local->hw, &sdata->vif, link_conf); trace_drv_return_void(local); } static inline void drv_reconfig_complete(struct ieee80211_local *local, enum ieee80211_reconfig_type reconfig_type) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_reconfig_complete(local, reconfig_type); if (local->ops->reconfig_complete) local->ops->reconfig_complete(&local->hw, reconfig_type); trace_drv_return_void(local); } static inline void drv_set_default_unicast_key(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, int key_idx) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; WARN_ON_ONCE(key_idx < -1 || key_idx > 3); trace_drv_set_default_unicast_key(local, sdata, key_idx); if (local->ops->set_default_unicast_key) local->ops->set_default_unicast_key(&local->hw, &sdata->vif, key_idx); trace_drv_return_void(local); } #if IS_ENABLED(CONFIG_IPV6) static inline void drv_ipv6_addr_change(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct inet6_dev *idev) { trace_drv_ipv6_addr_change(local, sdata); if (local->ops->ipv6_addr_change) local->ops->ipv6_addr_change(&local->hw, &sdata->vif, idev); trace_drv_return_void(local); } #endif static inline void drv_channel_switch_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_chan_def *chandef) { struct ieee80211_local *local = sdata->local; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (local->ops->channel_switch_beacon) { trace_drv_channel_switch_beacon(local, sdata, chandef); local->ops->channel_switch_beacon(&local->hw, &sdata->vif, chandef); } } static inline int drv_pre_channel_switch(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch) { struct ieee80211_local *local = sdata->local; int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; if (!ieee80211_vif_link_active(&sdata->vif, ch_switch->link_id)) return 0; trace_drv_pre_channel_switch(local, sdata, ch_switch); if (local->ops->pre_channel_switch) ret = local->ops->pre_channel_switch(&local->hw, &sdata->vif, ch_switch); trace_drv_return_int(local, ret); return ret; } static inline int drv_post_channel_switch(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) return 0; trace_drv_post_channel_switch(local, sdata); if (local->ops->post_channel_switch) ret = local->ops->post_channel_switch(&local->hw, &sdata->vif, link->conf); trace_drv_return_int(local, ret); return ret; } static inline void drv_abort_channel_switch(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) return; trace_drv_abort_channel_switch(local, sdata); if (local->ops->abort_channel_switch) local->ops->abort_channel_switch(&local->hw, &sdata->vif, link->conf); } static inline void drv_channel_switch_rx_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch) { struct ieee80211_local *local = sdata->local; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; if (!ieee80211_vif_link_active(&sdata->vif, ch_switch->link_id)) return; trace_drv_channel_switch_rx_beacon(local, sdata, ch_switch); if (local->ops->channel_switch_rx_beacon) local->ops->channel_switch_rx_beacon(&local->hw, &sdata->vif, ch_switch); } static inline int drv_join_ibss(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_join_ibss(local, sdata, &sdata->vif.bss_conf); if (local->ops->join_ibss) ret = local->ops->join_ibss(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); return ret; } static inline void drv_leave_ibss(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_leave_ibss(local, sdata); if (local->ops->leave_ibss) local->ops->leave_ibss(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline u32 drv_get_expected_throughput(struct ieee80211_local *local, struct sta_info *sta) { u32 ret = 0; trace_drv_get_expected_throughput(&sta->sta); if (local->ops->get_expected_throughput && sta->uploaded) ret = local->ops->get_expected_throughput(&local->hw, &sta->sta); trace_drv_return_u32(local, ret); return ret; } static inline int drv_get_txpower(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, unsigned int link_id, int *dbm) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!local->ops->get_txpower) return -EOPNOTSUPP; ret = local->ops->get_txpower(&local->hw, &sdata->vif, link_id, dbm); trace_drv_get_txpower(local, sdata, link_id, *dbm, ret); return ret; } static inline int drv_tdls_channel_switch(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u8 oper_class, struct cfg80211_chan_def *chandef, struct sk_buff *tmpl_skb, u32 ch_sw_tm_ie) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; if (!local->ops->tdls_channel_switch) return -EOPNOTSUPP; trace_drv_tdls_channel_switch(local, sdata, sta, oper_class, chandef); ret = local->ops->tdls_channel_switch(&local->hw, &sdata->vif, sta, oper_class, chandef, tmpl_skb, ch_sw_tm_ie); trace_drv_return_int(local, ret); return ret; } static inline void drv_tdls_cancel_channel_switch(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; if (!local->ops->tdls_cancel_channel_switch) return; trace_drv_tdls_cancel_channel_switch(local, sdata, sta); local->ops->tdls_cancel_channel_switch(&local->hw, &sdata->vif, sta); trace_drv_return_void(local); } static inline void drv_tdls_recv_channel_switch(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_tdls_ch_sw_params *params) { trace_drv_tdls_recv_channel_switch(local, sdata, params); if (local->ops->tdls_recv_channel_switch) local->ops->tdls_recv_channel_switch(&local->hw, &sdata->vif, params); trace_drv_return_void(local); } static inline void drv_wake_tx_queue(struct ieee80211_local *local, struct txq_info *txq) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->txq.vif); /* In reconfig don't transmit now, but mark for waking later */ if (local->in_reconfig) { set_bit(IEEE80211_TXQ_DIRTY, &txq->flags); return; } if (!check_sdata_in_driver(sdata)) return; trace_drv_wake_tx_queue(local, sdata, txq); local->ops->wake_tx_queue(&local->hw, &txq->txq); } static inline void schedule_and_wake_txq(struct ieee80211_local *local, struct txq_info *txqi) { ieee80211_schedule_txq(&local->hw, &txqi->txq); drv_wake_tx_queue(local, txqi); } static inline int drv_can_aggregate_in_amsdu(struct ieee80211_local *local, struct sk_buff *head, struct sk_buff *skb) { if (!local->ops->can_aggregate_in_amsdu) return true; return local->ops->can_aggregate_in_amsdu(&local->hw, head, skb); } static inline int drv_get_ftm_responder_stats(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_ftm_responder_stats *ftm_stats) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; if (local->ops->get_ftm_responder_stats) ret = local->ops->get_ftm_responder_stats(&local->hw, &sdata->vif, ftm_stats); trace_drv_get_ftm_responder_stats(local, sdata, ftm_stats); return ret; } static inline int drv_start_pmsr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_pmsr_request *request) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_start_pmsr(local, sdata); if (local->ops->start_pmsr) ret = local->ops->start_pmsr(&local->hw, &sdata->vif, request); trace_drv_return_int(local, ret); return ret; } static inline void drv_abort_pmsr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_pmsr_request *request) { trace_drv_abort_pmsr(local, sdata); might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; if (local->ops->abort_pmsr) local->ops->abort_pmsr(&local->hw, &sdata->vif, request); trace_drv_return_void(local); } static inline int drv_start_nan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_nan_conf *conf) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); check_sdata_in_driver(sdata); trace_drv_start_nan(local, sdata, conf); ret = local->ops->start_nan(&local->hw, &sdata->vif, conf); trace_drv_return_int(local, ret); return ret; } static inline void drv_stop_nan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); check_sdata_in_driver(sdata); trace_drv_stop_nan(local, sdata); local->ops->stop_nan(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline int drv_nan_change_conf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_nan_conf *conf, u32 changes) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); check_sdata_in_driver(sdata); if (!local->ops->nan_change_conf) return -EOPNOTSUPP; trace_drv_nan_change_conf(local, sdata, conf, changes); ret = local->ops->nan_change_conf(&local->hw, &sdata->vif, conf, changes); trace_drv_return_int(local, ret); return ret; } static inline int drv_add_nan_func(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const struct cfg80211_nan_func *nan_func) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); check_sdata_in_driver(sdata); if (!local->ops->add_nan_func) return -EOPNOTSUPP; trace_drv_add_nan_func(local, sdata, nan_func); ret = local->ops->add_nan_func(&local->hw, &sdata->vif, nan_func); trace_drv_return_int(local, ret); return ret; } static inline void drv_del_nan_func(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u8 instance_id) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); check_sdata_in_driver(sdata); trace_drv_del_nan_func(local, sdata, instance_id); if (local->ops->del_nan_func) local->ops->del_nan_func(&local->hw, &sdata->vif, instance_id); trace_drv_return_void(local); } static inline int drv_set_tid_config(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct cfg80211_tid_config *tid_conf) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); ret = local->ops->set_tid_config(&local->hw, &sdata->vif, sta, tid_conf); trace_drv_return_int(local, ret); return ret; } static inline int drv_reset_tid_config(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u8 tids) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); ret = local->ops->reset_tid_config(&local->hw, &sdata->vif, sta, tids); trace_drv_return_int(local, ret); return ret; } static inline void drv_update_vif_offload(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); check_sdata_in_driver(sdata); if (!local->ops->update_vif_offload) return; trace_drv_update_vif_offload(local, sdata); local->ops->update_vif_offload(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline void drv_sta_set_4addr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, bool enabled) { sdata = get_bss_sdata(sdata); might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_set_4addr(local, sdata, sta, enabled); if (local->ops->sta_set_4addr) local->ops->sta_set_4addr(&local->hw, &sdata->vif, sta, enabled); trace_drv_return_void(local); } static inline void drv_sta_set_decap_offload(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, bool enabled) { sdata = get_bss_sdata(sdata); might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_set_decap_offload(local, sdata, sta, enabled); if (local->ops->sta_set_decap_offload) local->ops->sta_set_decap_offload(&local->hw, &sdata->vif, sta, enabled); trace_drv_return_void(local); } static inline void drv_add_twt_setup(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_twt_setup *twt) { struct ieee80211_twt_params *twt_agrt; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; twt_agrt = (void *)twt->params; trace_drv_add_twt_setup(local, sta, twt, twt_agrt); local->ops->add_twt_setup(&local->hw, sta, twt); trace_drv_return_void(local); } static inline void drv_twt_teardown_request(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u8 flowid) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; if (!local->ops->twt_teardown_request) return; trace_drv_twt_teardown_request(local, sta, flowid); local->ops->twt_teardown_request(&local->hw, sta, flowid); trace_drv_return_void(local); } static inline int drv_net_fill_forward_path(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct net_device_path_ctx *ctx, struct net_device_path *path) { int ret = -EOPNOTSUPP; sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_net_fill_forward_path(local, sdata, sta); if (local->ops->net_fill_forward_path) ret = local->ops->net_fill_forward_path(&local->hw, &sdata->vif, sta, ctx, path); trace_drv_return_int(local, ret); return ret; } static inline int drv_net_setup_tc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct net_device *dev, enum tc_setup_type type, void *type_data) { int ret = -EOPNOTSUPP; might_sleep(); sdata = get_bss_sdata(sdata); trace_drv_net_setup_tc(local, sdata, type); if (local->ops->net_setup_tc) ret = local->ops->net_setup_tc(&local->hw, &sdata->vif, dev, type, type_data); trace_drv_return_int(local, ret); return ret; } static inline bool drv_can_activate_links(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u16 active_links) { bool ret = true; lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return false; trace_drv_can_activate_links(local, sdata, active_links); if (local->ops->can_activate_links) ret = local->ops->can_activate_links(&local->hw, &sdata->vif, active_links); trace_drv_return_bool(local, ret); return ret; } int drv_change_vif_links(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u16 old_links, u16 new_links, struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS]); int drv_change_sta_links(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u16 old_links, u16 new_links); static inline enum ieee80211_neg_ttlm_res drv_can_neg_ttlm(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_neg_ttlm *neg_ttlm) { enum ieee80211_neg_ttlm_res res = NEG_TTLM_RES_REJECT; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_can_neg_ttlm(local, sdata, neg_ttlm); if (local->ops->can_neg_ttlm) res = local->ops->can_neg_ttlm(&local->hw, &sdata->vif, neg_ttlm); trace_drv_neg_ttlm_res(local, sdata, res, neg_ttlm); return res; } static inline void drv_prep_add_interface(struct ieee80211_local *local, enum nl80211_iftype type) { trace_drv_prep_add_interface(local, type); if (local->ops->prep_add_interface) local->ops->prep_add_interface(&local->hw, type); trace_drv_return_void(local); } static inline int drv_set_eml_op_mode(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_eml_params *eml_params) { struct ieee80211_local *local = sdata->local; int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_set_eml_op_mode(local, sdata, sta, eml_params->link_id, eml_params->control, eml_params->link_bitmap); if (local->ops->set_eml_op_mode) ret = local->ops->set_eml_op_mode(&local->hw, &sdata->vif, sta, eml_params); trace_drv_return_int(local, ret); return ret; } #endif /* __MAC80211_DRIVER_OPS */
11 13 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Trace point definitions for the RDMA Connect Manager. * * Author: Chuck Lever <chuck.lever@oracle.com> * * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM rdma_cma #if !defined(_TRACE_RDMA_CMA_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_RDMA_CMA_H #include <linux/tracepoint.h> #include <trace/misc/rdma.h> DECLARE_EVENT_CLASS(cma_fsm_class, TP_PROTO( const struct rdma_id_private *id_priv ), TP_ARGS(id_priv), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos ) ); #define DEFINE_CMA_FSM_EVENT(name) \ DEFINE_EVENT(cma_fsm_class, cm_##name, \ TP_PROTO( \ const struct rdma_id_private *id_priv \ ), \ TP_ARGS(id_priv)) DEFINE_CMA_FSM_EVENT(send_rtu); DEFINE_CMA_FSM_EVENT(send_rej); DEFINE_CMA_FSM_EVENT(prepare_mra); DEFINE_CMA_FSM_EVENT(send_sidr_req); DEFINE_CMA_FSM_EVENT(send_sidr_rep); DEFINE_CMA_FSM_EVENT(disconnect); DEFINE_CMA_FSM_EVENT(sent_drep); DEFINE_CMA_FSM_EVENT(sent_dreq); DEFINE_CMA_FSM_EVENT(id_destroy); TRACE_EVENT(cm_id_attach, TP_PROTO( const struct rdma_id_private *id_priv, const struct ib_device *device ), TP_ARGS(id_priv, device), TP_STRUCT__entry( __field(u32, cm_id) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) __string(devname, device->name) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); __assign_str(devname); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc device=%s", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __get_str(devname) ) ); DECLARE_EVENT_CLASS(cma_qp_class, TP_PROTO( const struct rdma_id_private *id_priv ), TP_ARGS(id_priv), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __field(u32, qp_num) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; __entry->qp_num = id_priv->qp_num; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u qp_num=%u", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, __entry->qp_num ) ); #define DEFINE_CMA_QP_EVENT(name) \ DEFINE_EVENT(cma_qp_class, cm_##name, \ TP_PROTO( \ const struct rdma_id_private *id_priv \ ), \ TP_ARGS(id_priv)) DEFINE_CMA_QP_EVENT(send_req); DEFINE_CMA_QP_EVENT(send_rep); DEFINE_CMA_QP_EVENT(qp_destroy); /* * enum ib_wp_type, from include/rdma/ib_verbs.h */ #define IB_QP_TYPE_LIST \ ib_qp_type(SMI) \ ib_qp_type(GSI) \ ib_qp_type(RC) \ ib_qp_type(UC) \ ib_qp_type(UD) \ ib_qp_type(RAW_IPV6) \ ib_qp_type(RAW_ETHERTYPE) \ ib_qp_type(RAW_PACKET) \ ib_qp_type(XRC_INI) \ ib_qp_type_end(XRC_TGT) #undef ib_qp_type #undef ib_qp_type_end #define ib_qp_type(x) TRACE_DEFINE_ENUM(IB_QPT_##x); #define ib_qp_type_end(x) TRACE_DEFINE_ENUM(IB_QPT_##x); IB_QP_TYPE_LIST #undef ib_qp_type #undef ib_qp_type_end #define ib_qp_type(x) { IB_QPT_##x, #x }, #define ib_qp_type_end(x) { IB_QPT_##x, #x } #define rdma_show_qp_type(x) \ __print_symbolic(x, IB_QP_TYPE_LIST) TRACE_EVENT(cm_qp_create, TP_PROTO( const struct rdma_id_private *id_priv, const struct ib_pd *pd, const struct ib_qp_init_attr *qp_init_attr, int rc ), TP_ARGS(id_priv, pd, qp_init_attr, rc), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, pd_id) __field(u32, tos) __field(u32, qp_num) __field(u32, send_wr) __field(u32, recv_wr) __field(int, rc) __field(unsigned long, qp_type) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->pd_id = pd->res.id; __entry->tos = id_priv->tos; __entry->send_wr = qp_init_attr->cap.max_send_wr; __entry->recv_wr = qp_init_attr->cap.max_recv_wr; __entry->rc = rc; if (!rc) { __entry->qp_num = id_priv->qp_num; __entry->qp_type = id_priv->id.qp_type; } else { __entry->qp_num = 0; __entry->qp_type = 0; } memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u pd.id=%u qp_type=%s" " send_wr=%u recv_wr=%u qp_num=%u rc=%d", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, __entry->pd_id, rdma_show_qp_type(__entry->qp_type), __entry->send_wr, __entry->recv_wr, __entry->qp_num, __entry->rc ) ); TRACE_EVENT(cm_req_handler, TP_PROTO( const struct rdma_id_private *id_priv, int event ), TP_ARGS(id_priv, event), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __field(unsigned long, event) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; __entry->event = event; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s (%lu)", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, rdma_show_ib_cm_event(__entry->event), __entry->event ) ); TRACE_EVENT(cm_event_handler, TP_PROTO( const struct rdma_id_private *id_priv, const struct rdma_cm_event *event ), TP_ARGS(id_priv, event), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __field(unsigned long, event) __field(int, status) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; __entry->event = event->event; __entry->status = event->status; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s (%lu/%d)", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, rdma_show_cm_event(__entry->event), __entry->event, __entry->status ) ); TRACE_EVENT(cm_event_done, TP_PROTO( const struct rdma_id_private *id_priv, const struct rdma_cm_event *event, int result ), TP_ARGS(id_priv, event, result), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __field(unsigned long, event) __field(int, result) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; __entry->event = event->event; __entry->result = result; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s consumer returns %d", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, rdma_show_cm_event(__entry->event), __entry->result ) ); DECLARE_EVENT_CLASS(cma_client_class, TP_PROTO( const struct ib_device *device ), TP_ARGS(device), TP_STRUCT__entry( __string(name, device->name) ), TP_fast_assign( __assign_str(name); ), TP_printk("device name=%s", __get_str(name) ) ); #define DEFINE_CMA_CLIENT_EVENT(name) \ DEFINE_EVENT(cma_client_class, cm_##name, \ TP_PROTO( \ const struct ib_device *device \ ), \ TP_ARGS(device)) DEFINE_CMA_CLIENT_EVENT(add_one); DEFINE_CMA_CLIENT_EVENT(remove_one); #endif /* _TRACE_RDMA_CMA_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #define TRACE_INCLUDE_FILE cma_trace #include <trace/define_trace.h>
38 38 38 38 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* include/asm-generic/tlb.h * * Generic TLB shootdown code * * Copyright 2001 Red Hat, Inc. * Based on code from mm/memory.c Copyright Linus Torvalds and others. * * Copyright 2011 Red Hat, Inc., Peter Zijlstra */ #ifndef _ASM_GENERIC__TLB_H #define _ASM_GENERIC__TLB_H #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/hugetlb_inline.h> #include <asm/tlbflush.h> #include <asm/cacheflush.h> /* * Blindly accessing user memory from NMI context can be dangerous * if we're in the middle of switching the current user task or switching * the loaded mm. */ #ifndef nmi_uaccess_okay # define nmi_uaccess_okay() true #endif #ifdef CONFIG_MMU /* * Generic MMU-gather implementation. * * The mmu_gather data structure is used by the mm code to implement the * correct and efficient ordering of freeing pages and TLB invalidations. * * This correct ordering is: * * 1) unhook page * 2) TLB invalidate page * 3) free page * * That is, we must never free a page before we have ensured there are no live * translations left to it. Otherwise it might be possible to observe (or * worse, change) the page content after it has been reused. * * The mmu_gather API consists of: * * - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_gather_mmu_vma() / * tlb_finish_mmu() * * start and finish a mmu_gather * * Finish in particular will issue a (final) TLB invalidate and free * all (remaining) queued pages. * * - tlb_start_vma() / tlb_end_vma(); marks the start / end of a VMA * * Defaults to flushing at tlb_end_vma() to reset the range; helps when * there's large holes between the VMAs. * * - tlb_free_vmas() * * tlb_free_vmas() marks the start of unlinking of one or more vmas * and freeing page-tables. * * - tlb_remove_table() * * tlb_remove_table() is the basic primitive to free page-table directories * (__p*_free_tlb()). In it's most primitive form it is an alias for * tlb_remove_page() below, for when page directories are pages and have no * additional constraints. * * See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE. * * - tlb_remove_page() / tlb_remove_page_size() * - __tlb_remove_folio_pages() / __tlb_remove_page_size() * - __tlb_remove_folio_pages_size() * * __tlb_remove_folio_pages_size() is the basic primitive that queues pages * for freeing. It will return a boolean indicating if the queue is (now) * full and a call to tlb_flush_mmu() is required. * * tlb_remove_page() and tlb_remove_page_size() imply the call to * tlb_flush_mmu() when required and has no return value. * * __tlb_remove_folio_pages() is similar to __tlb_remove_page_size(), * however, instead of removing a single page, assume PAGE_SIZE and remove * the given number of consecutive pages that are all part of the * same (large) folio. * * - tlb_change_page_size() * * call before __tlb_remove_page*() to set the current page-size; implies a * possible tlb_flush_mmu() call. * * - tlb_flush_mmu() / tlb_flush_mmu_tlbonly() * * tlb_flush_mmu_tlbonly() - does the TLB invalidate (and resets * related state, like the range) * * tlb_flush_mmu() - in addition to the above TLB invalidate, also frees * whatever pages are still batched. * * - mmu_gather::fullmm * * A flag set by tlb_gather_mmu_fullmm() to indicate we're going to free * the entire mm; this allows a number of optimizations. * * - We can ignore tlb_{start,end}_vma(); because we don't * care about ranges. Everything will be shot down. * * - (RISC) architectures that use ASIDs can cycle to a new ASID * and delay the invalidation until ASID space runs out. * * - mmu_gather::need_flush_all * * A flag that can be set by the arch code if it wants to force * flush the entire TLB irrespective of the range. For instance * x86-PAE needs this when changing top-level entries. * * And allows the architecture to provide and implement tlb_flush(): * * tlb_flush() may, in addition to the above mentioned mmu_gather fields, make * use of: * * - mmu_gather::start / mmu_gather::end * * which provides the range that needs to be flushed to cover the pages to * be freed. * * - mmu_gather::freed_tables * * set when we freed page table pages * * - tlb_get_unmap_shift() / tlb_get_unmap_size() * * returns the smallest TLB entry size unmapped in this range. * * If an architecture does not provide tlb_flush() a default implementation * based on flush_tlb_range() will be used, unless MMU_GATHER_NO_RANGE is * specified, in which case we'll default to flush_tlb_mm(). * * Additionally there are a few opt-in features: * * MMU_GATHER_PAGE_SIZE * * This ensures we call tlb_flush() every time tlb_change_page_size() actually * changes the size and provides mmu_gather::page_size to tlb_flush(). * * This might be useful if your architecture has size specific TLB * invalidation instructions. * * MMU_GATHER_TABLE_FREE * * This provides tlb_remove_table(), to be used instead of tlb_remove_page() * for page directores (__p*_free_tlb()). * * Useful if your architecture has non-page page directories. * * When used, an architecture is expected to provide __tlb_remove_table() or * use the generic __tlb_remove_table(), which does the actual freeing of these * pages. * * MMU_GATHER_RCU_TABLE_FREE * * Like MMU_GATHER_TABLE_FREE, and adds semi-RCU semantics to the free (see * comment below). * * Useful if your architecture doesn't use IPIs for remote TLB invalidates * and therefore doesn't naturally serialize with software page-table walkers. * * MMU_GATHER_NO_FLUSH_CACHE * * Indicates the architecture has flush_cache_range() but it needs *NOT* be called * before unmapping a VMA. * * NOTE: strictly speaking we shouldn't have this knob and instead rely on * flush_cache_range() being a NOP, except Sparc64 seems to be * different here. * * MMU_GATHER_MERGE_VMAS * * Indicates the architecture wants to merge ranges over VMAs; typical when * multiple range invalidates are more expensive than a full invalidate. * * MMU_GATHER_NO_RANGE * * Use this if your architecture lacks an efficient flush_tlb_range(). This * option implies MMU_GATHER_MERGE_VMAS above. * * MMU_GATHER_NO_GATHER * * If the option is set the mmu_gather will not track individual pages for * delayed page free anymore. A platform that enables the option needs to * provide its own implementation of the __tlb_remove_page_size() function to * free pages. * * This is useful if your architecture already flushes TLB entries in the * various ptep_get_and_clear() functions. */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch { #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE struct rcu_head rcu; #endif unsigned int nr; void *tables[]; }; #define MAX_TABLE_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) #ifndef CONFIG_HAVE_ARCH_TLB_REMOVE_TABLE static inline void __tlb_remove_table(void *table) { struct ptdesc *ptdesc = (struct ptdesc *)table; pagetable_dtor_free(ptdesc); } #endif extern void tlb_remove_table(struct mmu_gather *tlb, void *table); #else /* !CONFIG_MMU_GATHER_TABLE_FREE */ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page); /* * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based * page directories and we can use the normal page batching to free them. */ static inline void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct ptdesc *ptdesc = (struct ptdesc *)table; pagetable_dtor(ptdesc); tlb_remove_page(tlb, ptdesc_page(ptdesc)); } #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* * This allows an architecture that does not use the linux page-tables for * hardware to skip the TLBI when freeing page tables. */ #ifndef tlb_needs_table_invalidate #define tlb_needs_table_invalidate() (true) #endif void tlb_remove_table_sync_one(void); #else #ifdef tlb_needs_table_invalidate #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE #endif static inline void tlb_remove_table_sync_one(void) { } #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ #ifndef CONFIG_MMU_GATHER_NO_GATHER /* * If we can't allocate a page to make a big batch of page pointers * to work on, then just handle a few from the on-stack structure. */ #define MMU_GATHER_BUNDLE 8 struct mmu_gather_batch { struct mmu_gather_batch *next; unsigned int nr; unsigned int max; struct encoded_page *encoded_pages[]; }; #define MAX_GATHER_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *)) /* * Limit the maximum number of mmu_gather batches to reduce a risk of soft * lockups for non-preemptible kernels on huge machines when a lot of memory * is zapped during unmapping. * 10K pages freed at once should be safe even without a preemption point. */ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size); bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap); #ifdef CONFIG_SMP /* * This both sets 'delayed_rmap', and returns true. It would be an inline * function, except we define it before the 'struct mmu_gather'. */ #define tlb_delay_rmap(tlb) (((tlb)->delayed_rmap = 1), true) extern void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma); #endif #endif /* * We have a no-op version of the rmap removal that doesn't * delay anything. That is used on S390, which flushes remote * TLBs synchronously, and on UP, which doesn't have any * remote TLBs to flush and is not preemptible due to this * all happening under the page table lock. */ #ifndef tlb_delay_rmap #define tlb_delay_rmap(tlb) (false) static inline void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { } #endif /* * struct mmu_gather is an opaque type used by the mm code for passing around * any data needed by arch specific code for tlb_remove_page. */ struct mmu_gather { struct mm_struct *mm; #ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch *batch; #endif unsigned long start; unsigned long end; /* * we are in the middle of an operation to clear * a full mm and can make some optimizations */ unsigned int fullmm : 1; /* * we have performed an operation which * requires a complete flush of the tlb */ unsigned int need_flush_all : 1; /* * we have removed page directories */ unsigned int freed_tables : 1; /* * Do we have pending delayed rmap removals? */ unsigned int delayed_rmap : 1; /* * at which levels have we cleared entries? */ unsigned int cleared_ptes : 1; unsigned int cleared_pmds : 1; unsigned int cleared_puds : 1; unsigned int cleared_p4ds : 1; /* * tracks VM_EXEC | VM_HUGETLB in tlb_start_vma */ unsigned int vma_exec : 1; unsigned int vma_huge : 1; unsigned int vma_pfn : 1; /* * Did we unshare (unmap) any shared page tables? For now only * used for hugetlb PMD table sharing. */ unsigned int unshared_tables : 1; /* * Did we unshare any page tables such that they are now exclusive * and could get reused+modified by the new owner? When setting this * flag, "unshared_tables" will be set as well. For now only used * for hugetlb PMD table sharing. */ unsigned int fully_unshared_tables : 1; unsigned int batch_count; #ifndef CONFIG_MMU_GATHER_NO_GATHER struct mmu_gather_batch *active; struct mmu_gather_batch local; struct page *__pages[MMU_GATHER_BUNDLE]; #ifdef CONFIG_MMU_GATHER_PAGE_SIZE unsigned int page_size; #endif #endif }; void tlb_flush_mmu(struct mmu_gather *tlb); static inline void __tlb_adjust_range(struct mmu_gather *tlb, unsigned long address, unsigned int range_size) { tlb->start = min(tlb->start, address); tlb->end = max(tlb->end, address + range_size); } static inline void __tlb_reset_range(struct mmu_gather *tlb) { if (tlb->fullmm) { tlb->start = tlb->end = ~0; } else { tlb->start = TASK_SIZE; tlb->end = 0; } tlb->freed_tables = 0; tlb->cleared_ptes = 0; tlb->cleared_pmds = 0; tlb->cleared_puds = 0; tlb->cleared_p4ds = 0; tlb->unshared_tables = 0; /* * Do not reset mmu_gather::vma_* fields here, we do not * call into tlb_start_vma() again to set them if there is an * intermediate flush. */ } #ifdef CONFIG_MMU_GATHER_NO_RANGE #if defined(tlb_flush) #error MMU_GATHER_NO_RANGE relies on default tlb_flush() #endif /* * When an architecture does not have efficient means of range flushing TLBs * there is no point in doing intermediate flushes on tlb_end_vma() to keep the * range small. We equally don't have to worry about page granularity or other * things. * * All we need to do is issue a full flush for any !0 range. */ static inline void tlb_flush(struct mmu_gather *tlb) { if (tlb->end) flush_tlb_mm(tlb->mm); } #else /* CONFIG_MMU_GATHER_NO_RANGE */ #ifndef tlb_flush /* * When an architecture does not provide its own tlb_flush() implementation * but does have a reasonably efficient flush_vma_range() implementation * use that. */ static inline void tlb_flush(struct mmu_gather *tlb) { if (tlb->fullmm || tlb->need_flush_all) { flush_tlb_mm(tlb->mm); } else if (tlb->end) { struct vm_area_struct vma = { .vm_mm = tlb->mm, .vm_flags = (tlb->vma_exec ? VM_EXEC : 0) | (tlb->vma_huge ? VM_HUGETLB : 0), }; flush_tlb_range(&vma, tlb->start, tlb->end); } } #endif #endif /* CONFIG_MMU_GATHER_NO_RANGE */ static inline void tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { /* * flush_tlb_range() implementations that look at VM_HUGETLB (tile, * mips-4k) flush only large pages. * * flush_tlb_range() implementations that flush I-TLB also flush D-TLB * (tile, xtensa, arm), so it's ok to just add VM_EXEC to an existing * range. * * We rely on tlb_end_vma() to issue a flush, such that when we reset * these values the batch is empty. */ tlb->vma_huge = is_vm_hugetlb_page(vma); tlb->vma_exec = !!(vma->vm_flags & VM_EXEC); /* * Track if there's at least one VM_PFNMAP/VM_MIXEDMAP vma * in the tracked range, see tlb_free_vmas(). */ tlb->vma_pfn |= !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)); } static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { /* * Anything calling __tlb_adjust_range() also sets at least one of * these bits. */ if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds || tlb->cleared_puds || tlb->cleared_p4ds || tlb->unshared_tables)) return; tlb_flush(tlb); __tlb_reset_range(tlb); } static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { if (__tlb_remove_page_size(tlb, page, page_size)) tlb_flush_mmu(tlb); } static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { return tlb_remove_page_size(tlb, page, PAGE_SIZE); } static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt) { tlb_remove_table(tlb, pt); } static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { #ifdef CONFIG_MMU_GATHER_PAGE_SIZE if (tlb->page_size && tlb->page_size != page_size) { if (!tlb->fullmm && !tlb->need_flush_all) tlb_flush_mmu(tlb); } tlb->page_size = page_size; #endif } static inline unsigned long tlb_get_unmap_shift(struct mmu_gather *tlb) { if (tlb->cleared_ptes) return PAGE_SHIFT; if (tlb->cleared_pmds) return PMD_SHIFT; if (tlb->cleared_puds) return PUD_SHIFT; if (tlb->cleared_p4ds) return P4D_SHIFT; return PAGE_SHIFT; } static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb) { return 1UL << tlb_get_unmap_shift(tlb); } /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. When we're doing a munmap, * the vmas are adjusted to only cover the region to be torn down. */ static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm) return; tlb_update_vma_flags(tlb, vma); #ifndef CONFIG_MMU_GATHER_NO_FLUSH_CACHE flush_cache_range(vma, vma->vm_start, vma->vm_end); #endif } static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) return; /* * Do a TLB flush and reset the range at VMA boundaries; this avoids * the ranges growing with the unused space between consecutive VMAs, * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on * this. */ tlb_flush_mmu_tlbonly(tlb); } static inline void tlb_free_vmas(struct mmu_gather *tlb) { if (tlb->fullmm) return; /* * VM_PFNMAP is more fragile because the core mm will not track the * page mapcount -- there might not be page-frames for these PFNs * after all. * * Specifically() there is a race between munmap() and * unmap_mapping_range(), where munmap() will unlink the VMA, such * that unmap_mapping_range() will no longer observe the VMA and * no-op, without observing the TLBI, returning prematurely. * * So if we're about to unlink such a VMA, and we have pending * TLBI for such a vma, flush things now. */ if (tlb->vma_pfn) tlb_flush_mmu_tlbonly(tlb); } /* * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end, * and set corresponding cleared_*. */ static inline void tlb_flush_pte_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_ptes = 1; } static inline void tlb_flush_pmd_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_pmds = 1; } static inline void tlb_flush_pud_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_puds = 1; } static inline void tlb_flush_p4d_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_p4ds = 1; } #ifndef __tlb_remove_tlb_entry static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address) { } #endif /** * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation. * * Record the fact that pte's were really unmapped by updating the range, * so we can later optimise away the tlb invalidate. This helps when * userspace is unmapping already-unmapped pages, which happens quite a lot. */ #define tlb_remove_tlb_entry(tlb, ptep, address) \ do { \ tlb_flush_pte_range(tlb, address, PAGE_SIZE); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) /** * tlb_remove_tlb_entries - remember unmapping of multiple consecutive ptes for * later tlb invalidation. * * Similar to tlb_remove_tlb_entry(), but remember unmapping of multiple * consecutive ptes instead of only a single one. */ static inline void tlb_remove_tlb_entries(struct mmu_gather *tlb, pte_t *ptep, unsigned int nr, unsigned long address) { tlb_flush_pte_range(tlb, address, PAGE_SIZE * nr); for (;;) { __tlb_remove_tlb_entry(tlb, ptep, address); if (--nr == 0) break; ptep++; address += PAGE_SIZE; } } #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ do { \ unsigned long _sz = huge_page_size(h); \ if (_sz >= P4D_SIZE) \ tlb_flush_p4d_range(tlb, address, _sz); \ else if (_sz >= PUD_SIZE) \ tlb_flush_pud_range(tlb, address, _sz); \ else if (_sz >= PMD_SIZE) \ tlb_flush_pmd_range(tlb, address, _sz); \ else \ tlb_flush_pte_range(tlb, address, _sz); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) /** * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation * This is a nop so far, because only x86 needs it. */ #ifndef __tlb_remove_pmd_tlb_entry #define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0) #endif #define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ do { \ tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE); \ __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ } while (0) /** * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb * invalidation. This is a nop so far, because only x86 needs it. */ #ifndef __tlb_remove_pud_tlb_entry #define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0) #endif #define tlb_remove_pud_tlb_entry(tlb, pudp, address) \ do { \ tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE); \ __tlb_remove_pud_tlb_entry(tlb, pudp, address); \ } while (0) /* * For things like page tables caches (ie caching addresses "inside" the * page tables, like x86 does), for legacy reasons, flushing an * individual page had better flush the page table caches behind it. This * is definitely how x86 works, for example. And if you have an * architected non-legacy page table cache (which I'm not aware of * anybody actually doing), you're going to have some architecturally * explicit flushing for that, likely *separate* from a regular TLB entry * flush, and thus you'd need more than just some range expansion.. * * So if we ever find an architecture * that would want something that odd, I think it is up to that * architecture to do its own odd thing, not cause pain for others * http://lkml.kernel.org/r/CA+55aFzBggoXtNXQeng5d_mRoDnaMBE5Y+URs+PHR67nUpMtaw@mail.gmail.com * * For now w.r.t page table cache, mark the range_size as PAGE_SIZE */ #ifndef pte_free_tlb #define pte_free_tlb(tlb, ptep, address) \ do { \ tlb_flush_pmd_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pte_free_tlb(tlb, ptep, address); \ } while (0) #endif #ifndef pmd_free_tlb #define pmd_free_tlb(tlb, pmdp, address) \ do { \ tlb_flush_pud_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pmd_free_tlb(tlb, pmdp, address); \ } while (0) #endif #ifndef pud_free_tlb #define pud_free_tlb(tlb, pudp, address) \ do { \ tlb_flush_p4d_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif #ifndef p4d_free_tlb #define p4d_free_tlb(tlb, pudp, address) \ do { \ __tlb_adjust_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __p4d_free_tlb(tlb, pudp, address); \ } while (0) #endif #ifndef pte_needs_flush static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte) { return true; } #endif #ifndef huge_pmd_needs_flush static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) { return true; } #endif #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static inline void tlb_unshare_pmd_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt, unsigned long addr) { /* * The caller must make sure that concurrent unsharing + exclusive * reuse is impossible until tlb_flush_unshared_tables() was called. */ VM_WARN_ON_ONCE(!ptdesc_pmd_is_shared(pt)); ptdesc_pmd_pts_dec(pt); /* Clearing a PUD pointing at a PMD table with PMD leaves. */ tlb_flush_pmd_range(tlb, addr & PUD_MASK, PUD_SIZE); /* * If the page table is now exclusively owned, we fully unshared * a page table. */ if (!ptdesc_pmd_is_shared(pt)) tlb->fully_unshared_tables = true; tlb->unshared_tables = true; } static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb) { /* * As soon as the caller drops locks to allow for reuse of * previously-shared tables, these tables could get modified and * even reused outside of hugetlb context, so we have to make sure that * any page table walkers (incl. TLB, GUP-fast) are aware of that * change. * * Even if we are not fully unsharing a PMD table, we must * flush the TLB for the unsharer now. */ if (tlb->unshared_tables) tlb_flush_mmu_tlbonly(tlb); /* * Similarly, we must make sure that concurrent GUP-fast will not * walk previously-shared page tables that are getting modified+reused * elsewhere. So broadcast an IPI to wait for any concurrent GUP-fast. * * We only perform this when we are the last sharer of a page table, * as the IPI will reach all CPUs: any GUP-fast. * * Note that on configs where tlb_remove_table_sync_one() is a NOP, * the expectation is that the tlb_flush_mmu_tlbonly() would have issued * required IPIs already for us. */ if (tlb->fully_unshared_tables) { tlb_remove_table_sync_one(); tlb->fully_unshared_tables = false; } } #endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ #endif /* CONFIG_MMU */ #endif /* _ASM_GENERIC__TLB_H */
727 1168 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 #ifndef _LINUX_HASH_H #define _LINUX_HASH_H /* Fast hashing routine for ints, longs and pointers. (C) 2002 Nadia Yvette Chambers, IBM */ #include <asm/types.h> #include <linux/compiler.h> /* * The "GOLDEN_RATIO_PRIME" is used in ifs/btrfs/brtfs_inode.h and * fs/inode.c. It's not actually prime any more (the previous primes * were actively bad for hashing), but the name remains. */ #if BITS_PER_LONG == 32 #define GOLDEN_RATIO_PRIME GOLDEN_RATIO_32 #define hash_long(val, bits) hash_32(val, bits) #elif BITS_PER_LONG == 64 #define hash_long(val, bits) hash_64(val, bits) #define GOLDEN_RATIO_PRIME GOLDEN_RATIO_64 #else #error Wordsize not 32 or 64 #endif /* * This hash multiplies the input by a large odd number and takes the * high bits. Since multiplication propagates changes to the most * significant end only, it is essential that the high bits of the * product be used for the hash value. * * Chuck Lever verified the effectiveness of this technique: * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf * * Although a random odd number will do, it turns out that the golden * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice * properties. (See Knuth vol 3, section 6.4, exercise 9.) * * These are the negative, (1 - phi) = phi**2 = (3 - sqrt(5))/2, * which is very slightly easier to multiply by and makes no * difference to the hash distribution. */ #define GOLDEN_RATIO_32 0x61C88647 #define GOLDEN_RATIO_64 0x61C8864680B583EBull #ifdef CONFIG_HAVE_ARCH_HASH /* This header may use the GOLDEN_RATIO_xx constants */ #include <asm/hash.h> #endif /* * The _generic versions exist only so lib/test_hash.c can compare * the arch-optimized versions with the generic. * * Note that if you change these, any <asm/hash.h> that aren't updated * to match need to have their HAVE_ARCH_* define values updated so the * self-test will not false-positive. */ #ifndef HAVE_ARCH__HASH_32 #define __hash_32 __hash_32_generic #endif static inline u32 __hash_32_generic(u32 val) { return val * GOLDEN_RATIO_32; } static inline u32 hash_32(u32 val, unsigned int bits) { /* High bits are more random, so use them. */ return __hash_32(val) >> (32 - bits); } #ifndef HAVE_ARCH_HASH_64 #define hash_64 hash_64_generic #endif static __always_inline u32 hash_64_generic(u64 val, unsigned int bits) { #if BITS_PER_LONG == 64 /* 64x64-bit multiply is efficient on all 64-bit processors */ return val * GOLDEN_RATIO_64 >> (64 - bits); #else /* Hash 64 bits using only 32x32-bit multiply. */ return hash_32((u32)val ^ __hash_32(val >> 32), bits); #endif } static inline u32 hash_ptr(const void *ptr, unsigned int bits) { return hash_long((unsigned long)ptr, bits); } /* This really should be called fold32_ptr; it does no hashing to speak of. */ static inline u32 hash32_ptr(const void *ptr) { unsigned long val = (unsigned long)ptr; #if BITS_PER_LONG == 64 val ^= (val >> 32); #endif return (u32)val; } #endif /* _LINUX_HASH_H */
17 17 4 1 10 2 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 // SPDX-License-Identifier: GPL-2.0-or-later /* * SR-IPv6 implementation * * Author: * David Lebrun <david.lebrun@uclouvain.be> */ #include <linux/types.h> #include <linux/skbuff.h> #include <linux/net.h> #include <linux/module.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/lwtunnel.h> #include <net/netevent.h> #include <net/netns/generic.h> #include <net/ip6_fib.h> #include <net/route.h> #include <net/seg6.h> #include <linux/seg6.h> #include <linux/seg6_iptunnel.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/dst_cache.h> #ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> #endif #include <linux/netfilter.h> static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) { int head = 0; switch (tuninfo->mode) { case SEG6_IPTUN_MODE_INLINE: break; case SEG6_IPTUN_MODE_ENCAP: case SEG6_IPTUN_MODE_ENCAP_RED: head = sizeof(struct ipv6hdr); break; case SEG6_IPTUN_MODE_L2ENCAP: case SEG6_IPTUN_MODE_L2ENCAP_RED: return 0; } return ((tuninfo->srh->hdrlen + 1) << 3) + head; } struct seg6_lwt { struct dst_cache cache; struct seg6_iptunnel_encap tuninfo[]; }; static inline struct seg6_lwt *seg6_lwt_lwtunnel(struct lwtunnel_state *lwt) { return (struct seg6_lwt *)lwt->data; } static inline struct seg6_iptunnel_encap * seg6_encap_lwtunnel(struct lwtunnel_state *lwt) { return seg6_lwt_lwtunnel(lwt)->tuninfo; } static const struct nla_policy seg6_iptunnel_policy[SEG6_IPTUNNEL_MAX + 1] = { [SEG6_IPTUNNEL_SRH] = { .type = NLA_BINARY }, }; static int nla_put_srh(struct sk_buff *skb, int attrtype, struct seg6_iptunnel_encap *tuninfo) { struct seg6_iptunnel_encap *data; struct nlattr *nla; int len; len = SEG6_IPTUN_ENCAP_SIZE(tuninfo); nla = nla_reserve(skb, attrtype, len); if (!nla) return -EMSGSIZE; data = nla_data(nla); memcpy(data, tuninfo, len); return 0; } static void set_tun_src(struct net *net, struct net_device *dev, struct in6_addr *daddr, struct in6_addr *saddr) { struct seg6_pernet_data *sdata = seg6_pernet(net); struct in6_addr *tun_src; rcu_read_lock(); tun_src = rcu_dereference(sdata->tun_src); if (!ipv6_addr_any(tun_src)) { memcpy(saddr, tun_src, sizeof(struct in6_addr)); } else { ipv6_dev_get_saddr(net, dev, daddr, IPV6_PREFER_SRC_PUBLIC, saddr); } rcu_read_unlock(); } /* Compute flowlabel for outer IPv6 header */ static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb, struct ipv6hdr *inner_hdr) { int do_flowlabel = net->ipv6.sysctl.seg6_flowlabel; __be32 flowlabel = 0; u32 hash; if (do_flowlabel > 0) { hash = skb_get_hash(skb); hash = rol32(hash, 16); flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; } else if (!do_flowlabel && skb->protocol == htons(ETH_P_IPV6)) { flowlabel = ip6_flowlabel(inner_hdr); } return flowlabel; } static int __seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst_dev(dst); struct net *net = dev_net(dev); struct ipv6hdr *hdr, *inner_hdr; struct ipv6_sr_hdr *isrh; int hdrlen, tot_len, err; __be32 flowlabel; hdrlen = (osrh->hdrlen + 1) << 3; tot_len = hdrlen + sizeof(*hdr); err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; inner_hdr = ipv6_hdr(skb); flowlabel = seg6_make_flowlabel(net, skb, inner_hdr); skb_push(skb, tot_len); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); hdr = ipv6_hdr(skb); /* inherit tc, flowlabel and hlim * hlim will be decremented in ip6_forward() afterwards and * decapsulation will overwrite inner hlim with outer hlim */ if (skb->protocol == htons(ETH_P_IPV6)) { ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)), flowlabel); hdr->hop_limit = inner_hdr->hop_limit; } else { ip6_flow_hdr(hdr, 0, flowlabel); hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb)); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); /* the control block has been erased, so we have to set the * iif once again. * We read the receiving interface index directly from the * skb->skb_iif as it is done in the IPv4 receiving path (i.e.: * ip_rcv_core(...)). */ IP6CB(skb)->iif = skb->skb_iif; } hdr->nexthdr = NEXTHDR_ROUTING; isrh = (void *)hdr + sizeof(*hdr); memcpy(isrh, osrh, hdrlen); isrh->nexthdr = proto; hdr->daddr = isrh->segments[isrh->first_segment]; set_tun_src(net, dev, &hdr->daddr, &hdr->saddr); #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(isrh)) { err = seg6_push_hmac(net, &hdr->saddr, isrh); if (unlikely(err)) return err; } #endif hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, hdr, tot_len); return 0; } /* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) { return __seg6_do_srh_encap(skb, osrh, proto, NULL); } EXPORT_SYMBOL_GPL(seg6_do_srh_encap); /* encapsulate an IPv6 packet within an outer IPv6 header with reduced SRH */ static int seg6_do_srh_encap_red(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto, struct dst_entry *cache_dst) { __u8 first_seg = osrh->first_segment; struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst_dev(dst); struct net *net = dev_net(dev); struct ipv6hdr *hdr, *inner_hdr; int hdrlen = ipv6_optlen(osrh); int red_tlv_offset, tlv_offset; struct ipv6_sr_hdr *isrh; bool skip_srh = false; __be32 flowlabel; int tot_len, err; int red_hdrlen; int tlvs_len; if (first_seg > 0) { red_hdrlen = hdrlen - sizeof(struct in6_addr); } else { /* NOTE: if tag/flags and/or other TLVs are introduced in the * seg6_iptunnel infrastructure, they should be considered when * deciding to skip the SRH. */ skip_srh = !sr_has_hmac(osrh); red_hdrlen = skip_srh ? 0 : hdrlen; } tot_len = red_hdrlen + sizeof(struct ipv6hdr); err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; inner_hdr = ipv6_hdr(skb); flowlabel = seg6_make_flowlabel(net, skb, inner_hdr); skb_push(skb, tot_len); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); hdr = ipv6_hdr(skb); /* based on seg6_do_srh_encap() */ if (skb->protocol == htons(ETH_P_IPV6)) { ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)), flowlabel); hdr->hop_limit = inner_hdr->hop_limit; } else { ip6_flow_hdr(hdr, 0, flowlabel); hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb)); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); IP6CB(skb)->iif = skb->skb_iif; } /* no matter if we have to skip the SRH or not, the first segment * always comes in the pushed IPv6 header. */ hdr->daddr = osrh->segments[first_seg]; if (skip_srh) { hdr->nexthdr = proto; set_tun_src(net, dev, &hdr->daddr, &hdr->saddr); goto out; } /* we cannot skip the SRH, slow path */ hdr->nexthdr = NEXTHDR_ROUTING; isrh = (void *)hdr + sizeof(struct ipv6hdr); if (unlikely(!first_seg)) { /* this is a very rare case; we have only one SID but * we cannot skip the SRH since we are carrying some * other info. */ memcpy(isrh, osrh, hdrlen); goto srcaddr; } tlv_offset = sizeof(*osrh) + (first_seg + 1) * sizeof(struct in6_addr); red_tlv_offset = tlv_offset - sizeof(struct in6_addr); memcpy(isrh, osrh, red_tlv_offset); tlvs_len = hdrlen - tlv_offset; if (unlikely(tlvs_len > 0)) { const void *s = (const void *)osrh + tlv_offset; void *d = (void *)isrh + red_tlv_offset; memcpy(d, s, tlvs_len); } --isrh->first_segment; isrh->hdrlen -= 2; srcaddr: isrh->nexthdr = proto; set_tun_src(net, dev, &hdr->daddr, &hdr->saddr); #ifdef CONFIG_IPV6_SEG6_HMAC if (unlikely(!skip_srh && sr_has_hmac(isrh))) { err = seg6_push_hmac(net, &hdr->saddr, isrh); if (unlikely(err)) return err; } #endif out: hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, hdr, tot_len); return 0; } static int __seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, struct dst_entry *cache_dst) { struct ipv6hdr *hdr, *oldhdr; struct ipv6_sr_hdr *isrh; int hdrlen, err; hdrlen = (osrh->hdrlen + 1) << 3; err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; oldhdr = ipv6_hdr(skb); skb_pull(skb, sizeof(struct ipv6hdr)); skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(struct ipv6hdr)); skb_push(skb, sizeof(struct ipv6hdr) + hdrlen); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); hdr = ipv6_hdr(skb); memmove(hdr, oldhdr, sizeof(*hdr)); isrh = (void *)hdr + sizeof(*hdr); memcpy(isrh, osrh, hdrlen); isrh->nexthdr = hdr->nexthdr; hdr->nexthdr = NEXTHDR_ROUTING; isrh->segments[0] = hdr->daddr; hdr->daddr = isrh->segments[isrh->first_segment]; #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(isrh)) { struct net *net = skb_dst_dev_net(skb); err = seg6_push_hmac(net, &hdr->saddr, isrh); if (unlikely(err)) return err; } #endif hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen); return 0; } static int seg6_do_srh(struct sk_buff *skb, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct seg6_iptunnel_encap *tinfo; int proto, err = 0; tinfo = seg6_encap_lwtunnel(dst->lwtstate); switch (tinfo->mode) { case SEG6_IPTUN_MODE_INLINE: if (skb->protocol != htons(ETH_P_IPV6)) return -EINVAL; err = __seg6_do_srh_inline(skb, tinfo->srh, cache_dst); if (err) return err; break; case SEG6_IPTUN_MODE_ENCAP: case SEG6_IPTUN_MODE_ENCAP_RED: err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6); if (err) return err; if (skb->protocol == htons(ETH_P_IPV6)) proto = IPPROTO_IPV6; else if (skb->protocol == htons(ETH_P_IP)) proto = IPPROTO_IPIP; else return -EINVAL; if (tinfo->mode == SEG6_IPTUN_MODE_ENCAP) err = __seg6_do_srh_encap(skb, tinfo->srh, proto, cache_dst); else err = seg6_do_srh_encap_red(skb, tinfo->srh, proto, cache_dst); if (err) return err; skb_set_inner_transport_header(skb, skb_transport_offset(skb)); skb_set_inner_protocol(skb, skb->protocol); skb->protocol = htons(ETH_P_IPV6); break; case SEG6_IPTUN_MODE_L2ENCAP: case SEG6_IPTUN_MODE_L2ENCAP_RED: if (!skb_mac_header_was_set(skb)) return -EINVAL; if (pskb_expand_head(skb, skb->mac_len, 0, GFP_ATOMIC) < 0) return -ENOMEM; skb_mac_header_rebuild(skb); skb_push(skb, skb->mac_len); if (tinfo->mode == SEG6_IPTUN_MODE_L2ENCAP) err = __seg6_do_srh_encap(skb, tinfo->srh, IPPROTO_ETHERNET, cache_dst); else err = seg6_do_srh_encap_red(skb, tinfo->srh, IPPROTO_ETHERNET, cache_dst); if (err) return err; skb->protocol = htons(ETH_P_IPV6); break; } skb_set_transport_header(skb, sizeof(struct ipv6hdr)); nf_reset_ct(skb); return 0; } /* insert an SRH within an IPv6 packet, just after the IPv6 header */ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) { return __seg6_do_srh_inline(skb, osrh, NULL); } EXPORT_SYMBOL_GPL(seg6_do_srh_inline); static int seg6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { return dst_input(skb); } static int seg6_input_core(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; struct lwtunnel_state *lwtst; struct seg6_lwt *slwt; int err; /* We cannot dereference "orig_dst" once ip6_route_input() or * skb_dst_drop() is called. However, in order to detect a dst loop, we * need the address of its lwtstate. So, save the address of lwtstate * now and use it later as a comparison. */ lwtst = orig_dst->lwtstate; slwt = seg6_lwt_lwtunnel(lwtst); local_bh_disable(); dst = dst_cache_get(&slwt->cache); local_bh_enable(); err = seg6_do_srh(skb, dst); if (unlikely(err)) { dst_release(dst); goto drop; } if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); /* cache only if we don't create a dst reference loop */ if (!dst->error && lwtst != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &ipv6_hdr(skb)->saddr); local_bh_enable(); } err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); if (unlikely(err)) goto drop; } else { skb_dst_drop(skb); skb_dst_set(skb, dst); } if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, dev_net(skb->dev), NULL, skb, NULL, skb_dst_dev(skb), seg6_input_finish); return seg6_input_finish(dev_net(skb->dev), NULL, skb); drop: kfree_skb(skb); return err; } static int seg6_input_nf(struct sk_buff *skb) { struct net_device *dev = skb_dst_dev(skb); struct net *net = dev_net(skb->dev); switch (skb->protocol) { case htons(ETH_P_IP): return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, NULL, skb, NULL, dev, seg6_input_core); case htons(ETH_P_IPV6): return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, NULL, skb, NULL, dev, seg6_input_core); } return -EINVAL; } static int seg6_input(struct sk_buff *skb) { if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return seg6_input_nf(skb); return seg6_input_core(dev_net(skb->dev), NULL, skb); } static int seg6_output_core(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; struct seg6_lwt *slwt; int err; slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); local_bh_disable(); dst = dst_cache_get(&slwt->cache); local_bh_enable(); err = seg6_do_srh(skb, dst); if (unlikely(err)) goto drop; if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); fl6.daddr = hdr->daddr; fl6.saddr = hdr->saddr; fl6.flowlabel = ip6_flowinfo(hdr); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = hdr->nexthdr; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = dst->error; goto drop; } /* cache only if we don't create a dst reference loop */ if (orig_dst->lwtstate != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); local_bh_enable(); } err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); if (unlikely(err)) goto drop; } skb_dst_drop(skb); skb_dst_set(skb, dst); if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, dst_dev(dst), dst_output); return dst_output(net, sk, skb); drop: dst_release(dst); kfree_skb(skb); return err; } static int seg6_output_nf(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst_dev(skb); switch (skb->protocol) { case htons(ETH_P_IP): return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, dev, seg6_output_core); case htons(ETH_P_IPV6): return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, dev, seg6_output_core); } return -EINVAL; } static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return seg6_output_nf(net, sk, skb); return seg6_output_core(net, sk, skb); } static int seg6_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[SEG6_IPTUNNEL_MAX + 1]; struct seg6_iptunnel_encap *tuninfo; struct lwtunnel_state *newts; int tuninfo_len, min_size; struct seg6_lwt *slwt; int err; if (family != AF_INET && family != AF_INET6) return -EINVAL; err = nla_parse_nested_deprecated(tb, SEG6_IPTUNNEL_MAX, nla, seg6_iptunnel_policy, extack); if (err < 0) return err; if (!tb[SEG6_IPTUNNEL_SRH]) return -EINVAL; tuninfo = nla_data(tb[SEG6_IPTUNNEL_SRH]); tuninfo_len = nla_len(tb[SEG6_IPTUNNEL_SRH]); /* tuninfo must contain at least the iptunnel encap structure, * the SRH and one segment */ min_size = sizeof(*tuninfo) + sizeof(struct ipv6_sr_hdr) + sizeof(struct in6_addr); if (tuninfo_len < min_size) return -EINVAL; switch (tuninfo->mode) { case SEG6_IPTUN_MODE_INLINE: if (family != AF_INET6) return -EINVAL; break; case SEG6_IPTUN_MODE_ENCAP: break; case SEG6_IPTUN_MODE_L2ENCAP: break; case SEG6_IPTUN_MODE_ENCAP_RED: break; case SEG6_IPTUN_MODE_L2ENCAP_RED: break; default: return -EINVAL; } /* verify that SRH is consistent */ if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo), false)) return -EINVAL; newts = lwtunnel_state_alloc(tuninfo_len + sizeof(*slwt)); if (!newts) return -ENOMEM; slwt = seg6_lwt_lwtunnel(newts); err = dst_cache_init(&slwt->cache, GFP_ATOMIC); if (err) { kfree(newts); return err; } memcpy(&slwt->tuninfo, tuninfo, tuninfo_len); newts->type = LWTUNNEL_ENCAP_SEG6; newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; if (tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP) newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; newts->headroom = seg6_lwt_headroom(tuninfo); *ts = newts; return 0; } static void seg6_destroy_state(struct lwtunnel_state *lwt) { dst_cache_destroy(&seg6_lwt_lwtunnel(lwt)->cache); } static int seg6_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate); if (nla_put_srh(skb, SEG6_IPTUNNEL_SRH, tuninfo)) return -EMSGSIZE; return 0; } static int seg6_encap_nlsize(struct lwtunnel_state *lwtstate) { struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate); return nla_total_size(SEG6_IPTUN_ENCAP_SIZE(tuninfo)); } static int seg6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct seg6_iptunnel_encap *a_hdr = seg6_encap_lwtunnel(a); struct seg6_iptunnel_encap *b_hdr = seg6_encap_lwtunnel(b); int len = SEG6_IPTUN_ENCAP_SIZE(a_hdr); if (len != SEG6_IPTUN_ENCAP_SIZE(b_hdr)) return 1; return memcmp(a_hdr, b_hdr, len); } static const struct lwtunnel_encap_ops seg6_iptun_ops = { .build_state = seg6_build_state, .destroy_state = seg6_destroy_state, .output = seg6_output, .input = seg6_input, .fill_encap = seg6_fill_encap_info, .get_encap_size = seg6_encap_nlsize, .cmp_encap = seg6_encap_cmp, .owner = THIS_MODULE, }; int __init seg6_iptunnel_init(void) { return lwtunnel_encap_add_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6); } void seg6_iptunnel_exit(void) { lwtunnel_encap_del_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6); }
3 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_WAIT_BIT_H #define _LINUX_WAIT_BIT_H /* * Linux wait-bit related types and methods: */ #include <linux/wait.h> struct wait_bit_key { unsigned long *flags; int bit_nr; unsigned long timeout; }; struct wait_bit_queue_entry { struct wait_bit_key key; struct wait_queue_entry wq_entry; }; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } typedef int wait_bit_action_f(struct wait_bit_key *key, int mode); void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit); int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); void wake_up_bit(unsigned long *word, int bit); int out_of_line_wait_on_bit(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode); int out_of_line_wait_on_bit_timeout(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout); int out_of_line_wait_on_bit_lock(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode); struct wait_queue_head *bit_waitqueue(unsigned long *word, int bit); extern void __init wait_bit_init(void); int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); #define DEFINE_WAIT_BIT(name, word, bit) \ struct wait_bit_queue_entry name = { \ .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ .wq_entry = { \ .private = current, \ .func = wake_bit_function, \ .entry = \ LIST_HEAD_INIT((name).wq_entry.entry), \ }, \ } extern int bit_wait(struct wait_bit_key *key, int mode); extern int bit_wait_io(struct wait_bit_key *key, int mode); extern int bit_wait_timeout(struct wait_bit_key *key, int mode); /** * wait_on_bit - wait for a bit to be cleared * @word: the address containing the bit being waited on * @bit: the bit at that address being waited on * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP()) * to be cleared. The clearing of the bit must be signalled with * wake_up_bit(), often as clear_and_wake_up_bit(). * * The process will wait on a waitqueue selected by hash from a shared * pool. It will only be woken on a wake_up for the target bit, even * if other processes on the same queue are waiting for other bits. * * Returned value will be zero if the bit was cleared in which case the * call has ACQUIRE semantics, or %-EINTR if the process received a * signal and the mode permitted wake up on that signal. */ static inline int wait_on_bit(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, bit_wait, mode); } /** * wait_on_bit_io - wait for a bit to be cleared * @word: the address containing the bit being waited on * @bit: the bit at that address being waited on * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP()) * to be cleared. The clearing of the bit must be signalled with * wake_up_bit(), often as clear_and_wake_up_bit(). * * This is similar to wait_on_bit(), but calls io_schedule() instead of * schedule() for the actual waiting. * * Returned value will be zero if the bit was cleared in which case the * call has ACQUIRE semantics, or %-EINTR if the process received a * signal and the mode permitted wake up on that signal. */ static inline int wait_on_bit_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, bit_wait_io, mode); } /** * wait_on_bit_timeout - wait for a bit to be cleared or a timeout to elapse * @word: the address containing the bit being waited on * @bit: the bit at that address being waited on * @mode: the task state to sleep in * @timeout: timeout, in jiffies * * Wait for the given bit in an unsigned long or bitmap (see * DECLARE_BITMAP()) to be cleared, or for a timeout to expire. The * clearing of the bit must be signalled with wake_up_bit(), often as * clear_and_wake_up_bit(). * * This is similar to wait_on_bit(), except it also takes a timeout * parameter. * * Returned value will be zero if the bit was cleared in which case the * call has ACQUIRE semantics, or %-EINTR if the process received a * signal and the mode permitted wake up on that signal, or %-EAGAIN if the * timeout elapsed. */ static inline int wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, unsigned long timeout) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit_timeout(word, bit, bit_wait_timeout, mode, timeout); } /** * wait_on_bit_action - wait for a bit to be cleared * @word: the address containing the bit waited on * @bit: the bit at that address being waited on * @action: the function used to sleep, which may take special actions * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP()) * to be cleared. The clearing of the bit must be signalled with * wake_up_bit(), often as clear_and_wake_up_bit(). * * This is similar to wait_on_bit(), but calls @action() instead of * schedule() for the actual waiting. * * Returned value will be zero if the bit was cleared in which case the * call has ACQUIRE semantics, or the error code returned by @action if * that call returned non-zero. */ static inline int wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, action, mode); } /** * wait_on_bit_lock - wait for a bit to be cleared, then set it * @word: the address containing the bit being waited on * @bit: the bit of the word being waited on and set * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see * DECLARE_BITMAP()) to be cleared. The clearing of the bit must be * signalled with wake_up_bit(), often as clear_and_wake_up_bit(). As * soon as it is clear, atomically set it and return. * * This is similar to wait_on_bit(), but sets the bit before returning. * * Returned value will be zero if the bit was successfully set in which * case the call has the same memory sequencing semantics as * test_and_clear_bit(), or %-EINTR if the process received a signal and * the mode permitted wake up on that signal. */ static inline int wait_on_bit_lock(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode); } /** * wait_on_bit_lock_io - wait for a bit to be cleared, then set it * @word: the address containing the bit being waited on * @bit: the bit of the word being waited on and set * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see * DECLARE_BITMAP()) to be cleared. The clearing of the bit must be * signalled with wake_up_bit(), often as clear_and_wake_up_bit(). As * soon as it is clear, atomically set it and return. * * This is similar to wait_on_bit_lock(), but calls io_schedule() instead * of schedule(). * * Returns zero if the bit was (eventually) found to be clear and was * set. Returns non-zero if a signal was delivered to the process and * the @mode allows that signal to wake the process. */ static inline int wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode); } /** * wait_on_bit_lock_action - wait for a bit to be cleared, then set it * @word: the address containing the bit being waited on * @bit: the bit of the word being waited on and set * @action: the function used to sleep, which may take special actions * @mode: the task state to sleep in * * This is similar to wait_on_bit_lock(), but calls @action() instead of * schedule() for the actual waiting. * * Returned value will be zero if the bit was successfully set in which * case the call has the same memory sequencing semantics as * test_and_clear_bit(), or the error code returned by @action if that * call returned non-zero. */ static inline int wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, action, mode); } extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags); extern void wake_up_var(void *var); extern wait_queue_head_t *__var_waitqueue(void *p); #define ___wait_var_event(var, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ struct wait_queue_head *__wq_head = __var_waitqueue(var); \ struct wait_bit_queue_entry __wbq_entry; \ long __ret = ret; /* explicit shadow */ \ \ init_wait_var_entry(&__wbq_entry, var, \ exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ for (;;) { \ long __int = prepare_to_wait_event(__wq_head, \ &__wbq_entry.wq_entry, \ state); \ if (condition) \ break; \ \ if (___wait_is_interruptible(state) && __int) { \ __ret = __int; \ goto __out; \ } \ \ cmd; \ } \ finish_wait(__wq_head, &__wbq_entry.wq_entry); \ __out: __ret; \ }) #define __wait_var_event(var, condition) \ ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ schedule()) #define __wait_var_event_io(var, condition) \ ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ io_schedule()) /** * wait_var_event - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * * Wait for a @condition to be true, only re-checking when a wake up is * received for the given @var (an arbitrary kernel address which need * not be directly related to the given condition, but usually is). * * The process will wait on a waitqueue selected by hash from a shared * pool. It will only be woken on a wake_up for the given address. * * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event(var, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __wait_var_event(var, condition); \ } while (0) /** * wait_var_event_io - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * * Wait for an IO related @condition to be true, only re-checking when a * wake up is received for the given @var (an arbitrary kernel address * which need not be directly related to the given condition, but * usually is). * * The process will wait on a waitqueue selected by hash from a shared * pool. It will only be woken on a wake_up for the given address. * * This is similar to wait_var_event(), but calls io_schedule() instead * of schedule(). * * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event_io(var, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __wait_var_event_io(var, condition); \ } while (0) #define __wait_var_event_killable(var, condition) \ ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \ schedule()) /** * wait_var_event_killable - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * * Wait for a @condition to be true or a fatal signal to be received, * only re-checking the condition when a wake up is received for the given * @var (an arbitrary kernel address which need not be directly related * to the given condition, but usually is). * * This is similar to wait_var_event() but returns a value which is * 0 if the condition became true, or %-ERESTARTSYS if a fatal signal * was received. * * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event_killable(var, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_var_event_killable(var, condition); \ __ret; \ }) #define __wait_var_event_timeout(var, condition, timeout) \ ___wait_var_event(var, ___wait_cond_timeout(condition), \ TASK_UNINTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_var_event_timeout - wait for a variable to be updated or a timeout to expire * @var: the address of variable being waited on * @condition: the condition to wait for * @timeout: maximum time to wait in jiffies * * Wait for a @condition to be true or a timeout to expire, only * re-checking the condition when a wake up is received for the given * @var (an arbitrary kernel address which need not be directly related * to the given condition, but usually is). * * This is similar to wait_var_event() but returns a value which is 0 if * the timeout expired and the condition was still false, or the * remaining time left in the timeout (but at least 1) if the condition * was found to be true. * * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event_timeout(var, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_var_event_timeout(var, condition, timeout); \ __ret; \ }) #define __wait_var_event_interruptible(var, condition) \ ___wait_var_event(var, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule()) /** * wait_var_event_killable - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * * Wait for a @condition to be true or a signal to be received, only * re-checking the condition when a wake up is received for the given * @var (an arbitrary kernel address which need not be directly related * to the given condition, but usually is). * * This is similar to wait_var_event() but returns a value which is 0 if * the condition became true, or %-ERESTARTSYS if a signal was received. * * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event_interruptible(var, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_var_event_interruptible(var, condition); \ __ret; \ }) /** * wait_var_event_any_lock - wait for a variable to be updated under a lock * @var: the address of the variable being waited on * @condition: condition to wait for * @lock: the object that is locked to protect updates to the variable * @type: prefix on lock and unlock operations * @state: waiting state, %TASK_UNINTERRUPTIBLE etc. * * Wait for a condition which can only be reliably tested while holding * a lock. The variables assessed in the condition will normal be updated * under the same lock, and the wake up should be signalled with * wake_up_var_locked() under the same lock. * * This is similar to wait_var_event(), but assumes a lock is held * while calling this function and while updating the variable. * * This must be called while the given lock is held and the lock will be * dropped when schedule() is called to wait for a wake up, and will be * reclaimed before testing the condition again. The functions used to * unlock and lock the object are constructed by appending _unlock and _lock * to @type. * * Return %-ERESTARTSYS if a signal arrives which is allowed to interrupt * the wait according to @state. */ #define wait_var_event_any_lock(var, condition, lock, type, state) \ ({ \ int __ret = 0; \ if (!(condition)) \ __ret = ___wait_var_event(var, condition, state, 0, 0, \ type ## _unlock(lock); \ schedule(); \ type ## _lock(lock)); \ __ret; \ }) /** * wait_var_event_spinlock - wait for a variable to be updated under a spinlock * @var: the address of the variable being waited on * @condition: condition to wait for * @lock: the spinlock which protects updates to the variable * * Wait for a condition which can only be reliably tested while holding * a spinlock. The variables assessed in the condition will normal be updated * under the same spinlock, and the wake up should be signalled with * wake_up_var_locked() under the same spinlock. * * This is similar to wait_var_event(), but assumes a spinlock is held * while calling this function and while updating the variable. * * This must be called while the given lock is held and the lock will be * dropped when schedule() is called to wait for a wake up, and will be * reclaimed before testing the condition again. */ #define wait_var_event_spinlock(var, condition, lock) \ wait_var_event_any_lock(var, condition, lock, spin, TASK_UNINTERRUPTIBLE) /** * wait_var_event_mutex - wait for a variable to be updated under a mutex * @var: the address of the variable being waited on * @condition: condition to wait for * @mutex: the mutex which protects updates to the variable * * Wait for a condition which can only be reliably tested while holding * a mutex. The variables assessed in the condition will normal be * updated under the same mutex, and the wake up should be signalled * with wake_up_var_locked() under the same mutex. * * This is similar to wait_var_event(), but assumes a mutex is held * while calling this function and while updating the variable. * * This must be called while the given mutex is held and the mutex will be * dropped when schedule() is called to wait for a wake up, and will be * reclaimed before testing the condition again. */ #define wait_var_event_mutex(var, condition, lock) \ wait_var_event_any_lock(var, condition, lock, mutex, TASK_UNINTERRUPTIBLE) /** * wake_up_var_protected - wake up waiters for a variable asserting that it is safe * @var: the address of the variable being waited on * @cond: the condition which afirms this is safe * * When waking waiters which use wait_var_event_any_lock() the waker must be * holding the reelvant lock to avoid races. This version of wake_up_var() * asserts that the relevant lock is held and so no barrier is needed. * The @cond is only tested when CONFIG_LOCKDEP is enabled. */ #define wake_up_var_protected(var, cond) \ do { \ lockdep_assert(cond); \ wake_up_var(var); \ } while (0) /** * wake_up_var_locked - wake up waiters for a variable while holding a spinlock or mutex * @var: the address of the variable being waited on * @lock: The spinlock or mutex what protects the variable * * Send a wake up for the given variable which should be waited for with * wait_var_event_spinlock() or wait_var_event_mutex(). Unlike wake_up_var(), * no extra barriers are needed as the locking provides sufficient sequencing. */ #define wake_up_var_locked(var, lock) \ wake_up_var_protected(var, lockdep_is_held(lock)) /** * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit * @bit: the bit of the word being waited on * @word: the address containing the bit being waited on * * The designated bit is cleared and any tasks waiting in wait_on_bit() * or similar will be woken. This call has RELEASE semantics so that * any changes to memory made before this call are guaranteed to be visible * after the corresponding wait_on_bit() completes. */ static inline void clear_and_wake_up_bit(int bit, unsigned long *word) { clear_bit_unlock(bit, word); /* See wake_up_bit() for which memory barrier you need to use. */ smp_mb__after_atomic(); wake_up_bit(word, bit); } /** * test_and_clear_wake_up_bit - clear a bit if it was set: wake up anyone waiting on that bit * @bit: the bit of the word being waited on * @word: the address of memory containing that bit * * If the bit is set and can be atomically cleared, any tasks waiting in * wait_on_bit() or similar will be woken. This call has the same * complete ordering semantics as test_and_clear_bit(). Any changes to * memory made before this call are guaranteed to be visible after the * corresponding wait_on_bit() completes. * * Returns %true if the bit was successfully set and the wake up was sent. */ static inline bool test_and_clear_wake_up_bit(int bit, unsigned long *word) { if (!test_and_clear_bit(bit, word)) return false; /* no extra barrier required */ wake_up_bit(word, bit); return true; } /** * atomic_dec_and_wake_up - decrement an atomic_t and if zero, wake up waiters * @var: the variable to dec and test * * Decrements the atomic variable and if it reaches zero, send a wake_up to any * processes waiting on the variable. * * This function has the same complete ordering semantics as atomic_dec_and_test. * * Returns %true is the variable reaches zero and the wake up was sent. */ static inline bool atomic_dec_and_wake_up(atomic_t *var) { if (!atomic_dec_and_test(var)) return false; /* No extra barrier required */ wake_up_var(var); return true; } /** * store_release_wake_up - update a variable and send a wake_up * @var: the address of the variable to be updated and woken * @val: the value to store in the variable. * * Store the given value in the variable send a wake up to any tasks * waiting on the variable. All necessary barriers are included to ensure * the task calling wait_var_event() sees the new value and all values * written to memory before this call. */ #define store_release_wake_up(var, val) \ do { \ smp_store_release(var, val); \ smp_mb(); \ wake_up_var(var); \ } while (0) #endif /* _LINUX_WAIT_BIT_H */
6 6 6 38 38 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 IBM Corporation * * Author: * Mimi Zohar <zohar@us.ibm.com> */ #include <linux/module.h> #include <linux/init.h> #include <linux/file.h> #include <linux/binfmts.h> #include <linux/fs.h> #include <linux/xattr.h> #include <linux/magic.h> #include <linux/ima.h> #include <linux/evm.h> #include <linux/fsverity.h> #include <keys/system_keyring.h> #include <uapi/linux/fsverity.h> #include "ima.h" #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM static char *ima_appraise_cmdline_default __initdata; core_param(ima_appraise, ima_appraise_cmdline_default, charp, 0); void __init ima_appraise_parse_cmdline(void) { const char *str = ima_appraise_cmdline_default; bool sb_state = arch_ima_get_secureboot(); int appraisal_state = ima_appraise; if (!str) return; if (strncmp(str, "off", 3) == 0) appraisal_state = 0; else if (strncmp(str, "log", 3) == 0) appraisal_state = IMA_APPRAISE_LOG; else if (strncmp(str, "fix", 3) == 0) appraisal_state = IMA_APPRAISE_FIX; else if (strncmp(str, "enforce", 7) == 0) appraisal_state = IMA_APPRAISE_ENFORCE; else pr_err("invalid \"%s\" appraise option", str); /* If appraisal state was changed, but secure boot is enabled, * keep its default */ if (sb_state) { if (!(appraisal_state & IMA_APPRAISE_ENFORCE)) pr_info("Secure boot enabled: ignoring ima_appraise=%s option", str); } else { ima_appraise = appraisal_state; } } #endif /* * is_ima_appraise_enabled - return appraise status * * Only return enabled, if not in ima_appraise="fix" or "log" modes. */ bool is_ima_appraise_enabled(void) { return ima_appraise & IMA_APPRAISE_ENFORCE; } /* * ima_must_appraise - set appraise flag * * Return 1 to appraise or hash */ int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { struct lsm_prop prop; if (!ima_appraise) return 0; security_current_getlsmprop_subj(&prop); return ima_match_policy(idmap, inode, current_cred(), &prop, func, mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL, NULL); } static int ima_fix_xattr(struct dentry *dentry, struct ima_iint_cache *iint) { int rc, offset; u8 algo = iint->ima_hash->algo; if (algo <= HASH_ALGO_SHA1) { offset = 1; iint->ima_hash->xattr.sha1.type = IMA_XATTR_DIGEST; } else { offset = 0; iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG; iint->ima_hash->xattr.ng.algo = algo; } rc = __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, &iint->ima_hash->xattr.data[offset], (sizeof(iint->ima_hash->xattr) - offset) + iint->ima_hash->length, 0); return rc; } /* Return specific func appraised cached result */ enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: return iint->ima_mmap_status; case BPRM_CHECK: return iint->ima_bprm_status; case CREDS_CHECK: return iint->ima_creds_status; case FILE_CHECK: case POST_SETATTR: return iint->ima_file_status; case MODULE_CHECK ... MAX_CHECK - 1: default: return iint->ima_read_status; } } static void ima_set_cache_status(struct ima_iint_cache *iint, enum ima_hooks func, enum integrity_status status) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->ima_mmap_status = status; break; case BPRM_CHECK: iint->ima_bprm_status = status; break; case CREDS_CHECK: iint->ima_creds_status = status; break; case FILE_CHECK: case POST_SETATTR: iint->ima_file_status = status; break; case MODULE_CHECK ... MAX_CHECK - 1: default: iint->ima_read_status = status; break; } } static void ima_cache_flags(struct ima_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->flags |= (IMA_MMAP_APPRAISED | IMA_APPRAISED); break; case BPRM_CHECK: iint->flags |= (IMA_BPRM_APPRAISED | IMA_APPRAISED); break; case CREDS_CHECK: iint->flags |= (IMA_CREDS_APPRAISED | IMA_APPRAISED); break; case FILE_CHECK: case POST_SETATTR: iint->flags |= (IMA_FILE_APPRAISED | IMA_APPRAISED); break; case MODULE_CHECK ... MAX_CHECK - 1: default: iint->flags |= (IMA_READ_APPRAISED | IMA_APPRAISED); break; } } enum hash_algo ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value, int xattr_len) { struct signature_v2_hdr *sig; enum hash_algo ret; if (!xattr_value || xattr_len < 2) /* return default hash algo */ return ima_hash_algo; switch (xattr_value->type) { case IMA_VERITY_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 3 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case EVM_IMA_XATTR_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 2 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ ret = xattr_value->data[0]; if (ret < HASH_ALGO__LAST) return ret; break; case IMA_XATTR_DIGEST: /* this is for backward compatibility */ if (xattr_len == 21) { unsigned int zero = 0; if (!memcmp(&xattr_value->data[16], &zero, 4)) return HASH_ALGO_MD5; else return HASH_ALGO_SHA1; } else if (xattr_len == 17) return HASH_ALGO_MD5; break; } /* return default hash algo */ return ima_hash_algo; } int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value, int xattr_len) { int ret; ret = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, (char **)xattr_value, xattr_len, GFP_NOFS); if (ret == -EOPNOTSUPP) ret = 0; return ret; } /* * calc_file_id_hash - calculate the hash of the ima_file_id struct data * @type: xattr type [enum evm_ima_xattr_type] * @algo: hash algorithm [enum hash_algo] * @digest: pointer to the digest to be hashed * @hash: (out) pointer to the hash * * IMA signature version 3 disambiguates the data that is signed by * indirectly signing the hash of the ima_file_id structure data. * * Signing the ima_file_id struct is currently only supported for * IMA_VERITY_DIGSIG type xattrs. * * Return 0 on success, error code otherwise. */ static int calc_file_id_hash(enum evm_ima_xattr_type type, enum hash_algo algo, const u8 *digest, struct ima_digest_data *hash) { struct ima_file_id file_id = { .hash_type = IMA_VERITY_DIGSIG, .hash_algorithm = algo}; unsigned int unused = HASH_MAX_DIGESTSIZE - hash_digest_size[algo]; if (type != IMA_VERITY_DIGSIG) return -EINVAL; memcpy(file_id.hash, digest, hash_digest_size[algo]); hash->algo = algo; hash->length = hash_digest_size[algo]; return ima_calc_buffer_hash(&file_id, sizeof(file_id) - unused, hash); } /* * xattr_verify - verify xattr digest or signature * * Verify whether the hash or signature matches the file contents. * * Return 0 on success, error code otherwise. */ static int xattr_verify(enum ima_hooks func, struct ima_iint_cache *iint, struct evm_ima_xattr_data *xattr_value, int xattr_len, enum integrity_status *status, const char **cause) { struct ima_max_digest_data hash; struct signature_v2_hdr *sig; int rc = -EINVAL, hash_start = 0; int mask; switch (xattr_value->type) { case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ hash_start = 1; fallthrough; case IMA_XATTR_DIGEST: if (*status != INTEGRITY_PASS_IMMUTABLE) { if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) *cause = "verity-signature-required"; else *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } clear_bit(IMA_DIGSIG, &iint->atomic_flags); } else { set_bit(IMA_DIGSIG, &iint->atomic_flags); } if (xattr_len - sizeof(xattr_value->type) - hash_start >= iint->ima_hash->length) /* * xattr length may be longer. md5 hash in previous * version occupied 20 bytes in xattr, instead of 16 */ rc = memcmp(&xattr_value->data[hash_start], iint->ima_hash->digest, iint->ima_hash->length); else rc = -EINVAL; if (rc) { *cause = "invalid-hash"; *status = INTEGRITY_FAIL; break; } *status = INTEGRITY_PASS; break; case EVM_IMA_XATTR_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); mask = IMA_DIGSIG_REQUIRED | IMA_VERITY_REQUIRED; if ((iint->flags & mask) == mask) { *cause = "verity-signature-required"; *status = INTEGRITY_FAIL; break; } sig = (typeof(sig))xattr_value; if (sig->version >= 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length); if (rc == -EOPNOTSUPP) { *status = INTEGRITY_UNKNOWN; break; } if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_digsig_verify(INTEGRITY_KEYRING_PLATFORM, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; case IMA_VERITY_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); if (iint->flags & IMA_DIGSIG_REQUIRED) { if (!(iint->flags & IMA_VERITY_REQUIRED)) { *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } } sig = (typeof(sig))xattr_value; if (sig->version != 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = calc_file_id_hash(IMA_VERITY_DIGSIG, iint->ima_hash->algo, iint->ima_hash->digest, container_of(&hash.hdr, struct ima_digest_data, hdr)); if (rc) { *cause = "sigv3-hashing-error"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, hash.digest, hash.hdr.length); if (rc) { *cause = "invalid-verity-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; default: *status = INTEGRITY_UNKNOWN; *cause = "unknown-ima-data"; break; } return rc; } /* * modsig_verify - verify modsig signature * * Verify whether the signature matches the file contents. * * Return 0 on success, error code otherwise. */ static int modsig_verify(enum ima_hooks func, const struct modsig *modsig, enum integrity_status *status, const char **cause) { int rc; rc = integrity_modsig_verify(INTEGRITY_KEYRING_IMA, modsig); if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_modsig_verify(INTEGRITY_KEYRING_PLATFORM, modsig); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } return rc; } /* * ima_check_blacklist - determine if the binary is blacklisted. * * Add the hash of the blacklisted binary to the measurement list, based * on policy. * * Returns -EPERM if the hash is blacklisted. */ int ima_check_blacklist(struct ima_iint_cache *iint, const struct modsig *modsig, int pcr) { enum hash_algo hash_algo; const u8 *digest = NULL; u32 digestsize = 0; int rc = 0; if (!(iint->flags & IMA_CHECK_BLACKLIST)) return 0; if (iint->flags & IMA_MODSIG_ALLOWED && modsig) { ima_get_modsig_digest(modsig, &hash_algo, &digest, &digestsize); rc = is_binary_blacklisted(digest, digestsize); } else if (iint->flags & IMA_DIGSIG_REQUIRED && iint->ima_hash) rc = is_binary_blacklisted(iint->ima_hash->digest, iint->ima_hash->length); if ((rc == -EPERM) && (iint->flags & IMA_MEASURE)) process_buffer_measurement(&nop_mnt_idmap, NULL, digest, digestsize, "blacklisted-hash", NONE, pcr, NULL, false, NULL, 0); return rc; } /* * ima_appraise_measurement - appraise file measurement * * Call evm_verifyxattr() to verify the integrity of 'security.ima'. * Assuming success, compare the xattr hash with the collected measurement. * * Return 0 on success, error code otherwise */ int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, bool bprm_is_check) { static const char op[] = "appraise_data"; int audit_msgno = AUDIT_INTEGRITY_DATA; const char *cause = "unknown"; struct dentry *dentry = file_dentry(file); struct inode *inode = d_backing_inode(dentry); enum integrity_status status = INTEGRITY_UNKNOWN; int rc = xattr_len; bool try_modsig = iint->flags & IMA_MODSIG_ALLOWED && modsig; /* If not appraising a modsig, we need an xattr. */ if (!(inode->i_opflags & IOP_XATTR) && !try_modsig) return INTEGRITY_UNKNOWN; /* * Unlike any of the other LSM hooks where the kernel enforces file * integrity, enforcing file integrity for the bprm_creds_for_exec() * LSM hook with the AT_EXECVE_CHECK flag is left up to the discretion * of the script interpreter(userspace). Differentiate kernel and * userspace enforced integrity audit messages. */ if (bprm_is_check) audit_msgno = AUDIT_INTEGRITY_USERSPACE; /* If reading the xattr failed and there's no modsig, error out. */ if (rc <= 0 && !try_modsig) { if (rc && rc != -ENODATA) goto out; if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) cause = "verity-signature-required"; else cause = "IMA-signature-required"; } else { cause = "missing-hash"; } status = INTEGRITY_NOLABEL; if (file->f_mode & FMODE_CREATED) iint->flags |= IMA_NEW_FILE; if ((iint->flags & IMA_NEW_FILE) && (!(iint->flags & IMA_DIGSIG_REQUIRED) || (inode->i_size == 0))) status = INTEGRITY_PASS; goto out; } status = evm_verifyxattr(dentry, XATTR_NAME_IMA, xattr_value, rc < 0 ? 0 : rc); switch (status) { case INTEGRITY_PASS: case INTEGRITY_PASS_IMMUTABLE: case INTEGRITY_UNKNOWN: break; case INTEGRITY_NOXATTRS: /* No EVM protected xattrs. */ /* It's fine not to have xattrs when using a modsig. */ if (try_modsig) break; fallthrough; case INTEGRITY_NOLABEL: /* No security.evm xattr. */ cause = "missing-HMAC"; goto out; case INTEGRITY_FAIL_IMMUTABLE: set_bit(IMA_DIGSIG, &iint->atomic_flags); cause = "invalid-fail-immutable"; goto out; case INTEGRITY_FAIL: /* Invalid HMAC/signature. */ cause = "invalid-HMAC"; goto out; default: WARN_ONCE(true, "Unexpected integrity status %d\n", status); } if (xattr_value) rc = xattr_verify(func, iint, xattr_value, xattr_len, &status, &cause); /* * If we have a modsig and either no imasig or the imasig's key isn't * known, then try verifying the modsig. */ if (try_modsig && (!xattr_value || xattr_value->type == IMA_XATTR_DIGEST_NG || rc == -ENOKEY)) rc = modsig_verify(func, modsig, &status, &cause); out: /* * File signatures on some filesystems can not be properly verified. * When such filesystems are mounted by an untrusted mounter or on a * system not willing to accept such a risk, fail the file signature * verification. */ if ((inode->i_sb->s_iflags & SB_I_IMA_UNVERIFIABLE_SIGNATURE) && ((inode->i_sb->s_iflags & SB_I_UNTRUSTED_MOUNTER) || (iint->flags & IMA_FAIL_UNVERIFIABLE_SIGS))) { status = INTEGRITY_FAIL; cause = "unverifiable-signature"; integrity_audit_msg(audit_msgno, inode, filename, op, cause, rc, 0); } else if (status != INTEGRITY_PASS) { /* Fix mode, but don't replace file signatures. */ if ((ima_appraise & IMA_APPRAISE_FIX) && !try_modsig && (!xattr_value || xattr_value->type != EVM_IMA_XATTR_DIGSIG)) { if (!ima_fix_xattr(dentry, iint)) status = INTEGRITY_PASS; } /* * Permit new files with file/EVM portable signatures, but * without data. */ if (inode->i_size == 0 && iint->flags & IMA_NEW_FILE && test_bit(IMA_DIGSIG, &iint->atomic_flags)) { status = INTEGRITY_PASS; } integrity_audit_msg(audit_msgno, inode, filename, op, cause, rc, 0); } else { ima_cache_flags(iint, func); } ima_set_cache_status(iint, func, status); return status; } /* * ima_update_xattr - update 'security.ima' hash value */ void ima_update_xattr(struct ima_iint_cache *iint, struct file *file) { struct dentry *dentry = file_dentry(file); int rc = 0; /* do not collect and update hash for digital signatures */ if (test_bit(IMA_DIGSIG, &iint->atomic_flags)) return; if ((iint->ima_file_status != INTEGRITY_PASS) && !(iint->flags & IMA_HASH)) return; rc = ima_collect_measurement(iint, file, NULL, 0, ima_hash_algo, NULL); if (rc < 0) return; inode_lock(file_inode(file)); ima_fix_xattr(dentry, iint); inode_unlock(file_inode(file)); } /** * ima_inode_post_setattr - reflect file metadata changes * @idmap: idmap of the mount the inode was found from * @dentry: pointer to the affected dentry * @ia_valid: for the UID and GID status * * Changes to a dentry's metadata might result in needing to appraise. * * This function is called from notify_change(), which expects the caller * to lock the inode's i_mutex. */ static void ima_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid) { struct inode *inode = d_backing_inode(dentry); struct ima_iint_cache *iint; int action; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode) || !(inode->i_opflags & IOP_XATTR)) return; action = ima_must_appraise(idmap, inode, MAY_ACCESS, POST_SETATTR); iint = ima_iint_find(inode); if (iint) { set_bit(IMA_CHANGE_ATTR, &iint->atomic_flags); if (!action) clear_bit(IMA_UPDATE_XATTR, &iint->atomic_flags); } } /* * ima_protect_xattr - protect 'security.ima' * * Ensure that not just anyone can modify or remove 'security.ima'. */ static int ima_protect_xattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { if (strcmp(xattr_name, XATTR_NAME_IMA) == 0) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return 1; } return 0; } /* * ima_reset_appraise_flags - reset ima_iint_cache flags * * @digsig: whether to clear/set IMA_DIGSIG flag, tristate values * 0: clear IMA_DIGSIG * 1: set IMA_DIGSIG * -1: don't change IMA_DIGSIG * */ static void ima_reset_appraise_flags(struct inode *inode, int digsig) { struct ima_iint_cache *iint; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode)) return; iint = ima_iint_find(inode); if (!iint) return; iint->measured_pcrs = 0; set_bit(IMA_CHANGE_XATTR, &iint->atomic_flags); if (digsig == 1) set_bit(IMA_DIGSIG, &iint->atomic_flags); else if (digsig == 0) clear_bit(IMA_DIGSIG, &iint->atomic_flags); } /** * validate_hash_algo() - Block setxattr with unsupported hash algorithms * @dentry: object of the setxattr() * @xattr_value: userland supplied xattr value * @xattr_value_len: length of xattr_value * * The xattr value is mapped to its hash algorithm, and this algorithm * must be built in the kernel for the setxattr to be allowed. * * Emit an audit message when the algorithm is invalid. * * Return: 0 on success, else an error. */ static int validate_hash_algo(struct dentry *dentry, const struct evm_ima_xattr_data *xattr_value, size_t xattr_value_len) { char *path = NULL, *pathbuf = NULL; enum hash_algo xattr_hash_algo; const char *errmsg = "unavailable-hash-algorithm"; unsigned int allowed_hashes; xattr_hash_algo = ima_get_hash_algo(xattr_value, xattr_value_len); allowed_hashes = atomic_read(&ima_setxattr_allowed_hash_algorithms); if (allowed_hashes) { /* success if the algorithm is allowed in the ima policy */ if (allowed_hashes & (1U << xattr_hash_algo)) return 0; /* * We use a different audit message when the hash algorithm * is denied by a policy rule, instead of not being built * in the kernel image */ errmsg = "denied-hash-algorithm"; } else { if (likely(xattr_hash_algo == ima_hash_algo)) return 0; /* allow any xattr using an algorithm built in the kernel */ if (crypto_has_alg(hash_algo_name[xattr_hash_algo], 0, 0)) return 0; } pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) return -EACCES; path = dentry_path(dentry, pathbuf, PATH_MAX); integrity_audit_msg(AUDIT_INTEGRITY_DATA, d_inode(dentry), path, "set_data", errmsg, -EACCES, 0); kfree(pathbuf); return -EACCES; } static int ima_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len, int flags) { const struct evm_ima_xattr_data *xvalue = xattr_value; int digsig = 0; int result; int err; result = ima_protect_xattr(dentry, xattr_name, xattr_value, xattr_value_len); if (result == 1) { if (!xattr_value_len || (xvalue->type >= IMA_XATTR_LAST)) return -EINVAL; err = validate_hash_algo(dentry, xvalue, xattr_value_len); if (err) return err; digsig = (xvalue->type == EVM_IMA_XATTR_DIGSIG); } else if (!strcmp(xattr_name, XATTR_NAME_EVM) && xattr_value_len > 0) { digsig = (xvalue->type == EVM_XATTR_PORTABLE_DIGSIG); } else { digsig = -1; } if (result == 1 || evm_revalidate_status(xattr_name)) { ima_reset_appraise_flags(d_backing_inode(dentry), digsig); if (result == 1) result = 0; } return result; } static int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { if (evm_revalidate_status(acl_name)) ima_reset_appraise_flags(d_backing_inode(dentry), -1); return 0; } static int ima_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) { int result, digsig = -1; result = ima_protect_xattr(dentry, xattr_name, NULL, 0); if (result == 1 || evm_revalidate_status(xattr_name)) { if (!strcmp(xattr_name, XATTR_NAME_IMA)) digsig = 0; ima_reset_appraise_flags(d_backing_inode(dentry), digsig); if (result == 1) result = 0; } return result; } static int ima_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return ima_inode_set_acl(idmap, dentry, acl_name, NULL); } static struct security_hook_list ima_appraise_hooks[] __ro_after_init = { LSM_HOOK_INIT(inode_post_setattr, ima_inode_post_setattr), LSM_HOOK_INIT(inode_setxattr, ima_inode_setxattr), LSM_HOOK_INIT(inode_set_acl, ima_inode_set_acl), LSM_HOOK_INIT(inode_removexattr, ima_inode_removexattr), LSM_HOOK_INIT(inode_remove_acl, ima_inode_remove_acl), }; void __init init_ima_appraise_lsm(const struct lsm_id *lsmid) { security_add_hooks(ima_appraise_hooks, ARRAY_SIZE(ima_appraise_hooks), lsmid); }
133 78 78 8 76 85 4 1 54 16 10 80 80 57 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* AF_RXRPC tracepoints * * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #undef TRACE_SYSTEM #define TRACE_SYSTEM rxrpc #if !defined(_TRACE_RXRPC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_RXRPC_H #include <linux/tracepoint.h> #include <linux/errqueue.h> /* * Declare tracing information enums and their string mappings for display. */ #define rxrpc_abort_reasons \ /* AFS errors */ \ EM(afs_abort_general_error, "afs-error") \ EM(afs_abort_interrupted, "afs-intr") \ EM(afs_abort_oom, "afs-oom") \ EM(afs_abort_op_not_supported, "afs-op-notsupp") \ EM(afs_abort_probeuuid_negative, "afs-probeuuid-neg") \ EM(afs_abort_send_data_error, "afs-send-data") \ EM(afs_abort_unmarshal_error, "afs-unmarshal") \ EM(afs_abort_unsupported_sec_class, "afs-unsup-sec-class") \ /* rxperf errors */ \ EM(rxperf_abort_general_error, "rxperf-error") \ EM(rxperf_abort_oom, "rxperf-oom") \ EM(rxperf_abort_op_not_supported, "rxperf-op-notsupp") \ EM(rxperf_abort_unmarshal_error, "rxperf-unmarshal") \ /* RxKAD security errors */ \ EM(rxkad_abort_1_short_check, "rxkad1-short-check") \ EM(rxkad_abort_1_short_data, "rxkad1-short-data") \ EM(rxkad_abort_1_short_encdata, "rxkad1-short-encdata") \ EM(rxkad_abort_1_short_header, "rxkad1-short-hdr") \ EM(rxkad_abort_2_short_check, "rxkad2-short-check") \ EM(rxkad_abort_2_short_data, "rxkad2-short-data") \ EM(rxkad_abort_2_short_header, "rxkad2-short-hdr") \ EM(rxkad_abort_2_short_len, "rxkad2-short-len") \ EM(rxkad_abort_bad_checksum, "rxkad2-bad-cksum") \ EM(rxkad_abort_chall_key_expired, "rxkad-chall-key-exp") \ EM(rxkad_abort_chall_level, "rxkad-chall-level") \ EM(rxkad_abort_chall_no_key, "rxkad-chall-nokey") \ EM(rxkad_abort_chall_short, "rxkad-chall-short") \ EM(rxkad_abort_chall_version, "rxkad-chall-version") \ EM(rxkad_abort_resp_bad_callid, "rxkad-resp-bad-callid") \ EM(rxkad_abort_resp_bad_checksum, "rxkad-resp-bad-cksum") \ EM(rxkad_abort_resp_bad_param, "rxkad-resp-bad-param") \ EM(rxkad_abort_resp_call_ctr, "rxkad-resp-call-ctr") \ EM(rxkad_abort_resp_call_state, "rxkad-resp-call-state") \ EM(rxkad_abort_resp_key_expired, "rxkad-resp-key-exp") \ EM(rxkad_abort_resp_key_rejected, "rxkad-resp-key-rej") \ EM(rxkad_abort_resp_level, "rxkad-resp-level") \ EM(rxkad_abort_resp_nokey, "rxkad-resp-nokey") \ EM(rxkad_abort_resp_ooseq, "rxkad-resp-ooseq") \ EM(rxkad_abort_resp_short, "rxkad-resp-short") \ EM(rxkad_abort_resp_short_tkt, "rxkad-resp-short-tkt") \ EM(rxkad_abort_resp_tkt_aname, "rxkad-resp-tk-aname") \ EM(rxkad_abort_resp_tkt_expired, "rxkad-resp-tk-exp") \ EM(rxkad_abort_resp_tkt_future, "rxkad-resp-tk-future") \ EM(rxkad_abort_resp_tkt_inst, "rxkad-resp-tk-inst") \ EM(rxkad_abort_resp_tkt_len, "rxkad-resp-tk-len") \ EM(rxkad_abort_resp_tkt_realm, "rxkad-resp-tk-realm") \ EM(rxkad_abort_resp_tkt_short, "rxkad-resp-tk-short") \ EM(rxkad_abort_resp_tkt_sinst, "rxkad-resp-tk-sinst") \ EM(rxkad_abort_resp_tkt_sname, "rxkad-resp-tk-sname") \ EM(rxkad_abort_resp_unknown_tkt, "rxkad-resp-unknown-tkt") \ EM(rxkad_abort_resp_version, "rxkad-resp-version") \ /* RxGK security errors */ \ EM(rxgk_abort_1_verify_mic_eproto, "rxgk1-vfy-mic-eproto") \ EM(rxgk_abort_2_decrypt_eproto, "rxgk2-dec-eproto") \ EM(rxgk_abort_2_short_data, "rxgk2-short-data") \ EM(rxgk_abort_2_short_encdata, "rxgk2-short-encdata") \ EM(rxgk_abort_2_short_header, "rxgk2-short-hdr") \ EM(rxgk_abort_bad_key_number, "rxgk-bad-key-num") \ EM(rxgk_abort_chall_key_expired, "rxgk-chall-key-exp") \ EM(rxgk_abort_chall_no_key, "rxgk-chall-nokey") \ EM(rxgk_abort_chall_short, "rxgk-chall-short") \ EM(rxgk_abort_resp_auth_dec, "rxgk-resp-auth-dec") \ EM(rxgk_abort_resp_bad_callid, "rxgk-resp-bad-callid") \ EM(rxgk_abort_resp_bad_nonce, "rxgk-resp-bad-nonce") \ EM(rxgk_abort_resp_bad_param, "rxgk-resp-bad-param") \ EM(rxgk_abort_resp_call_ctr, "rxgk-resp-call-ctr") \ EM(rxgk_abort_resp_call_state, "rxgk-resp-call-state") \ EM(rxgk_abort_resp_internal_error, "rxgk-resp-int-error") \ EM(rxgk_abort_resp_nopkg, "rxgk-resp-nopkg") \ EM(rxgk_abort_resp_short_applen, "rxgk-resp-short-applen") \ EM(rxgk_abort_resp_short_auth, "rxgk-resp-short-auth") \ EM(rxgk_abort_resp_short_call_list, "rxgk-resp-short-callls") \ EM(rxgk_abort_resp_short_packet, "rxgk-resp-short-packet") \ EM(rxgk_abort_resp_short_yfs_klen, "rxgk-resp-short-yfs-klen") \ EM(rxgk_abort_resp_short_yfs_key, "rxgk-resp-short-yfs-key") \ EM(rxgk_abort_resp_short_yfs_tkt, "rxgk-resp-short-yfs-tkt") \ EM(rxgk_abort_resp_tok_dec, "rxgk-resp-tok-dec") \ EM(rxgk_abort_resp_tok_internal_error, "rxgk-resp-tok-int-err") \ EM(rxgk_abort_resp_tok_keyerr, "rxgk-resp-tok-keyerr") \ EM(rxgk_abort_resp_tok_nokey, "rxgk-resp-tok-nokey") \ EM(rxgk_abort_resp_tok_nopkg, "rxgk-resp-tok-nopkg") \ EM(rxgk_abort_resp_tok_short, "rxgk-resp-tok-short") \ EM(rxgk_abort_resp_xdr_align, "rxgk-resp-xdr-align") \ /* rxrpc errors */ \ EM(rxrpc_abort_call_improper_term, "call-improper-term") \ EM(rxrpc_abort_call_reset, "call-reset") \ EM(rxrpc_abort_call_sendmsg, "call-sendmsg") \ EM(rxrpc_abort_call_sock_release, "call-sock-rel") \ EM(rxrpc_abort_call_sock_release_tba, "call-sock-rel-tba") \ EM(rxrpc_abort_call_timeout, "call-timeout") \ EM(rxrpc_abort_no_service_key, "no-serv-key") \ EM(rxrpc_abort_nomem, "nomem") \ EM(rxrpc_abort_response_sendmsg, "resp-sendmsg") \ EM(rxrpc_abort_service_not_offered, "serv-not-offered") \ EM(rxrpc_abort_shut_down, "shut-down") \ EM(rxrpc_abort_unsupported_security, "unsup-sec") \ EM(rxrpc_badmsg_bad_abort, "bad-abort") \ EM(rxrpc_badmsg_bad_jumbo, "bad-jumbo") \ EM(rxrpc_badmsg_short_ack, "short-ack") \ EM(rxrpc_badmsg_short_ack_trailer, "short-ack-trailer") \ EM(rxrpc_badmsg_short_hdr, "short-hdr") \ EM(rxrpc_badmsg_unsupported_packet, "unsup-pkt") \ EM(rxrpc_badmsg_zero_call, "zero-call") \ EM(rxrpc_badmsg_zero_seq, "zero-seq") \ EM(rxrpc_badmsg_zero_service, "zero-service") \ EM(rxrpc_eproto_ackr_outside_window, "ackr-out-win") \ EM(rxrpc_eproto_ackr_sack_overflow, "ackr-sack-over") \ EM(rxrpc_eproto_ackr_short_sack, "ackr-short-sack") \ EM(rxrpc_eproto_ackr_zero, "ackr-zero") \ EM(rxrpc_eproto_bad_upgrade, "bad-upgrade") \ EM(rxrpc_eproto_data_after_last, "data-after-last") \ EM(rxrpc_eproto_different_last, "diff-last") \ EM(rxrpc_eproto_early_reply, "early-reply") \ EM(rxrpc_eproto_improper_term, "improper-term") \ EM(rxrpc_eproto_no_client_call, "no-cl-call") \ EM(rxrpc_eproto_no_client_conn, "no-cl-conn") \ EM(rxrpc_eproto_no_service_call, "no-sv-call") \ EM(rxrpc_eproto_reupgrade, "re-upgrade") \ EM(rxrpc_eproto_rxnull_challenge, "rxnull-chall") \ EM(rxrpc_eproto_rxnull_response, "rxnull-resp") \ EM(rxrpc_eproto_tx_rot_last, "tx-rot-last") \ EM(rxrpc_eproto_unexpected_ack, "unex-ack") \ EM(rxrpc_eproto_unexpected_ackall, "unex-ackall") \ EM(rxrpc_eproto_unexpected_implicit_end, "unex-impl-end") \ EM(rxrpc_eproto_unexpected_reply, "unex-reply") \ EM(rxrpc_eproto_wrong_security, "wrong-sec") \ EM(rxrpc_recvmsg_excess_data, "recvmsg-excess") \ EM(rxrpc_recvmsg_short_data, "recvmsg-short") \ E_(rxrpc_sendmsg_late_send, "sendmsg-late") #define rxrpc_call_poke_traces \ EM(rxrpc_call_poke_abort, "Abort") \ EM(rxrpc_call_poke_complete, "Compl") \ EM(rxrpc_call_poke_conn_abort, "Conn-abort") \ EM(rxrpc_call_poke_error, "Error") \ EM(rxrpc_call_poke_idle, "Idle") \ EM(rxrpc_call_poke_rx_packet, "Rx-packet") \ EM(rxrpc_call_poke_set_timeout, "Set-timo") \ EM(rxrpc_call_poke_start, "Start") \ EM(rxrpc_call_poke_timer, "Timer") \ E_(rxrpc_call_poke_timer_now, "Timer-now") #define rxrpc_skb_traces \ EM(rxrpc_skb_eaten_by_unshare, "ETN unshare ") \ EM(rxrpc_skb_eaten_by_unshare_nomem, "ETN unshar-nm") \ EM(rxrpc_skb_get_call_rx, "GET call-rx ") \ EM(rxrpc_skb_get_conn_secured, "GET conn-secd") \ EM(rxrpc_skb_get_conn_work, "GET conn-work") \ EM(rxrpc_skb_get_local_work, "GET locl-work") \ EM(rxrpc_skb_get_post_oob, "GET post-oob ") \ EM(rxrpc_skb_get_reject_work, "GET rej-work ") \ EM(rxrpc_skb_get_to_recvmsg, "GET to-recv ") \ EM(rxrpc_skb_get_to_recvmsg_oos, "GET to-recv-o") \ EM(rxrpc_skb_new_encap_rcv, "NEW encap-rcv") \ EM(rxrpc_skb_new_error_report, "NEW error-rpt") \ EM(rxrpc_skb_new_jumbo_subpacket, "NEW jumbo-sub") \ EM(rxrpc_skb_new_response_rxgk, "NEW resp-rxgk") \ EM(rxrpc_skb_new_response_rxkad, "NEW resp-rxkd") \ EM(rxrpc_skb_new_unshared, "NEW unshared ") \ EM(rxrpc_skb_put_call_rx, "PUT call-rx ") \ EM(rxrpc_skb_put_challenge, "PUT challenge") \ EM(rxrpc_skb_put_conn_secured, "PUT conn-secd") \ EM(rxrpc_skb_put_conn_work, "PUT conn-work") \ EM(rxrpc_skb_put_error_report, "PUT error-rep") \ EM(rxrpc_skb_put_input, "PUT input ") \ EM(rxrpc_skb_put_jumbo_subpacket, "PUT jumbo-sub") \ EM(rxrpc_skb_put_oob, "PUT oob ") \ EM(rxrpc_skb_put_purge, "PUT purge ") \ EM(rxrpc_skb_put_purge_oob, "PUT purge-oob") \ EM(rxrpc_skb_put_response, "PUT response ") \ EM(rxrpc_skb_put_rotate, "PUT rotate ") \ EM(rxrpc_skb_put_unknown, "PUT unknown ") \ EM(rxrpc_skb_see_conn_work, "SEE conn-work") \ EM(rxrpc_skb_see_oob_challenge, "SEE oob-chall") \ EM(rxrpc_skb_see_recvmsg, "SEE recvmsg ") \ EM(rxrpc_skb_see_recvmsg_oob, "SEE recvm-oob") \ EM(rxrpc_skb_see_reject, "SEE reject ") \ EM(rxrpc_skb_see_rotate, "SEE rotate ") \ E_(rxrpc_skb_see_version, "SEE version ") #define rxrpc_local_traces \ EM(rxrpc_local_free, "FREE ") \ EM(rxrpc_local_get_call, "GET call ") \ EM(rxrpc_local_get_client_conn, "GET conn-cln") \ EM(rxrpc_local_get_for_use, "GET for-use ") \ EM(rxrpc_local_get_peer, "GET peer ") \ EM(rxrpc_local_get_prealloc_conn, "GET conn-pre") \ EM(rxrpc_local_new, "NEW ") \ EM(rxrpc_local_put_bind, "PUT bind ") \ EM(rxrpc_local_put_call, "PUT call ") \ EM(rxrpc_local_put_for_use, "PUT for-use ") \ EM(rxrpc_local_put_kill_conn, "PUT conn-kil") \ EM(rxrpc_local_put_peer, "PUT peer ") \ EM(rxrpc_local_put_prealloc_peer, "PUT peer-pre") \ EM(rxrpc_local_put_release_sock, "PUT rel-sock") \ EM(rxrpc_local_stop, "STOP ") \ EM(rxrpc_local_stopped, "STOPPED ") \ EM(rxrpc_local_unuse_bind, "UNU bind ") \ EM(rxrpc_local_unuse_conn_work, "UNU conn-wrk") \ EM(rxrpc_local_unuse_peer_keepalive, "UNU peer-kpa") \ EM(rxrpc_local_unuse_release_sock, "UNU rel-sock") \ EM(rxrpc_local_use_conn_work, "USE conn-wrk") \ EM(rxrpc_local_use_lookup, "USE lookup ") \ E_(rxrpc_local_use_peer_keepalive, "USE peer-kpa") #define rxrpc_peer_traces \ EM(rxrpc_peer_free, "FREE ") \ EM(rxrpc_peer_get_accept, "GET accept ") \ EM(rxrpc_peer_get_application, "GET app ") \ EM(rxrpc_peer_get_bundle, "GET bundle ") \ EM(rxrpc_peer_get_call, "GET call ") \ EM(rxrpc_peer_get_client_conn, "GET cln-conn") \ EM(rxrpc_peer_get_input, "GET input ") \ EM(rxrpc_peer_get_input_error, "GET inpt-err") \ EM(rxrpc_peer_get_keepalive, "GET keepaliv") \ EM(rxrpc_peer_get_lookup_client, "GET look-cln") \ EM(rxrpc_peer_get_service_conn, "GET srv-conn") \ EM(rxrpc_peer_new_client, "NEW client ") \ EM(rxrpc_peer_new_prealloc, "NEW prealloc") \ EM(rxrpc_peer_put_application, "PUT app ") \ EM(rxrpc_peer_put_bundle, "PUT bundle ") \ EM(rxrpc_peer_put_call, "PUT call ") \ EM(rxrpc_peer_put_conn, "PUT conn ") \ EM(rxrpc_peer_put_input, "PUT input ") \ EM(rxrpc_peer_put_input_error, "PUT inpt-err") \ E_(rxrpc_peer_put_keepalive, "PUT keepaliv") #define rxrpc_bundle_traces \ EM(rxrpc_bundle_free, "FREE ") \ EM(rxrpc_bundle_get_client_call, "GET clt-call") \ EM(rxrpc_bundle_get_client_conn, "GET clt-conn") \ EM(rxrpc_bundle_get_service_conn, "GET svc-conn") \ EM(rxrpc_bundle_put_call, "PUT call ") \ EM(rxrpc_bundle_put_conn, "PUT conn ") \ EM(rxrpc_bundle_put_discard, "PUT discard ") \ E_(rxrpc_bundle_new, "NEW ") #define rxrpc_conn_traces \ EM(rxrpc_conn_free, "FREE ") \ EM(rxrpc_conn_get_activate_call, "GET act-call") \ EM(rxrpc_conn_get_call_input, "GET inp-call") \ EM(rxrpc_conn_get_challenge_input, "GET inp-chal") \ EM(rxrpc_conn_get_conn_input, "GET inp-conn") \ EM(rxrpc_conn_get_idle, "GET idle ") \ EM(rxrpc_conn_get_poke_abort, "GET pk-abort") \ EM(rxrpc_conn_get_poke_response, "GET response") \ EM(rxrpc_conn_get_poke_secured, "GET secured ") \ EM(rxrpc_conn_get_poke_timer, "GET poke ") \ EM(rxrpc_conn_get_service_conn, "GET svc-conn") \ EM(rxrpc_conn_new_client, "NEW client ") \ EM(rxrpc_conn_new_service, "NEW service ") \ EM(rxrpc_conn_put_call, "PUT call ") \ EM(rxrpc_conn_put_call_input, "PUT inp-call") \ EM(rxrpc_conn_put_challenge_input, "PUT inp-chal") \ EM(rxrpc_conn_put_conn_input, "PUT inp-conn") \ EM(rxrpc_conn_put_discard_idle, "PUT disc-idl") \ EM(rxrpc_conn_put_local_dead, "PUT loc-dead") \ EM(rxrpc_conn_put_noreuse, "PUT noreuse ") \ EM(rxrpc_conn_put_oob, "PUT oob ") \ EM(rxrpc_conn_put_poke, "PUT poke ") \ EM(rxrpc_conn_put_service_reaped, "PUT svc-reap") \ EM(rxrpc_conn_put_unbundle, "PUT unbundle") \ EM(rxrpc_conn_put_unidle, "PUT unidle ") \ EM(rxrpc_conn_put_work, "PUT work ") \ EM(rxrpc_conn_queue_challenge, "QUE chall ") \ EM(rxrpc_conn_queue_retry_work, "QUE retry-wk") \ EM(rxrpc_conn_queue_rx_work, "QUE rx-work ") \ EM(rxrpc_conn_see_new_service_conn, "SEE new-svc ") \ EM(rxrpc_conn_see_reap_service, "SEE reap-svc") \ E_(rxrpc_conn_see_work, "SEE work ") #define rxrpc_client_traces \ EM(rxrpc_client_activate_chans, "Activa") \ EM(rxrpc_client_alloc, "Alloc ") \ EM(rxrpc_client_chan_activate, "ChActv") \ EM(rxrpc_client_chan_disconnect, "ChDisc") \ EM(rxrpc_client_chan_pass, "ChPass") \ EM(rxrpc_client_cleanup, "Clean ") \ EM(rxrpc_client_discard, "Discar") \ EM(rxrpc_client_exposed, "Expose") \ EM(rxrpc_client_replace, "Replac") \ EM(rxrpc_client_queue_new_call, "Q-Call") \ EM(rxrpc_client_to_active, "->Actv") \ E_(rxrpc_client_to_idle, "->Idle") #define rxrpc_call_traces \ EM(rxrpc_call_get_io_thread, "GET iothread") \ EM(rxrpc_call_get_input, "GET input ") \ EM(rxrpc_call_get_kernel_service, "GET krnl-srv") \ EM(rxrpc_call_get_notify_socket, "GET notify ") \ EM(rxrpc_call_get_poke, "GET poke ") \ EM(rxrpc_call_get_recvmsg, "GET recvmsg ") \ EM(rxrpc_call_get_release_sock, "GET rel-sock") \ EM(rxrpc_call_get_sendmsg, "GET sendmsg ") \ EM(rxrpc_call_get_userid, "GET user-id ") \ EM(rxrpc_call_new_client, "NEW client ") \ EM(rxrpc_call_new_prealloc_service, "NEW prealloc") \ EM(rxrpc_call_put_discard_prealloc, "PUT disc-pre") \ EM(rxrpc_call_put_discard_error, "PUT disc-err") \ EM(rxrpc_call_put_io_thread, "PUT iothread") \ EM(rxrpc_call_put_input, "PUT input ") \ EM(rxrpc_call_put_kernel, "PUT kernel ") \ EM(rxrpc_call_put_poke, "PUT poke ") \ EM(rxrpc_call_put_recvmsg, "PUT recvmsg ") \ EM(rxrpc_call_put_recvmsg_peek_nowait, "PUT peek-nwt") \ EM(rxrpc_call_put_release_recvmsg_q, "PUT rls-rcmq") \ EM(rxrpc_call_put_release_sock, "PUT rls-sock") \ EM(rxrpc_call_put_release_sock_tba, "PUT rls-sk-a") \ EM(rxrpc_call_put_sendmsg, "PUT sendmsg ") \ EM(rxrpc_call_put_userid_exists, "PUT u-exists") \ EM(rxrpc_call_put_userid, "PUT user-id ") \ EM(rxrpc_call_see_accept, "SEE accept ") \ EM(rxrpc_call_see_activate_client, "SEE act-clnt") \ EM(rxrpc_call_see_already_released, "SEE alrdy-rl") \ EM(rxrpc_call_see_connect_failed, "SEE con-fail") \ EM(rxrpc_call_see_connected, "SEE connect ") \ EM(rxrpc_call_see_conn_abort, "SEE conn-abt") \ EM(rxrpc_call_see_discard, "SEE discard ") \ EM(rxrpc_call_see_disconnected, "SEE disconn ") \ EM(rxrpc_call_see_distribute_error, "SEE dist-err") \ EM(rxrpc_call_see_input, "SEE input ") \ EM(rxrpc_call_see_notify_released, "SEE nfy-rlsd") \ EM(rxrpc_call_see_recvmsg, "SEE recvmsg ") \ EM(rxrpc_call_see_recvmsg_requeue, "SEE recv-rqu") \ EM(rxrpc_call_see_recvmsg_requeue_first, "SEE recv-rqF") \ EM(rxrpc_call_see_recvmsg_requeue_move, "SEE recv-rqM") \ EM(rxrpc_call_see_release, "SEE release ") \ EM(rxrpc_call_see_userid_exists, "SEE u-exists") \ EM(rxrpc_call_see_waiting_call, "SEE q-conn ") \ E_(rxrpc_call_see_zap, "SEE zap ") #define rxrpc_txqueue_traces \ EM(rxrpc_txqueue_await_reply, "AWR") \ EM(rxrpc_txqueue_end, "END") \ EM(rxrpc_txqueue_queue, "QUE") \ EM(rxrpc_txqueue_queue_last, "QLS") \ EM(rxrpc_txqueue_rotate, "ROT") \ EM(rxrpc_txqueue_rotate_last, "RLS") \ E_(rxrpc_txqueue_wait, "WAI") #define rxrpc_txdata_traces \ EM(rxrpc_txdata_inject_loss, " *INJ-LOSS*") \ EM(rxrpc_txdata_new_data, " ") \ EM(rxrpc_txdata_retransmit, " *RETRANS*") \ EM(rxrpc_txdata_tlp_new_data, " *TLP-NEW*") \ E_(rxrpc_txdata_tlp_retransmit, " *TLP-RETRANS*") #define rxrpc_receive_traces \ EM(rxrpc_receive_end, "END") \ EM(rxrpc_receive_front, "FRN") \ EM(rxrpc_receive_incoming, "INC") \ EM(rxrpc_receive_queue, "QUE") \ EM(rxrpc_receive_queue_last, "QLS") \ EM(rxrpc_receive_queue_oos, "QUO") \ EM(rxrpc_receive_queue_oos_last, "QOL") \ EM(rxrpc_receive_oos, "OOS") \ EM(rxrpc_receive_oos_last, "OSL") \ EM(rxrpc_receive_rotate, "ROT") \ E_(rxrpc_receive_rotate_last, "RLS") #define rxrpc_recvmsg_traces \ EM(rxrpc_recvmsg_cont, "CONT") \ EM(rxrpc_recvmsg_data_return, "DATA") \ EM(rxrpc_recvmsg_dequeue, "DEQU") \ EM(rxrpc_recvmsg_enter, "ENTR") \ EM(rxrpc_recvmsg_full, "FULL") \ EM(rxrpc_recvmsg_hole, "HOLE") \ EM(rxrpc_recvmsg_next, "NEXT") \ EM(rxrpc_recvmsg_oobq, "OOBQ") \ EM(rxrpc_recvmsg_requeue, "REQU") \ EM(rxrpc_recvmsg_return, "RETN") \ EM(rxrpc_recvmsg_terminal, "TERM") \ EM(rxrpc_recvmsg_to_be_accepted, "TBAC") \ EM(rxrpc_recvmsg_unqueue, "UNQU") \ E_(rxrpc_recvmsg_wait, "WAIT") #define rxrpc_rtt_tx_traces \ EM(rxrpc_rtt_tx_cancel, "CNCE") \ EM(rxrpc_rtt_tx_data, "DATA") \ EM(rxrpc_rtt_tx_no_slot, "FULL") \ E_(rxrpc_rtt_tx_ping, "PING") #define rxrpc_rtt_rx_traces \ EM(rxrpc_rtt_rx_data_ack, "DACK") \ EM(rxrpc_rtt_rx_obsolete, "OBSL") \ EM(rxrpc_rtt_rx_lost, "LOST") \ E_(rxrpc_rtt_rx_ping_response, "PONG") #define rxrpc_timer_traces \ EM(rxrpc_timer_trace_delayed_ack, "DelayAck ") \ EM(rxrpc_timer_trace_expect_rx, "ExpectRx ") \ EM(rxrpc_timer_trace_hard, "HardLimit") \ EM(rxrpc_timer_trace_idle, "IdleLimit") \ EM(rxrpc_timer_trace_keepalive, "KeepAlive") \ EM(rxrpc_timer_trace_ping, "DelayPing") \ EM(rxrpc_timer_trace_rack_off, "RACK-OFF ") \ EM(rxrpc_timer_trace_rack_zwp, "RACK-ZWP ") \ EM(rxrpc_timer_trace_rack_reo, "RACK-Reo ") \ EM(rxrpc_timer_trace_rack_tlp_pto, "TLP-PTO ") \ E_(rxrpc_timer_trace_rack_rto, "RTO ") #define rxrpc_propose_ack_traces \ EM(rxrpc_propose_ack_client_tx_end, "ClTxEnd") \ EM(rxrpc_propose_ack_delayed_ack, "DlydAck") \ EM(rxrpc_propose_ack_input_data, "DataIn ") \ EM(rxrpc_propose_ack_input_data_hole, "DataInH") \ EM(rxrpc_propose_ack_ping_for_keepalive, "KeepAlv") \ EM(rxrpc_propose_ack_ping_for_lost_ack, "LostAck") \ EM(rxrpc_propose_ack_ping_for_lost_reply, "LostRpl") \ EM(rxrpc_propose_ack_ping_for_0_retrans, "0-Retrn") \ EM(rxrpc_propose_ack_ping_for_mtu_probe, "MTUProb") \ EM(rxrpc_propose_ack_ping_for_old_rtt, "OldRtt ") \ EM(rxrpc_propose_ack_ping_for_params, "Params ") \ EM(rxrpc_propose_ack_ping_for_rtt, "Rtt ") \ EM(rxrpc_propose_ack_processing_op, "ProcOp ") \ EM(rxrpc_propose_ack_respond_to_ack, "Rsp2Ack") \ EM(rxrpc_propose_ack_respond_to_ping, "Rsp2Png") \ EM(rxrpc_propose_ack_retransmit, "Retrans") \ EM(rxrpc_propose_ack_retry_tx, "RetryTx") \ EM(rxrpc_propose_ack_rotate_rx, "RxAck ") \ EM(rxrpc_propose_ack_rx_idle, "RxIdle ") \ E_(rxrpc_propose_ack_terminal_ack, "ClTerm ") #define rxrpc_ca_states \ EM(RXRPC_CA_CONGEST_AVOIDANCE, "CongAvoid") \ EM(RXRPC_CA_FAST_RETRANSMIT, "FastReTx ") \ EM(RXRPC_CA_PACKET_LOSS, "PktLoss ") \ E_(RXRPC_CA_SLOW_START, "SlowStart") #define rxrpc_congest_changes \ EM(rxrpc_cong_begin_retransmission, " Retrans") \ EM(rxrpc_cong_cleared_nacks, " Cleared") \ EM(rxrpc_cong_new_low_nack, " NewLowN") \ EM(rxrpc_cong_no_change, " -") \ EM(rxrpc_cong_progress, " Progres") \ EM(rxrpc_cong_idle_reset, " IdleRes") \ EM(rxrpc_cong_retransmit_again, " ReTxAgn") \ EM(rxrpc_cong_rtt_window_end, " RttWinE") \ E_(rxrpc_cong_saw_nack, " SawNack") #define rxrpc_pkts \ EM(0, "?00") \ EM(RXRPC_PACKET_TYPE_DATA, "DATA") \ EM(RXRPC_PACKET_TYPE_ACK, "ACK") \ EM(RXRPC_PACKET_TYPE_BUSY, "BUSY") \ EM(RXRPC_PACKET_TYPE_ABORT, "ABORT") \ EM(RXRPC_PACKET_TYPE_ACKALL, "ACKALL") \ EM(RXRPC_PACKET_TYPE_CHALLENGE, "CHALL") \ EM(RXRPC_PACKET_TYPE_RESPONSE, "RESP") \ EM(RXRPC_PACKET_TYPE_DEBUG, "DEBUG") \ EM(9, "?09") \ EM(10, "?10") \ EM(11, "?11") \ EM(12, "?12") \ EM(RXRPC_PACKET_TYPE_VERSION, "VERSION") \ EM(14, "?14") \ E_(15, "?15") #define rxrpc_ack_names \ EM(0, "-0-") \ EM(RXRPC_ACK_REQUESTED, "REQ") \ EM(RXRPC_ACK_DUPLICATE, "DUP") \ EM(RXRPC_ACK_OUT_OF_SEQUENCE, "OOS") \ EM(RXRPC_ACK_EXCEEDS_WINDOW, "WIN") \ EM(RXRPC_ACK_NOSPACE, "MEM") \ EM(RXRPC_ACK_PING, "PNG") \ EM(RXRPC_ACK_PING_RESPONSE, "PNR") \ EM(RXRPC_ACK_DELAY, "DLY") \ EM(RXRPC_ACK_IDLE, "IDL") \ E_(RXRPC_ACK__INVALID, "-?-") #define rxrpc_sack_traces \ EM(rxrpc_sack_advance, "ADV") \ EM(rxrpc_sack_fill, "FIL") \ EM(rxrpc_sack_nack, "NAK") \ EM(rxrpc_sack_none, "---") \ E_(rxrpc_sack_oos, "OOS") #define rxrpc_completions \ EM(RXRPC_CALL_SUCCEEDED, "Succeeded") \ EM(RXRPC_CALL_REMOTELY_ABORTED, "RemoteAbort") \ EM(RXRPC_CALL_LOCALLY_ABORTED, "LocalAbort") \ EM(RXRPC_CALL_LOCAL_ERROR, "LocalError") \ E_(RXRPC_CALL_NETWORK_ERROR, "NetError") #define rxrpc_tx_points \ EM(rxrpc_tx_point_call_abort, "CallAbort") \ EM(rxrpc_tx_point_call_ack, "CallAck") \ EM(rxrpc_tx_point_call_data_frag, "CallDataFrag") \ EM(rxrpc_tx_point_call_data_nofrag, "CallDataNofrag") \ EM(rxrpc_tx_point_call_final_resend, "CallFinalResend") \ EM(rxrpc_tx_point_conn_abort, "ConnAbort") \ EM(rxrpc_tx_point_reject, "Reject") \ EM(rxrpc_tx_point_rxgk_challenge, "RxGKChall") \ EM(rxrpc_tx_point_rxkad_challenge, "RxkadChall") \ EM(rxrpc_tx_point_response, "Response") \ EM(rxrpc_tx_point_version_keepalive, "VerKeepalive") \ E_(rxrpc_tx_point_version_reply, "VerReply") #define rxrpc_req_ack_traces \ EM(rxrpc_reqack_ack_lost, "ACK-LOST ") \ EM(rxrpc_reqack_app_stall, "APP-STALL ") \ EM(rxrpc_reqack_more_rtt, "MORE-RTT ") \ EM(rxrpc_reqack_no_srv_last, "NO-SRVLAST") \ EM(rxrpc_reqack_old_rtt, "OLD-RTT ") \ EM(rxrpc_reqack_retrans, "RETRANS ") \ EM(rxrpc_reqack_slow_start, "SLOW-START") \ E_(rxrpc_reqack_small_txwin, "SMALL-TXWN") /* ---- Must update size of stat_why_req_ack[] if more are added! */ #define rxrpc_txbuf_traces \ EM(rxrpc_txbuf_alloc_data, "ALLOC DATA ") \ EM(rxrpc_txbuf_alloc_response, "ALLOC RESP ") \ EM(rxrpc_txbuf_free, "FREE ") \ EM(rxrpc_txbuf_get_buffer, "GET BUFFER ") \ EM(rxrpc_txbuf_get_trans, "GET TRANS ") \ EM(rxrpc_txbuf_get_retrans, "GET RETRANS") \ EM(rxrpc_txbuf_put_cleaned, "PUT CLEANED") \ EM(rxrpc_txbuf_put_nomem, "PUT NOMEM ") \ EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ EM(rxrpc_txbuf_put_response_tx, "PUT RESP TX") \ EM(rxrpc_txbuf_put_send_aborted, "PUT SEND-X ") \ EM(rxrpc_txbuf_put_trans, "PUT TRANS ") \ EM(rxrpc_txbuf_see_lost, "SEE LOST ") \ EM(rxrpc_txbuf_see_out_of_step, "OUT-OF-STEP") \ E_(rxrpc_txbuf_see_send_more, "SEE SEND+ ") #define rxrpc_tq_traces \ EM(rxrpc_tq_alloc, "ALLOC") \ EM(rxrpc_tq_cleaned, "CLEAN") \ EM(rxrpc_tq_decant, "DCNT ") \ EM(rxrpc_tq_decant_advance, "DCNT>") \ EM(rxrpc_tq_queue, "QUEUE") \ EM(rxrpc_tq_queue_dup, "QUE!!") \ EM(rxrpc_tq_rotate, "ROT ") \ EM(rxrpc_tq_rotate_and_free, "ROT-F") \ EM(rxrpc_tq_rotate_and_keep, "ROT-K") \ EM(rxrpc_tq_transmit, "XMIT ") \ E_(rxrpc_tq_transmit_advance, "XMIT>") #define rxrpc_pmtud_reduce_traces \ EM(rxrpc_pmtud_reduce_ack, "Ack ") \ EM(rxrpc_pmtud_reduce_icmp, "Icmp ") \ E_(rxrpc_pmtud_reduce_route, "Route") #define rxrpc_rotate_traces \ EM(rxrpc_rotate_trace_hack, "hard-ack") \ EM(rxrpc_rotate_trace_sack, "soft-ack") \ E_(rxrpc_rotate_trace_snak, "soft-nack") #define rxrpc_rack_timer_modes \ EM(RXRPC_CALL_RACKTIMER_OFF, "---") \ EM(RXRPC_CALL_RACKTIMER_RACK_REORDER, "REO") \ EM(RXRPC_CALL_RACKTIMER_TLP_PTO, "TLP") \ E_(RXRPC_CALL_RACKTIMER_RTO, "RTO") #define rxrpc_tlp_probe_traces \ EM(rxrpc_tlp_probe_trace_busy, "busy") \ EM(rxrpc_tlp_probe_trace_transmit_new, "transmit-new") \ E_(rxrpc_tlp_probe_trace_retransmit, "retransmit") #define rxrpc_tlp_ack_traces \ EM(rxrpc_tlp_ack_trace_acked, "acked") \ EM(rxrpc_tlp_ack_trace_dup_acked, "dup-acked") \ EM(rxrpc_tlp_ack_trace_hard_beyond, "hard-beyond") \ EM(rxrpc_tlp_ack_trace_incomplete, "incomplete") \ E_(rxrpc_tlp_ack_trace_new_data, "new-data") /* * Generate enums for tracing information. */ #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY #define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY #undef EM #undef E_ #define EM(a, b) a, #define E_(a, b) a enum rxrpc_abort_reason { rxrpc_abort_reasons } __mode(byte); enum rxrpc_bundle_trace { rxrpc_bundle_traces } __mode(byte); enum rxrpc_call_poke_trace { rxrpc_call_poke_traces } __mode(byte); enum rxrpc_call_trace { rxrpc_call_traces } __mode(byte); enum rxrpc_client_trace { rxrpc_client_traces } __mode(byte); enum rxrpc_congest_change { rxrpc_congest_changes } __mode(byte); enum rxrpc_conn_trace { rxrpc_conn_traces } __mode(byte); enum rxrpc_local_trace { rxrpc_local_traces } __mode(byte); enum rxrpc_peer_trace { rxrpc_peer_traces } __mode(byte); enum rxrpc_pmtud_reduce_trace { rxrpc_pmtud_reduce_traces } __mode(byte); enum rxrpc_propose_ack_outcome { rxrpc_propose_ack_outcomes } __mode(byte); enum rxrpc_propose_ack_trace { rxrpc_propose_ack_traces } __mode(byte); enum rxrpc_receive_trace { rxrpc_receive_traces } __mode(byte); enum rxrpc_recvmsg_trace { rxrpc_recvmsg_traces } __mode(byte); enum rxrpc_req_ack_trace { rxrpc_req_ack_traces } __mode(byte); enum rxrpc_rotate_trace { rxrpc_rotate_traces } __mode(byte); enum rxrpc_rtt_rx_trace { rxrpc_rtt_rx_traces } __mode(byte); enum rxrpc_rtt_tx_trace { rxrpc_rtt_tx_traces } __mode(byte); enum rxrpc_sack_trace { rxrpc_sack_traces } __mode(byte); enum rxrpc_skb_trace { rxrpc_skb_traces } __mode(byte); enum rxrpc_timer_trace { rxrpc_timer_traces } __mode(byte); enum rxrpc_tlp_ack_trace { rxrpc_tlp_ack_traces } __mode(byte); enum rxrpc_tlp_probe_trace { rxrpc_tlp_probe_traces } __mode(byte); enum rxrpc_tq_trace { rxrpc_tq_traces } __mode(byte); enum rxrpc_tx_point { rxrpc_tx_points } __mode(byte); enum rxrpc_txbuf_trace { rxrpc_txbuf_traces } __mode(byte); enum rxrpc_txdata_trace { rxrpc_txdata_traces } __mode(byte); enum rxrpc_txqueue_trace { rxrpc_txqueue_traces } __mode(byte); #endif /* end __RXRPC_DECLARE_TRACE_ENUMS_ONCE_ONLY */ /* * Export enum symbols via userspace. */ #undef EM #undef E_ #ifndef RXRPC_TRACE_ONLY_DEFINE_ENUMS #define EM(a, b) TRACE_DEFINE_ENUM(a); #define E_(a, b) TRACE_DEFINE_ENUM(a); rxrpc_abort_reasons; rxrpc_bundle_traces; rxrpc_ca_states; rxrpc_call_poke_traces; rxrpc_call_traces; rxrpc_client_traces; rxrpc_congest_changes; rxrpc_conn_traces; rxrpc_local_traces; rxrpc_pmtud_reduce_traces; rxrpc_propose_ack_traces; rxrpc_rack_timer_modes; rxrpc_receive_traces; rxrpc_recvmsg_traces; rxrpc_req_ack_traces; rxrpc_rotate_traces; rxrpc_rtt_rx_traces; rxrpc_rtt_tx_traces; rxrpc_sack_traces; rxrpc_skb_traces; rxrpc_timer_traces; rxrpc_tlp_ack_traces; rxrpc_tlp_probe_traces; rxrpc_tq_traces; rxrpc_tx_points; rxrpc_txbuf_traces; rxrpc_txdata_traces; rxrpc_txqueue_traces; /* * Now redefine the EM() and E_() macros to map the enums to the strings that * will be printed in the output. */ #undef EM #undef E_ #define EM(a, b) { a, b }, #define E_(a, b) { a, b } TRACE_EVENT(rxrpc_local, TP_PROTO(unsigned int local_debug_id, enum rxrpc_local_trace op, int ref, int usage), TP_ARGS(local_debug_id, op, ref, usage), TP_STRUCT__entry( __field(unsigned int, local) __field(int, op) __field(int, ref) __field(int, usage) ), TP_fast_assign( __entry->local = local_debug_id; __entry->op = op; __entry->ref = ref; __entry->usage = usage; ), TP_printk("L=%08x %s r=%d u=%d", __entry->local, __print_symbolic(__entry->op, rxrpc_local_traces), __entry->ref, __entry->usage) ); TRACE_EVENT(rxrpc_iothread_rx, TP_PROTO(struct rxrpc_local *local, unsigned int nr_rx), TP_ARGS(local, nr_rx), TP_STRUCT__entry( __field(unsigned int, local) __field(unsigned int, nr_rx) ), TP_fast_assign( __entry->local = local->debug_id; __entry->nr_rx = nr_rx; ), TP_printk("L=%08x nrx=%u", __entry->local, __entry->nr_rx) ); TRACE_EVENT(rxrpc_peer, TP_PROTO(unsigned int peer_debug_id, int ref, enum rxrpc_peer_trace why), TP_ARGS(peer_debug_id, ref, why), TP_STRUCT__entry( __field(unsigned int, peer) __field(int, ref) __field(enum rxrpc_peer_trace, why) ), TP_fast_assign( __entry->peer = peer_debug_id; __entry->ref = ref; __entry->why = why; ), TP_printk("P=%08x %s r=%d", __entry->peer, __print_symbolic(__entry->why, rxrpc_peer_traces), __entry->ref) ); TRACE_EVENT(rxrpc_bundle, TP_PROTO(unsigned int bundle_debug_id, int ref, enum rxrpc_bundle_trace why), TP_ARGS(bundle_debug_id, ref, why), TP_STRUCT__entry( __field(unsigned int, bundle) __field(int, ref) __field(int, why) ), TP_fast_assign( __entry->bundle = bundle_debug_id; __entry->ref = ref; __entry->why = why; ), TP_printk("CB=%08x %s r=%d", __entry->bundle, __print_symbolic(__entry->why, rxrpc_bundle_traces), __entry->ref) ); TRACE_EVENT(rxrpc_conn, TP_PROTO(unsigned int conn_debug_id, int ref, enum rxrpc_conn_trace why), TP_ARGS(conn_debug_id, ref, why), TP_STRUCT__entry( __field(unsigned int, conn) __field(int, ref) __field(int, why) ), TP_fast_assign( __entry->conn = conn_debug_id; __entry->ref = ref; __entry->why = why; ), TP_printk("C=%08x %s r=%d", __entry->conn, __print_symbolic(__entry->why, rxrpc_conn_traces), __entry->ref) ); TRACE_EVENT(rxrpc_client, TP_PROTO(struct rxrpc_connection *conn, int channel, enum rxrpc_client_trace op), TP_ARGS(conn, channel, op), TP_STRUCT__entry( __field(unsigned int, conn) __field(u32, cid) __field(int, channel) __field(int, usage) __field(enum rxrpc_client_trace, op) ), TP_fast_assign( __entry->conn = conn ? conn->debug_id : 0; __entry->channel = channel; __entry->usage = conn ? refcount_read(&conn->ref) : -2; __entry->op = op; __entry->cid = conn ? conn->proto.cid : 0; ), TP_printk("C=%08x h=%2d %s i=%08x u=%d", __entry->conn, __entry->channel, __print_symbolic(__entry->op, rxrpc_client_traces), __entry->cid, __entry->usage) ); TRACE_EVENT(rxrpc_call, TP_PROTO(unsigned int call_debug_id, int ref, unsigned long aux, enum rxrpc_call_trace why), TP_ARGS(call_debug_id, ref, aux, why), TP_STRUCT__entry( __field(unsigned int, call) __field(int, ref) __field(int, why) __field(unsigned long, aux) ), TP_fast_assign( __entry->call = call_debug_id; __entry->ref = ref; __entry->why = why; __entry->aux = aux; ), TP_printk("c=%08x %s r=%d a=%lx", __entry->call, __print_symbolic(__entry->why, rxrpc_call_traces), __entry->ref, __entry->aux) ); TRACE_EVENT(rxrpc_skb, TP_PROTO(struct sk_buff *skb, int usage, int mod_count, enum rxrpc_skb_trace why), TP_ARGS(skb, usage, mod_count, why), TP_STRUCT__entry( __field(struct sk_buff *, skb) __field(int, usage) __field(int, mod_count) __field(enum rxrpc_skb_trace, why) ), TP_fast_assign( __entry->skb = skb; __entry->usage = usage; __entry->mod_count = mod_count; __entry->why = why; ), TP_printk("s=%p Rx %s u=%d m=%d", __entry->skb, __print_symbolic(__entry->why, rxrpc_skb_traces), __entry->usage, __entry->mod_count) ); TRACE_EVENT(rxrpc_rx_packet, TP_PROTO(struct rxrpc_skb_priv *sp), TP_ARGS(sp), TP_STRUCT__entry( __field_struct(struct rxrpc_host_header, hdr) ), TP_fast_assign( memcpy(&__entry->hdr, &sp->hdr, sizeof(__entry->hdr)); ), TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x %s", __entry->hdr.epoch, __entry->hdr.cid, __entry->hdr.callNumber, __entry->hdr.serviceId, __entry->hdr.serial, __entry->hdr.seq, __entry->hdr.securityIndex, __entry->hdr.flags, __print_symbolic(__entry->hdr.type, rxrpc_pkts)) ); TRACE_EVENT(rxrpc_rx_done, TP_PROTO(int result, int abort_code), TP_ARGS(result, abort_code), TP_STRUCT__entry( __field(int, result) __field(int, abort_code) ), TP_fast_assign( __entry->result = result; __entry->abort_code = abort_code; ), TP_printk("r=%d a=%d", __entry->result, __entry->abort_code) ); TRACE_EVENT(rxrpc_abort_call, TP_PROTO(const struct rxrpc_call *call, int abort_code), TP_ARGS(call, abort_code), TP_STRUCT__entry( __field(unsigned int, call_nr) __field(enum rxrpc_abort_reason, why) __field(int, abort_code) __field(int, error) ), TP_fast_assign( __entry->call_nr = call->debug_id; __entry->why = call->send_abort_why; __entry->abort_code = abort_code; __entry->error = call->send_abort_err; ), TP_printk("c=%08x a=%d e=%d %s", __entry->call_nr, __entry->abort_code, __entry->error, __print_symbolic(__entry->why, rxrpc_abort_reasons)) ); TRACE_EVENT(rxrpc_abort, TP_PROTO(unsigned int call_nr, enum rxrpc_abort_reason why, u32 cid, u32 call_id, rxrpc_seq_t seq, int abort_code, int error), TP_ARGS(call_nr, why, cid, call_id, seq, abort_code, error), TP_STRUCT__entry( __field(unsigned int, call_nr) __field(enum rxrpc_abort_reason, why) __field(u32, cid) __field(u32, call_id) __field(rxrpc_seq_t, seq) __field(int, abort_code) __field(int, error) ), TP_fast_assign( __entry->call_nr = call_nr; __entry->why = why; __entry->cid = cid; __entry->call_id = call_id; __entry->abort_code = abort_code; __entry->error = error; __entry->seq = seq; ), TP_printk("c=%08x %08x:%08x s=%u a=%d e=%d %s", __entry->call_nr, __entry->cid, __entry->call_id, __entry->seq, __entry->abort_code, __entry->error, __print_symbolic(__entry->why, rxrpc_abort_reasons)) ); TRACE_EVENT(rxrpc_call_complete, TP_PROTO(struct rxrpc_call *call), TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_call_completion, compl) __field(int, error) __field(u32, abort_code) ), TP_fast_assign( __entry->call = call->debug_id; __entry->compl = call->completion; __entry->error = call->error; __entry->abort_code = call->abort_code; ), TP_printk("c=%08x %s r=%d ac=%d", __entry->call, __print_symbolic(__entry->compl, rxrpc_completions), __entry->error, __entry->abort_code) ); TRACE_EVENT(rxrpc_txqueue, TP_PROTO(struct rxrpc_call *call, enum rxrpc_txqueue_trace why), TP_ARGS(call, why), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_txqueue_trace, why) __field(rxrpc_seq_t, tx_bottom) __field(rxrpc_seq_t, acks_hard_ack) __field(rxrpc_seq_t, tx_top) __field(rxrpc_seq_t, send_top) __field(int, tx_winsize) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; __entry->tx_bottom = call->tx_bottom; __entry->acks_hard_ack = call->acks_hard_ack; __entry->tx_top = call->tx_top; __entry->send_top = call->send_top; __entry->tx_winsize = call->tx_winsize; ), TP_printk("c=%08x %s b=%08x h=%08x n=%u/%u/%u/%u", __entry->call, __print_symbolic(__entry->why, rxrpc_txqueue_traces), __entry->tx_bottom, __entry->acks_hard_ack, __entry->acks_hard_ack - __entry->tx_bottom, __entry->tx_top - __entry->acks_hard_ack, __entry->send_top - __entry->tx_top, __entry->tx_winsize) ); TRACE_EVENT(rxrpc_transmit, TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t send_top, int space), TP_ARGS(call, send_top, space), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_seq_t, seq) __field(u16, space) __field(u16, tx_winsize) __field(u16, cong_cwnd) __field(u16, cong_extra) __field(u16, in_flight) __field(u16, prepared) __field(u16, pmtud_jumbo) ), TP_fast_assign( __entry->call = call->debug_id; __entry->seq = call->tx_top + 1; __entry->space = space; __entry->tx_winsize = call->tx_winsize; __entry->cong_cwnd = call->cong_cwnd; __entry->cong_extra = call->cong_extra; __entry->prepared = send_top - call->tx_bottom; __entry->in_flight = call->tx_top - call->tx_bottom; __entry->pmtud_jumbo = call->peer->pmtud_jumbo; ), TP_printk("c=%08x q=%08x sp=%u tw=%u cw=%u+%u pr=%u if=%u pj=%u", __entry->call, __entry->seq, __entry->space, __entry->tx_winsize, __entry->cong_cwnd, __entry->cong_extra, __entry->prepared, __entry->in_flight, __entry->pmtud_jumbo) ); TRACE_EVENT(rxrpc_tx_rotate, TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, rxrpc_seq_t to), TP_ARGS(call, seq, to), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_seq_t, seq) __field(rxrpc_seq_t, to) __field(rxrpc_seq_t, top) ), TP_fast_assign( __entry->call = call->debug_id; __entry->seq = seq; __entry->to = to; __entry->top = call->tx_top; ), TP_printk("c=%08x q=%08x-%08x-%08x", __entry->call, __entry->seq, __entry->to, __entry->top) ); TRACE_EVENT(rxrpc_rx_data, TP_PROTO(unsigned int call, rxrpc_seq_t seq, rxrpc_serial_t serial, u8 flags), TP_ARGS(call, seq, serial, flags), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_seq_t, seq) __field(rxrpc_serial_t, serial) __field(u8, flags) ), TP_fast_assign( __entry->call = call; __entry->seq = seq; __entry->serial = serial; __entry->flags = flags; ), TP_printk("c=%08x DATA %08x q=%08x fl=%02x", __entry->call, __entry->serial, __entry->seq, __entry->flags) ); TRACE_EVENT(rxrpc_rx_ack, TP_PROTO(struct rxrpc_call *call, struct rxrpc_skb_priv *sp), TP_ARGS(call, sp), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_serial_t, serial) __field(rxrpc_serial_t, ack_serial) __field(rxrpc_seq_t, first) __field(rxrpc_seq_t, prev) __field(u8, reason) __field(u8, n_acks) __field(u8, user_status) ), TP_fast_assign( __entry->call = call->debug_id; __entry->serial = sp->hdr.serial; __entry->user_status = sp->hdr.userStatus; __entry->ack_serial = sp->ack.acked_serial; __entry->first = sp->ack.first_ack; __entry->prev = sp->ack.prev_ack; __entry->reason = sp->ack.reason; __entry->n_acks = sp->ack.nr_acks; ), TP_printk("c=%08x %08x %s r=%08x us=%02x f=%08x p=%08x n=%u", __entry->call, __entry->serial, __print_symbolic(__entry->reason, rxrpc_ack_names), __entry->ack_serial, __entry->user_status, __entry->first, __entry->prev, __entry->n_acks) ); TRACE_EVENT(rxrpc_rx_abort, TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial, u32 abort_code), TP_ARGS(call, serial, abort_code), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_serial_t, serial) __field(u32, abort_code) ), TP_fast_assign( __entry->call = call->debug_id; __entry->serial = serial; __entry->abort_code = abort_code; ), TP_printk("c=%08x ABORT %08x ac=%d", __entry->call, __entry->serial, __entry->abort_code) ); TRACE_EVENT(rxrpc_rx_conn_abort, TP_PROTO(const struct rxrpc_connection *conn, const struct sk_buff *skb), TP_ARGS(conn, skb), TP_STRUCT__entry( __field(unsigned int, conn) __field(rxrpc_serial_t, serial) __field(u32, abort_code) ), TP_fast_assign( __entry->conn = conn->debug_id; __entry->serial = rxrpc_skb(skb)->hdr.serial; __entry->abort_code = skb->priority; ), TP_printk("C=%08x ABORT %08x ac=%d", __entry->conn, __entry->serial, __entry->abort_code) ); TRACE_EVENT(rxrpc_tx_challenge, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, u32 version, u32 nonce), TP_ARGS(conn, serial, version, nonce), TP_STRUCT__entry( __field(unsigned int, conn) __field(rxrpc_serial_t, serial) __field(u32, version) __field(u32, nonce) __field(u16, service_id) __field(u8, security_ix) ), TP_fast_assign( __entry->conn = conn->debug_id; __entry->serial = serial; __entry->version = version; __entry->nonce = nonce; __entry->service_id = conn->service_id; __entry->security_ix = conn->security_ix; ), TP_printk("C=%08x CHALLENGE r=%08x sv=%u+%u v=%x n=%x", __entry->conn, __entry->serial, __entry->service_id, __entry->security_ix, __entry->version, __entry->nonce) ); TRACE_EVENT(rxrpc_rx_challenge, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, u32 version, u32 nonce, u32 min_level), TP_ARGS(conn, serial, version, nonce, min_level), TP_STRUCT__entry( __field(unsigned int, conn) __field(rxrpc_serial_t, serial) __field(u32, version) __field(u32, nonce) __field(u32, min_level) __field(u16, service_id) __field(u8, security_ix) ), TP_fast_assign( __entry->conn = conn->debug_id; __entry->serial = serial; __entry->version = version; __entry->nonce = nonce; __entry->min_level = min_level; __entry->service_id = conn->service_id; __entry->security_ix = conn->security_ix; ), TP_printk("C=%08x CHALLENGE r=%08x sv=%u+%u v=%x n=%x ml=%x", __entry->conn, __entry->serial, __entry->service_id, __entry->security_ix, __entry->version, __entry->nonce, __entry->min_level) ); TRACE_EVENT(rxrpc_tx_response, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, struct rxrpc_skb_priv *rsp), TP_ARGS(conn, serial, rsp), TP_STRUCT__entry( __field(unsigned int, conn) __field(rxrpc_serial_t, serial) __field(rxrpc_serial_t, challenge) __field(u32, version) __field(u32, kvno) __field(u16, ticket_len) __field(u16, appdata_len) __field(u16, service_id) __field(u8, security_ix) ), TP_fast_assign( __entry->conn = conn->debug_id; __entry->serial = serial; __entry->challenge = rsp->resp.challenge_serial; __entry->version = rsp->resp.version; __entry->kvno = rsp->resp.kvno; __entry->ticket_len = rsp->resp.ticket_len; __entry->service_id = conn->service_id; __entry->security_ix = conn->security_ix; ), TP_printk("C=%08x RESPONSE r=%08x cr=%08x sv=%u+%u v=%x kv=%x tl=%u", __entry->conn, __entry->serial, __entry->challenge, __entry->service_id, __entry->security_ix, __entry->version, __entry->kvno, __entry->ticket_len) ); TRACE_EVENT(rxrpc_rx_response, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, u32 version, u32 kvno, u32 ticket_len), TP_ARGS(conn, serial, version, kvno, ticket_len), TP_STRUCT__entry( __field(unsigned int, conn) __field(rxrpc_serial_t, serial) __field(u32, version) __field(u32, kvno) __field(u32, ticket_len) __field(u8, security_ix) ), TP_fast_assign( __entry->conn = conn->debug_id; __entry->serial = serial; __entry->version = version; __entry->kvno = kvno; __entry->ticket_len = ticket_len; __entry->security_ix = conn->security_ix; ), TP_printk("C=%08x RESPONSE r=%08x sx=%u v=%x kvno=%x tl=%x", __entry->conn, __entry->serial, __entry->security_ix, __entry->version, __entry->kvno, __entry->ticket_len) ); TRACE_EVENT(rxrpc_rx_rwind_change, TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial, u32 rwind, bool wake), TP_ARGS(call, serial, rwind, wake), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_serial_t, serial) __field(u32, rwind) __field(bool, wake) ), TP_fast_assign( __entry->call = call->debug_id; __entry->serial = serial; __entry->rwind = rwind; __entry->wake = wake; ), TP_printk("c=%08x %08x rw=%u%s", __entry->call, __entry->serial, __entry->rwind, __entry->wake ? " wake" : "") ); TRACE_EVENT(rxrpc_tx_packet, TP_PROTO(unsigned int call_id, struct rxrpc_wire_header *whdr, enum rxrpc_tx_point where), TP_ARGS(call_id, whdr, where), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_tx_point, where) __field_struct(struct rxrpc_wire_header, whdr) ), TP_fast_assign( __entry->call = call_id; memcpy(&__entry->whdr, whdr, sizeof(__entry->whdr)); __entry->where = where; ), TP_printk("c=%08x %08x:%08x:%08x:%04x %08x %08x %02x %02x %s %s", __entry->call, ntohl(__entry->whdr.epoch), ntohl(__entry->whdr.cid), ntohl(__entry->whdr.callNumber), ntohs(__entry->whdr.serviceId), ntohl(__entry->whdr.serial), ntohl(__entry->whdr.seq), __entry->whdr.type, __entry->whdr.flags, __entry->whdr.type <= 15 ? __print_symbolic(__entry->whdr.type, rxrpc_pkts) : "?UNK", __print_symbolic(__entry->where, rxrpc_tx_points)) ); TRACE_EVENT(rxrpc_tx_data, TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, rxrpc_serial_t serial, unsigned int flags, enum rxrpc_txdata_trace trace), TP_ARGS(call, seq, serial, flags, trace), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_seq_t, seq) __field(rxrpc_serial_t, serial) __field(u32, cid) __field(u32, call_id) __field(u16, flags) __field(enum rxrpc_txdata_trace, trace) ), TP_fast_assign( __entry->call = call->debug_id; __entry->cid = call->cid; __entry->call_id = call->call_id; __entry->seq = seq; __entry->serial = serial; __entry->flags = flags; __entry->trace = trace; ), TP_printk("c=%08x DATA %08x:%08x %08x q=%08x fl=%02x%s", __entry->call, __entry->cid, __entry->call_id, __entry->serial, __entry->seq, __entry->flags & RXRPC_TXBUF_WIRE_FLAGS, __print_symbolic(__entry->trace, rxrpc_txdata_traces)) ); TRACE_EVENT(rxrpc_tx_ack, TP_PROTO(unsigned int call, rxrpc_serial_t serial, rxrpc_seq_t ack_first, rxrpc_serial_t ack_serial, u8 reason, u8 n_acks, u16 rwind, enum rxrpc_propose_ack_trace trace), TP_ARGS(call, serial, ack_first, ack_serial, reason, n_acks, rwind, trace), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_serial_t, serial) __field(rxrpc_seq_t, ack_first) __field(rxrpc_serial_t, ack_serial) __field(u8, reason) __field(u8, n_acks) __field(u16, rwind) __field(enum rxrpc_propose_ack_trace, trace) ), TP_fast_assign( __entry->call = call; __entry->serial = serial; __entry->ack_first = ack_first; __entry->ack_serial = ack_serial; __entry->reason = reason; __entry->n_acks = n_acks; __entry->rwind = rwind; __entry->trace = trace; ), TP_printk(" c=%08x ACK %08x %s f=%08x r=%08x n=%u rw=%u %s", __entry->call, __entry->serial, __print_symbolic(__entry->reason, rxrpc_ack_names), __entry->ack_first, __entry->ack_serial, __entry->n_acks, __entry->rwind, __print_symbolic(__entry->trace, rxrpc_propose_ack_traces)) ); TRACE_EVENT(rxrpc_receive, TP_PROTO(struct rxrpc_call *call, enum rxrpc_receive_trace why, rxrpc_serial_t serial, rxrpc_seq_t seq), TP_ARGS(call, why, serial, seq), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_receive_trace, why) __field(rxrpc_serial_t, serial) __field(rxrpc_seq_t, seq) __field(rxrpc_seq_t, window) __field(rxrpc_seq_t, wtop) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; __entry->serial = serial; __entry->seq = seq; __entry->window = call->ackr_window; __entry->wtop = call->ackr_wtop; ), TP_printk("c=%08x %s r=%08x q=%08x w=%08x-%08x", __entry->call, __print_symbolic(__entry->why, rxrpc_receive_traces), __entry->serial, __entry->seq, __entry->window, __entry->wtop) ); TRACE_EVENT(rxrpc_recvmsg, TP_PROTO(unsigned int call_debug_id, enum rxrpc_recvmsg_trace why, int ret), TP_ARGS(call_debug_id, why, ret), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_recvmsg_trace, why) __field(int, ret) ), TP_fast_assign( __entry->call = call_debug_id; __entry->why = why; __entry->ret = ret; ), TP_printk("c=%08x %s ret=%d", __entry->call, __print_symbolic(__entry->why, rxrpc_recvmsg_traces), __entry->ret) ); TRACE_EVENT(rxrpc_recvdata, TP_PROTO(struct rxrpc_call *call, enum rxrpc_recvmsg_trace why, rxrpc_seq_t seq, unsigned int offset, unsigned int len, int ret), TP_ARGS(call, why, seq, offset, len, ret), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_recvmsg_trace, why) __field(rxrpc_seq_t, seq) __field(unsigned int, offset) __field(unsigned int, len) __field(int, ret) ), TP_fast_assign( __entry->call = call ? call->debug_id : 0; __entry->why = why; __entry->seq = seq; __entry->offset = offset; __entry->len = len; __entry->ret = ret; ), TP_printk("c=%08x %s q=%08x o=%u l=%u ret=%d", __entry->call, __print_symbolic(__entry->why, rxrpc_recvmsg_traces), __entry->seq, __entry->offset, __entry->len, __entry->ret) ); TRACE_EVENT(rxrpc_rtt_tx, TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_tx_trace why, int slot, rxrpc_serial_t send_serial), TP_ARGS(call, why, slot, send_serial), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_rtt_tx_trace, why) __field(int, slot) __field(rxrpc_serial_t, send_serial) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; __entry->slot = slot; __entry->send_serial = send_serial; ), TP_printk("c=%08x [%d] %s sr=%08x", __entry->call, __entry->slot, __print_symbolic(__entry->why, rxrpc_rtt_tx_traces), __entry->send_serial) ); TRACE_EVENT(rxrpc_rtt_rx, TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, int slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, u32 rtt, u32 srtt, u32 rto), TP_ARGS(call, why, slot, send_serial, resp_serial, rtt, srtt, rto), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_rtt_rx_trace, why) __field(int, slot) __field(rxrpc_serial_t, send_serial) __field(rxrpc_serial_t, resp_serial) __field(u32, rtt) __field(u32, srtt) __field(u32, rto) __field(u32, min_rtt) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; __entry->slot = slot; __entry->send_serial = send_serial; __entry->resp_serial = resp_serial; __entry->rtt = rtt; __entry->srtt = srtt; __entry->rto = rto; __entry->min_rtt = minmax_get(&call->min_rtt) ), TP_printk("c=%08x [%d] %s sr=%08x rr=%08x rtt=%u srtt=%u rto=%u min=%u", __entry->call, __entry->slot, __print_symbolic(__entry->why, rxrpc_rtt_rx_traces), __entry->send_serial, __entry->resp_serial, __entry->rtt, __entry->srtt / 8, __entry->rto, __entry->min_rtt) ); TRACE_EVENT(rxrpc_timer_set, TP_PROTO(struct rxrpc_call *call, ktime_t delay, enum rxrpc_timer_trace why), TP_ARGS(call, delay, why), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_timer_trace, why) __field(ktime_t, delay) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; __entry->delay = delay; ), TP_printk("c=%08x %s to=%lld", __entry->call, __print_symbolic(__entry->why, rxrpc_timer_traces), ktime_to_us(__entry->delay)) ); TRACE_EVENT(rxrpc_timer_exp, TP_PROTO(struct rxrpc_call *call, ktime_t delay, enum rxrpc_timer_trace why), TP_ARGS(call, delay, why), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_timer_trace, why) __field(ktime_t, delay) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; __entry->delay = delay; ), TP_printk("c=%08x %s to=%lld", __entry->call, __print_symbolic(__entry->why, rxrpc_timer_traces), ktime_to_us(__entry->delay)) ); TRACE_EVENT(rxrpc_timer_can, TP_PROTO(struct rxrpc_call *call, enum rxrpc_timer_trace why), TP_ARGS(call, why), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_timer_trace, why) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; ), TP_printk("c=%08x %s", __entry->call, __print_symbolic(__entry->why, rxrpc_timer_traces)) ); TRACE_EVENT(rxrpc_timer_restart, TP_PROTO(struct rxrpc_call *call, ktime_t delay, unsigned long delayj), TP_ARGS(call, delay, delayj), TP_STRUCT__entry( __field(unsigned int, call) __field(unsigned long, delayj) __field(ktime_t, delay) ), TP_fast_assign( __entry->call = call->debug_id; __entry->delayj = delayj; __entry->delay = delay; ), TP_printk("c=%08x to=%lld j=%ld", __entry->call, ktime_to_us(__entry->delay), __entry->delayj) ); TRACE_EVENT(rxrpc_timer_expired, TP_PROTO(struct rxrpc_call *call), TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, call) ), TP_fast_assign( __entry->call = call->debug_id; ), TP_printk("c=%08x EXPIRED", __entry->call) ); TRACE_EVENT(rxrpc_rx_lose, TP_PROTO(struct rxrpc_skb_priv *sp), TP_ARGS(sp), TP_STRUCT__entry( __field_struct(struct rxrpc_host_header, hdr) ), TP_fast_assign( memcpy(&__entry->hdr, &sp->hdr, sizeof(__entry->hdr)); ), TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x %s *LOSE*", __entry->hdr.epoch, __entry->hdr.cid, __entry->hdr.callNumber, __entry->hdr.serviceId, __entry->hdr.serial, __entry->hdr.seq, __entry->hdr.type, __entry->hdr.flags, __entry->hdr.type <= 15 ? __print_symbolic(__entry->hdr.type, rxrpc_pkts) : "?UNK") ); TRACE_EVENT(rxrpc_propose_ack, TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why, u8 ack_reason, rxrpc_serial_t serial), TP_ARGS(call, why, ack_reason, serial), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_propose_ack_trace, why) __field(rxrpc_serial_t, serial) __field(u8, ack_reason) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; __entry->serial = serial; __entry->ack_reason = ack_reason; ), TP_printk("c=%08x %s %s r=%08x", __entry->call, __print_symbolic(__entry->why, rxrpc_propose_ack_traces), __print_symbolic(__entry->ack_reason, rxrpc_ack_names), __entry->serial) ); TRACE_EVENT(rxrpc_send_ack, TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why, u8 ack_reason, rxrpc_serial_t serial), TP_ARGS(call, why, ack_reason, serial), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_propose_ack_trace, why) __field(rxrpc_serial_t, serial) __field(u8, ack_reason) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; __entry->serial = serial; __entry->ack_reason = ack_reason; ), TP_printk("c=%08x %s %s r=%08x", __entry->call, __print_symbolic(__entry->why, rxrpc_propose_ack_traces), __print_symbolic(__entry->ack_reason, rxrpc_ack_names), __entry->serial) ); TRACE_EVENT(rxrpc_drop_ack, TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why, u8 ack_reason, rxrpc_serial_t serial, bool nobuf), TP_ARGS(call, why, ack_reason, serial, nobuf), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_propose_ack_trace, why) __field(rxrpc_serial_t, serial) __field(u8, ack_reason) __field(bool, nobuf) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; __entry->serial = serial; __entry->ack_reason = ack_reason; __entry->nobuf = nobuf; ), TP_printk("c=%08x %s %s r=%08x nbf=%u", __entry->call, __print_symbolic(__entry->why, rxrpc_propose_ack_traces), __print_symbolic(__entry->ack_reason, rxrpc_ack_names), __entry->serial, __entry->nobuf) ); TRACE_EVENT(rxrpc_retransmit, TP_PROTO(struct rxrpc_call *call, struct rxrpc_send_data_req *req, struct rxrpc_txbuf *txb), TP_ARGS(call, req, txb), TP_STRUCT__entry( __field(unsigned int, call) __field(unsigned int, qbase) __field(rxrpc_seq_t, seq) __field(rxrpc_serial_t, serial) ), TP_fast_assign( __entry->call = call->debug_id; __entry->qbase = req->tq->qbase; __entry->seq = req->seq; __entry->serial = txb->serial; ), TP_printk("c=%08x tq=%x q=%x r=%x", __entry->call, __entry->qbase, __entry->seq, __entry->serial) ); TRACE_EVENT(rxrpc_congest, TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary), TP_ARGS(call, summary), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_ca_state, ca_state) __field(rxrpc_seq_t, hard_ack) __field(rxrpc_seq_t, top) __field(rxrpc_seq_t, lowest_nak) __field(u16, nr_sacks) __field(u16, nr_snacks) __field(u16, cwnd) __field(u16, ssthresh) __field(u16, cumul_acks) __field(u16, dup_acks) __field_struct(struct rxrpc_ack_summary, sum) ), TP_fast_assign( __entry->call = call->debug_id; __entry->ca_state = call->cong_ca_state; __entry->hard_ack = call->acks_hard_ack; __entry->top = call->tx_top; __entry->lowest_nak = call->acks_lowest_nak; __entry->nr_sacks = call->acks_nr_sacks; __entry->nr_snacks = call->acks_nr_snacks; __entry->cwnd = call->cong_cwnd; __entry->ssthresh = call->cong_ssthresh; __entry->cumul_acks = call->cong_cumul_acks; __entry->dup_acks = call->cong_dup_acks; memcpy(&__entry->sum, summary, sizeof(__entry->sum)); ), TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u A=%u+%u/%u+%u r=%u b=%u u=%u d=%u l=%x%s%s%s", __entry->call, __entry->sum.acked_serial, __print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names), __entry->hard_ack, __print_symbolic(__entry->ca_state, rxrpc_ca_states), __entry->cwnd, __entry->ssthresh, __entry->nr_sacks, __entry->sum.nr_new_sacks, __entry->nr_snacks, __entry->sum.nr_new_snacks, __entry->sum.nr_new_hacks, __entry->top - __entry->hard_ack, __entry->cumul_acks, __entry->dup_acks, __entry->lowest_nak, __entry->sum.new_low_snack ? "!" : "", __print_symbolic(__entry->sum.change, rxrpc_congest_changes), __entry->sum.retrans_timeo ? " rTxTo" : "") ); TRACE_EVENT(rxrpc_reset_cwnd, TP_PROTO(struct rxrpc_call *call, ktime_t since_last_tx, ktime_t rtt), TP_ARGS(call, since_last_tx, rtt), TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_ca_state, ca_state) __field(unsigned short, cwnd) __field(unsigned short, extra) __field(rxrpc_seq_t, hard_ack) __field(rxrpc_seq_t, prepared) __field(ktime_t, since_last_tx) __field(ktime_t, rtt) __field(bool, has_data) ), TP_fast_assign( __entry->call = call->debug_id; __entry->ca_state = call->cong_ca_state; __entry->cwnd = call->cong_cwnd; __entry->extra = call->cong_extra; __entry->hard_ack = call->acks_hard_ack; __entry->prepared = call->send_top - call->tx_bottom; __entry->since_last_tx = since_last_tx; __entry->rtt = rtt; __entry->has_data = call->tx_bottom != call->tx_top; ), TP_printk("c=%08x q=%08x %s cw=%u+%u pr=%u tm=%llu/%llu d=%u", __entry->call, __entry->hard_ack, __print_symbolic(__entry->ca_state, rxrpc_ca_states), __entry->cwnd, __entry->extra, __entry->prepared, ktime_to_us(__entry->since_last_tx), ktime_to_us(__entry->rtt), __entry->has_data) ); TRACE_EVENT(rxrpc_disconnect_call, TP_PROTO(struct rxrpc_call *call), TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, call) __field(u32, abort_code) ), TP_fast_assign( __entry->call = call->debug_id; __entry->abort_code = call->abort_code; ), TP_printk("c=%08x ab=%08x", __entry->call, __entry->abort_code) ); TRACE_EVENT(rxrpc_improper_term, TP_PROTO(struct rxrpc_call *call), TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, call) __field(u32, abort_code) ), TP_fast_assign( __entry->call = call->debug_id; __entry->abort_code = call->abort_code; ), TP_printk("c=%08x ab=%08x", __entry->call, __entry->abort_code) ); TRACE_EVENT(rxrpc_connect_call, TP_PROTO(struct rxrpc_call *call), TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, call) __field(unsigned long, user_call_ID) __field(u32, cid) __field(u32, call_id) __field_struct(struct sockaddr_rxrpc, srx) ), TP_fast_assign( __entry->call = call->debug_id; __entry->user_call_ID = call->user_call_ID; __entry->cid = call->cid; __entry->call_id = call->call_id; __entry->srx = call->dest_srx; ), TP_printk("c=%08x u=%p %08x:%08x dst=%pISp", __entry->call, (void *)__entry->user_call_ID, __entry->cid, __entry->call_id, &__entry->srx.transport) ); TRACE_EVENT(rxrpc_apply_acks, TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq), TP_ARGS(call, tq), TP_STRUCT__entry( __field(unsigned int, call) __field(unsigned int, nr_rep) __field(rxrpc_seq_t, qbase) __field(unsigned long, acks) ), TP_fast_assign( __entry->call = call->debug_id; __entry->qbase = tq->qbase; __entry->acks = tq->segment_acked; __entry->nr_rep = tq->nr_reported_acks; ), TP_printk("c=%08x tq=%x acks=%016lx rep=%u", __entry->call, __entry->qbase, __entry->acks, __entry->nr_rep) ); TRACE_EVENT(rxrpc_resend, TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t ack_serial), TP_ARGS(call, ack_serial), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_seq_t, seq) __field(rxrpc_seq_t, transmitted) __field(rxrpc_serial_t, ack_serial) ), TP_fast_assign( __entry->call = call->debug_id; __entry->seq = call->acks_hard_ack; __entry->transmitted = call->tx_transmitted; __entry->ack_serial = ack_serial; ), TP_printk("c=%08x r=%x q=%x tq=%x", __entry->call, __entry->ack_serial, __entry->seq, __entry->transmitted) ); TRACE_EVENT(rxrpc_resend_lost, TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq, unsigned long lost), TP_ARGS(call, tq, lost), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_seq_t, qbase) __field(u8, nr_rep) __field(unsigned long, lost) ), TP_fast_assign( __entry->call = call->debug_id; __entry->qbase = tq->qbase; __entry->nr_rep = tq->nr_reported_acks; __entry->lost = lost; ), TP_printk("c=%08x tq=%x lost=%016lx nr=%u", __entry->call, __entry->qbase, __entry->lost, __entry->nr_rep) ); TRACE_EVENT(rxrpc_rotate, TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq, struct rxrpc_ack_summary *summary, rxrpc_seq_t seq, enum rxrpc_rotate_trace trace), TP_ARGS(call, tq, summary, seq, trace), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_seq_t, qbase) __field(rxrpc_seq_t, seq) __field(unsigned int, nr_rep) __field(enum rxrpc_rotate_trace, trace) ), TP_fast_assign( __entry->call = call->debug_id; __entry->qbase = tq->qbase; __entry->seq = seq; __entry->nr_rep = tq->nr_reported_acks; __entry->trace = trace; ), TP_printk("c=%08x tq=%x q=%x nr=%x %s", __entry->call, __entry->qbase, __entry->seq, __entry->nr_rep, __print_symbolic(__entry->trace, rxrpc_rotate_traces)) ); TRACE_EVENT(rxrpc_rx_icmp, TP_PROTO(struct rxrpc_peer *peer, struct sock_extended_err *ee, struct sockaddr_rxrpc *srx), TP_ARGS(peer, ee, srx), TP_STRUCT__entry( __field(unsigned int, peer) __field_struct(struct sock_extended_err, ee) __field_struct(struct sockaddr_rxrpc, srx) ), TP_fast_assign( __entry->peer = peer->debug_id; memcpy(&__entry->ee, ee, sizeof(__entry->ee)); memcpy(&__entry->srx, srx, sizeof(__entry->srx)); ), TP_printk("P=%08x o=%u t=%u c=%u i=%u d=%u e=%d %pISp", __entry->peer, __entry->ee.ee_origin, __entry->ee.ee_type, __entry->ee.ee_code, __entry->ee.ee_info, __entry->ee.ee_data, __entry->ee.ee_errno, &__entry->srx.transport) ); TRACE_EVENT(rxrpc_tx_fail, TP_PROTO(unsigned int debug_id, rxrpc_serial_t serial, int ret, enum rxrpc_tx_point where), TP_ARGS(debug_id, serial, ret, where), TP_STRUCT__entry( __field(unsigned int, debug_id) __field(rxrpc_serial_t, serial) __field(int, ret) __field(enum rxrpc_tx_point, where) ), TP_fast_assign( __entry->debug_id = debug_id; __entry->serial = serial; __entry->ret = ret; __entry->where = where; ), TP_printk("c=%08x r=%x ret=%d %s", __entry->debug_id, __entry->serial, __entry->ret, __print_symbolic(__entry->where, rxrpc_tx_points)) ); TRACE_EVENT(rxrpc_call_reset, TP_PROTO(struct rxrpc_call *call), TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, debug_id) __field(u32, cid) __field(u32, call_id) __field(rxrpc_serial_t, call_serial) __field(rxrpc_serial_t, conn_serial) __field(rxrpc_seq_t, tx_seq) __field(rxrpc_seq_t, rx_seq) ), TP_fast_assign( __entry->debug_id = call->debug_id; __entry->cid = call->cid; __entry->call_id = call->call_id; __entry->call_serial = call->rx_serial; __entry->conn_serial = call->conn->hi_serial; __entry->tx_seq = call->acks_hard_ack; __entry->rx_seq = call->rx_highest_seq; ), TP_printk("c=%08x %08x:%08x r=%08x/%08x tx=%08x rx=%08x", __entry->debug_id, __entry->cid, __entry->call_id, __entry->call_serial, __entry->conn_serial, __entry->tx_seq, __entry->rx_seq) ); TRACE_EVENT(rxrpc_notify_socket, TP_PROTO(unsigned int debug_id, rxrpc_serial_t serial), TP_ARGS(debug_id, serial), TP_STRUCT__entry( __field(unsigned int, debug_id) __field(rxrpc_serial_t, serial) ), TP_fast_assign( __entry->debug_id = debug_id; __entry->serial = serial; ), TP_printk("c=%08x r=%08x", __entry->debug_id, __entry->serial) ); TRACE_EVENT(rxrpc_rx_discard_ack, TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial, rxrpc_seq_t hard_ack, rxrpc_seq_t prev_pkt), TP_ARGS(call, serial, hard_ack, prev_pkt), TP_STRUCT__entry( __field(unsigned int, debug_id) __field(rxrpc_serial_t, serial) __field(rxrpc_seq_t, hard_ack) __field(rxrpc_seq_t, prev_pkt) __field(rxrpc_seq_t, acks_hard_ack) __field(rxrpc_seq_t, acks_prev_seq) ), TP_fast_assign( __entry->debug_id = call->debug_id; __entry->serial = serial; __entry->hard_ack = hard_ack; __entry->prev_pkt = prev_pkt; __entry->acks_hard_ack = call->acks_hard_ack; __entry->acks_prev_seq = call->acks_prev_seq; ), TP_printk("c=%08x r=%08x %08x<%08x %08x<%08x", __entry->debug_id, __entry->serial, __entry->hard_ack, __entry->acks_hard_ack, __entry->prev_pkt, __entry->acks_prev_seq) ); TRACE_EVENT(rxrpc_req_ack, TP_PROTO(unsigned int call_debug_id, rxrpc_seq_t seq, enum rxrpc_req_ack_trace why), TP_ARGS(call_debug_id, seq, why), TP_STRUCT__entry( __field(unsigned int, call_debug_id) __field(rxrpc_seq_t, seq) __field(enum rxrpc_req_ack_trace, why) ), TP_fast_assign( __entry->call_debug_id = call_debug_id; __entry->seq = seq; __entry->why = why; ), TP_printk("c=%08x q=%08x REQ-%s", __entry->call_debug_id, __entry->seq, __print_symbolic(__entry->why, rxrpc_req_ack_traces)) ); TRACE_EVENT(rxrpc_txbuf, TP_PROTO(unsigned int debug_id, unsigned int call_debug_id, rxrpc_seq_t seq, int ref, enum rxrpc_txbuf_trace what), TP_ARGS(debug_id, call_debug_id, seq, ref, what), TP_STRUCT__entry( __field(unsigned int, debug_id) __field(unsigned int, call_debug_id) __field(rxrpc_seq_t, seq) __field(int, ref) __field(enum rxrpc_txbuf_trace, what) ), TP_fast_assign( __entry->debug_id = debug_id; __entry->call_debug_id = call_debug_id; __entry->seq = seq; __entry->ref = ref; __entry->what = what; ), TP_printk("B=%08x c=%08x q=%08x %s r=%d", __entry->debug_id, __entry->call_debug_id, __entry->seq, __print_symbolic(__entry->what, rxrpc_txbuf_traces), __entry->ref) ); TRACE_EVENT(rxrpc_tq, TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq, rxrpc_seq_t seq, enum rxrpc_tq_trace trace), TP_ARGS(call, tq, seq, trace), TP_STRUCT__entry( __field(unsigned int, call_debug_id) __field(rxrpc_seq_t, qbase) __field(rxrpc_seq_t, seq) __field(enum rxrpc_tq_trace, trace) ), TP_fast_assign( __entry->call_debug_id = call->debug_id; __entry->qbase = tq ? tq->qbase : call->tx_qbase; __entry->seq = seq; __entry->trace = trace; ), TP_printk("c=%08x bq=%08x q=%08x %s", __entry->call_debug_id, __entry->qbase, __entry->seq, __print_symbolic(__entry->trace, rxrpc_tq_traces)) ); TRACE_EVENT(rxrpc_poke_call, TP_PROTO(struct rxrpc_call *call, bool busy, enum rxrpc_call_poke_trace what), TP_ARGS(call, busy, what), TP_STRUCT__entry( __field(unsigned int, call_debug_id) __field(bool, busy) __field(enum rxrpc_call_poke_trace, what) ), TP_fast_assign( __entry->call_debug_id = call->debug_id; __entry->busy = busy; __entry->what = what; ), TP_printk("c=%08x %s%s", __entry->call_debug_id, __print_symbolic(__entry->what, rxrpc_call_poke_traces), __entry->busy ? "!" : "") ); TRACE_EVENT(rxrpc_call_poked, TP_PROTO(struct rxrpc_call *call), TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, call_debug_id) ), TP_fast_assign( __entry->call_debug_id = call->debug_id; ), TP_printk("c=%08x", __entry->call_debug_id) ); TRACE_EVENT(rxrpc_sack, TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, unsigned int sack, enum rxrpc_sack_trace what), TP_ARGS(call, seq, sack, what), TP_STRUCT__entry( __field(unsigned int, call_debug_id) __field(rxrpc_seq_t, seq) __field(unsigned int, sack) __field(enum rxrpc_sack_trace, what) ), TP_fast_assign( __entry->call_debug_id = call->debug_id; __entry->seq = seq; __entry->sack = sack; __entry->what = what; ), TP_printk("c=%08x q=%08x %s k=%x", __entry->call_debug_id, __entry->seq, __print_symbolic(__entry->what, rxrpc_sack_traces), __entry->sack) ); TRACE_EVENT(rxrpc_pmtud_tx, TP_PROTO(struct rxrpc_call *call), TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, peer_debug_id) __field(unsigned int, call_debug_id) __field(rxrpc_serial_t, ping_serial) __field(unsigned short, pmtud_trial) __field(unsigned short, pmtud_good) __field(unsigned short, pmtud_bad) ), TP_fast_assign( __entry->peer_debug_id = call->peer->debug_id; __entry->call_debug_id = call->debug_id; __entry->ping_serial = call->conn->pmtud_probe; __entry->pmtud_trial = call->peer->pmtud_trial; __entry->pmtud_good = call->peer->pmtud_good; __entry->pmtud_bad = call->peer->pmtud_bad; ), TP_printk("P=%08x c=%08x pr=%08x %u-%u-%u", __entry->peer_debug_id, __entry->call_debug_id, __entry->ping_serial, __entry->pmtud_good, __entry->pmtud_trial, __entry->pmtud_bad) ); TRACE_EVENT(rxrpc_pmtud_rx, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t resp_serial), TP_ARGS(conn, resp_serial), TP_STRUCT__entry( __field(unsigned int, peer_debug_id) __field(unsigned int, call_debug_id) __field(rxrpc_serial_t, ping_serial) __field(rxrpc_serial_t, resp_serial) __field(unsigned short, max_data) __field(u8, jumbo_max) ), TP_fast_assign( __entry->peer_debug_id = conn->peer->debug_id; __entry->call_debug_id = conn->pmtud_call; __entry->ping_serial = conn->pmtud_probe; __entry->resp_serial = resp_serial; __entry->max_data = conn->peer->max_data; __entry->jumbo_max = conn->peer->pmtud_jumbo; ), TP_printk("P=%08x c=%08x pr=%08x rr=%08x max=%u jm=%u", __entry->peer_debug_id, __entry->call_debug_id, __entry->ping_serial, __entry->resp_serial, __entry->max_data, __entry->jumbo_max) ); TRACE_EVENT(rxrpc_pmtud_lost, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t resp_serial), TP_ARGS(conn, resp_serial), TP_STRUCT__entry( __field(unsigned int, peer_debug_id) __field(unsigned int, call_debug_id) __field(rxrpc_serial_t, ping_serial) __field(rxrpc_serial_t, resp_serial) ), TP_fast_assign( __entry->peer_debug_id = conn->peer->debug_id; __entry->call_debug_id = conn->pmtud_call; __entry->ping_serial = conn->pmtud_probe; __entry->resp_serial = resp_serial; ), TP_printk("P=%08x c=%08x pr=%08x rr=%08x", __entry->peer_debug_id, __entry->call_debug_id, __entry->ping_serial, __entry->resp_serial) ); TRACE_EVENT(rxrpc_pmtud_reduce, TP_PROTO(struct rxrpc_peer *peer, rxrpc_serial_t serial, unsigned int max_data, enum rxrpc_pmtud_reduce_trace reason), TP_ARGS(peer, serial, max_data, reason), TP_STRUCT__entry( __field(unsigned int, peer_debug_id) __field(rxrpc_serial_t, serial) __field(unsigned int, max_data) __field(enum rxrpc_pmtud_reduce_trace, reason) ), TP_fast_assign( __entry->peer_debug_id = peer->debug_id; __entry->serial = serial; __entry->max_data = max_data; __entry->reason = reason; ), TP_printk("P=%08x %s r=%08x m=%u", __entry->peer_debug_id, __print_symbolic(__entry->reason, rxrpc_pmtud_reduce_traces), __entry->serial, __entry->max_data) ); TRACE_EVENT(rxrpc_rack, TP_PROTO(struct rxrpc_call *call, ktime_t timo), TP_ARGS(call, timo), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_serial_t, ack_serial) __field(rxrpc_seq_t, seq) __field(enum rxrpc_rack_timer_mode, mode) __field(unsigned short, nr_sent) __field(unsigned short, nr_lost) __field(unsigned short, nr_resent) __field(unsigned short, nr_sacked) __field(ktime_t, timo) ), TP_fast_assign( __entry->call = call->debug_id; __entry->ack_serial = call->rx_serial; __entry->seq = call->rack_end_seq; __entry->mode = call->rack_timer_mode; __entry->nr_sent = call->tx_nr_sent; __entry->nr_lost = call->tx_nr_lost; __entry->nr_resent = call->tx_nr_resent; __entry->nr_sacked = call->acks_nr_sacks; __entry->timo = timo; ), TP_printk("c=%08x r=%08x q=%08x %s slrs=%u,%u,%u,%u t=%lld", __entry->call, __entry->ack_serial, __entry->seq, __print_symbolic(__entry->mode, rxrpc_rack_timer_modes), __entry->nr_sent, __entry->nr_lost, __entry->nr_resent, __entry->nr_sacked, ktime_to_us(__entry->timo)) ); TRACE_EVENT(rxrpc_rack_update, TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary), TP_ARGS(call, summary), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_serial_t, ack_serial) __field(rxrpc_seq_t, seq) __field(int, xmit_ts) ), TP_fast_assign( __entry->call = call->debug_id; __entry->ack_serial = call->rx_serial; __entry->seq = call->rack_end_seq; __entry->xmit_ts = ktime_sub(call->acks_latest_ts, call->rack_xmit_ts); ), TP_printk("c=%08x r=%08x q=%08x xt=%lld", __entry->call, __entry->ack_serial, __entry->seq, ktime_to_us(__entry->xmit_ts)) ); TRACE_EVENT(rxrpc_rack_scan_loss, TP_PROTO(struct rxrpc_call *call), TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, call) __field(ktime_t, rack_rtt) __field(ktime_t, rack_reo_wnd) ), TP_fast_assign( __entry->call = call->debug_id; __entry->rack_rtt = call->rack_rtt; __entry->rack_reo_wnd = call->rack_reo_wnd; ), TP_printk("c=%08x rtt=%lld reow=%lld", __entry->call, ktime_to_us(__entry->rack_rtt), ktime_to_us(__entry->rack_reo_wnd)) ); TRACE_EVENT(rxrpc_rack_scan_loss_tq, TP_PROTO(struct rxrpc_call *call, const struct rxrpc_txqueue *tq, unsigned long nacks), TP_ARGS(call, tq, nacks), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_seq_t, qbase) __field(unsigned long, nacks) __field(unsigned long, lost) __field(unsigned long, retrans) ), TP_fast_assign( __entry->call = call->debug_id; __entry->qbase = tq->qbase; __entry->nacks = nacks; __entry->lost = tq->segment_lost; __entry->retrans = tq->segment_retransmitted; ), TP_printk("c=%08x q=%08x n=%lx l=%lx r=%lx", __entry->call, __entry->qbase, __entry->nacks, __entry->lost, __entry->retrans) ); TRACE_EVENT(rxrpc_rack_detect_loss, TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, rxrpc_seq_t seq), TP_ARGS(call, summary, seq), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_serial_t, ack_serial) __field(rxrpc_seq_t, seq) ), TP_fast_assign( __entry->call = call->debug_id; __entry->ack_serial = call->rx_serial; __entry->seq = seq; ), TP_printk("c=%08x r=%08x q=%08x", __entry->call, __entry->ack_serial, __entry->seq) ); TRACE_EVENT(rxrpc_rack_mark_loss_tq, TP_PROTO(struct rxrpc_call *call, const struct rxrpc_txqueue *tq), TP_ARGS(call, tq), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_seq_t, qbase) __field(rxrpc_seq_t, trans) __field(unsigned long, acked) __field(unsigned long, lost) __field(unsigned long, retrans) ), TP_fast_assign( __entry->call = call->debug_id; __entry->qbase = tq->qbase; __entry->trans = call->tx_transmitted; __entry->acked = tq->segment_acked; __entry->lost = tq->segment_lost; __entry->retrans = tq->segment_retransmitted; ), TP_printk("c=%08x tq=%08x txq=%08x a=%lx l=%lx r=%lx", __entry->call, __entry->qbase, __entry->trans, __entry->acked, __entry->lost, __entry->retrans) ); TRACE_EVENT(rxrpc_tlp_probe, TP_PROTO(struct rxrpc_call *call, enum rxrpc_tlp_probe_trace trace), TP_ARGS(call, trace), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_serial_t, serial) __field(rxrpc_seq_t, seq) __field(enum rxrpc_tlp_probe_trace, trace) ), TP_fast_assign( __entry->call = call->debug_id; __entry->serial = call->tlp_serial; __entry->seq = call->tlp_seq; __entry->trace = trace; ), TP_printk("c=%08x r=%08x pq=%08x %s", __entry->call, __entry->serial, __entry->seq, __print_symbolic(__entry->trace, rxrpc_tlp_probe_traces)) ); TRACE_EVENT(rxrpc_tlp_ack, TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, enum rxrpc_tlp_ack_trace trace), TP_ARGS(call, summary, trace), TP_STRUCT__entry( __field(unsigned int, call) __field(rxrpc_serial_t, serial) __field(rxrpc_seq_t, tlp_seq) __field(rxrpc_seq_t, hard_ack) __field(enum rxrpc_tlp_ack_trace, trace) ), TP_fast_assign( __entry->call = call->debug_id; __entry->serial = call->tlp_serial; __entry->tlp_seq = call->tlp_seq; __entry->hard_ack = call->acks_hard_ack; __entry->trace = trace; ), TP_printk("c=%08x r=%08x pq=%08x hq=%08x %s", __entry->call, __entry->serial, __entry->tlp_seq, __entry->hard_ack, __print_symbolic(__entry->trace, rxrpc_tlp_ack_traces)) ); TRACE_EVENT(rxrpc_rack_timer, TP_PROTO(struct rxrpc_call *call, ktime_t delay, bool exp), TP_ARGS(call, delay, exp), TP_STRUCT__entry( __field(unsigned int, call) __field(bool, exp) __field(enum rxrpc_rack_timer_mode, mode) __field(ktime_t, delay) ), TP_fast_assign( __entry->call = call->debug_id; __entry->exp = exp; __entry->mode = call->rack_timer_mode; __entry->delay = delay; ), TP_printk("c=%08x %s %s to=%lld", __entry->call, __entry->exp ? "Exp" : "Set", __print_symbolic(__entry->mode, rxrpc_rack_timer_modes), ktime_to_us(__entry->delay)) ); TRACE_EVENT(rxrpc_rxgk_rekey, TP_PROTO(struct rxrpc_connection *conn, unsigned int current_key, unsigned int requested_key), TP_ARGS(conn, current_key, requested_key), TP_STRUCT__entry( __field(unsigned int, conn) __field(unsigned int, current_key) __field(unsigned int, requested_key) ), TP_fast_assign( __entry->conn = conn->debug_id; __entry->current_key = current_key; __entry->requested_key = requested_key; ), TP_printk("C=%08x cur=%x req=%x", __entry->conn, __entry->current_key, __entry->requested_key) ); #undef EM #undef E_ #endif /* RXRPC_TRACE_ONLY_DEFINE_ENUMS */ #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 /* SPDX-License-Identifier: GPL-2.0 */ /* * Management Component Transport Protocol (MCTP) * * Copyright (c) 2021 Code Construct * Copyright (c) 2021 Google */ #ifndef __NET_MCTP_H #define __NET_MCTP_H #include <linux/bits.h> #include <linux/mctp.h> #include <linux/netdevice.h> #include <net/net_namespace.h> #include <net/sock.h> /* MCTP packet definitions */ struct mctp_hdr { u8 ver; u8 dest; u8 src; u8 flags_seq_tag; }; #define MCTP_VER_MIN 1 #define MCTP_VER_MAX 1 /* Definitions for flags_seq_tag field */ #define MCTP_HDR_FLAG_SOM BIT(7) #define MCTP_HDR_FLAG_EOM BIT(6) #define MCTP_HDR_FLAG_TO BIT(3) #define MCTP_HDR_FLAGS GENMASK(5, 3) #define MCTP_HDR_SEQ_SHIFT 4 #define MCTP_HDR_SEQ_MASK GENMASK(1, 0) #define MCTP_HDR_TAG_SHIFT 0 #define MCTP_HDR_TAG_MASK GENMASK(2, 0) #define MCTP_INITIAL_DEFAULT_NET 1 static inline bool mctp_address_unicast(mctp_eid_t eid) { return eid >= 8 && eid < 255; } static inline bool mctp_address_broadcast(mctp_eid_t eid) { return eid == 255; } static inline bool mctp_address_null(mctp_eid_t eid) { return eid == 0; } static inline bool mctp_address_matches(mctp_eid_t match, mctp_eid_t eid) { return match == eid || match == MCTP_ADDR_ANY; } static inline struct mctp_hdr *mctp_hdr(struct sk_buff *skb) { return (struct mctp_hdr *)skb_network_header(skb); } /* socket implementation */ struct mctp_sock { struct sock sk; /* bind() params */ unsigned int bind_net; mctp_eid_t bind_local_addr; mctp_eid_t bind_peer_addr; unsigned int bind_peer_net; bool bind_peer_set; __u8 bind_type; /* sendmsg()/recvmsg() uses struct sockaddr_mctp_ext */ bool addr_ext; /* list of mctp_sk_key, for incoming tag lookup. updates protected * by sk->net->keys_lock */ struct hlist_head keys; /* mechanism for expiring allocated keys; will release an allocated * tag, and any netdev state for a request/response pairing */ struct timer_list key_expiry; }; /* Key for matching incoming packets to sockets or reassembly contexts. * Packets are matched on (peer EID, local EID, tag). * * Lifetime / locking requirements: * * - individual key data (ie, the struct itself) is protected by key->lock; * changes must be made with that lock held. * * - the lookup fields: peer_addr, local_addr and tag are set before the * key is added to lookup lists, and never updated. * * - A ref to the key must be held (throuh key->refs) if a pointer to the * key is to be accessed after key->lock is released. * * - a mctp_sk_key contains a reference to a struct sock; this is valid * for the life of the key. On sock destruction (through unhash), the key is * removed from lists (see below), and marked invalid. * * - these mctp_sk_keys appear on two lists: * 1) the struct mctp_sock->keys list * 2) the struct netns_mctp->keys list * * presences on these lists requires a (single) refcount to be held; both * lists are updated as a single operation. * * Updates and lookups in either list are performed under the * netns_mctp->keys lock. Lookup functions will need to lock the key and * take a reference before unlocking the keys_lock. Consequently, the list's * keys_lock *cannot* be acquired with the individual key->lock held. * * - a key may have a sk_buff attached as part of an in-progress message * reassembly (->reasm_head). The reasm data is protected by the individual * key->lock. * * - there are two destruction paths for a mctp_sk_key: * * - through socket unhash (see mctp_sk_unhash). This performs the list * removal under keys_lock. * * - where a key is established to receive a reply message: after receiving * the (complete) reply, or during reassembly errors. Here, we clean up * the reassembly context (marking reasm_dead, to prevent another from * starting), and remove the socket from the netns & socket lists. * * - through an expiry timeout, on a per-socket timer */ struct mctp_sk_key { unsigned int net; mctp_eid_t peer_addr; mctp_eid_t local_addr; /* MCTP_ADDR_ANY for local owned tags */ __u8 tag; /* incoming tag match; invert TO for local */ /* we hold a ref to sk when set */ struct sock *sk; /* routing lookup list */ struct hlist_node hlist; /* per-socket list */ struct hlist_node sklist; /* lock protects against concurrent updates to the reassembly and * expiry data below. */ spinlock_t lock; /* Keys are referenced during the output path, which may sleep */ refcount_t refs; /* incoming fragment reassembly context */ struct sk_buff *reasm_head; struct sk_buff **reasm_tailp; bool reasm_dead; u8 last_seq; /* key validity */ bool valid; /* expiry timeout; valid (above) cleared on expiry */ unsigned long expiry; /* free to use for device flow state tracking. Initialised to * zero on initial key creation */ unsigned long dev_flow_state; struct mctp_dev *dev; /* a tag allocated with SIOCMCTPALLOCTAG ioctl will not expire * automatically on timeout or response, instead SIOCMCTPDROPTAG * is used. */ bool manual_alloc; }; struct mctp_skb_cb { unsigned int magic; unsigned int net; /* fields below provide extended addressing for ingress to recvmsg() */ int ifindex; unsigned char halen; unsigned char haddr[MAX_ADDR_LEN]; }; /* skb control-block accessors with a little extra debugging for initial * development. * * TODO: remove checks & mctp_skb_cb->magic; replace callers of __mctp_cb * with mctp_cb(). * * __mctp_cb() is only for the initial ingress code; we should see ->magic set * at all times after this. */ static inline struct mctp_skb_cb *__mctp_cb(struct sk_buff *skb) { struct mctp_skb_cb *cb = (void *)skb->cb; cb->magic = 0x4d435450; return cb; } static inline struct mctp_skb_cb *mctp_cb(struct sk_buff *skb) { struct mctp_skb_cb *cb = (void *)skb->cb; BUILD_BUG_ON(sizeof(struct mctp_skb_cb) > sizeof(skb->cb)); WARN_ON(cb->magic != 0x4d435450); return cb; } /* If CONFIG_MCTP_FLOWS, we may add one of these as a SKB extension, * indicating the flow to the device driver. */ struct mctp_flow { struct mctp_sk_key *key; }; struct mctp_dst; /* Route definition. * * These are held in the pernet->mctp.routes list, with RCU protection for * removed routes. We hold a reference to the netdev; routes need to be * dropped on NETDEV_UNREGISTER events. * * Updates to the route table are performed under rtnl; all reads under RCU, * so routes cannot be referenced over a RCU grace period. */ struct mctp_route { mctp_eid_t min, max; unsigned char type; unsigned int mtu; enum { MCTP_ROUTE_DIRECT, MCTP_ROUTE_GATEWAY, } dst_type; union { struct mctp_dev *dev; struct mctp_fq_addr gateway; }; int (*output)(struct mctp_dst *dst, struct sk_buff *skb); struct list_head list; refcount_t refs; struct rcu_head rcu; }; /* Route lookup result: dst. Represents the results of a routing decision, * but is only held over the individual routing operation. * * Will typically be stored on the caller stack, and must be released after * usage. */ struct mctp_dst { struct mctp_dev *dev; unsigned int mtu; mctp_eid_t nexthop; /* set for direct addressing */ unsigned char halen; unsigned char haddr[MAX_ADDR_LEN]; int (*output)(struct mctp_dst *dst, struct sk_buff *skb); }; int mctp_dst_from_extaddr(struct mctp_dst *dst, struct net *net, int ifindex, unsigned char halen, const unsigned char *haddr); /* route interfaces */ int mctp_route_lookup(struct net *net, unsigned int dnet, mctp_eid_t daddr, struct mctp_dst *dst); void mctp_dst_release(struct mctp_dst *dst); /* always takes ownership of skb */ int mctp_local_output(struct sock *sk, struct mctp_dst *dst, struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag); void mctp_key_unref(struct mctp_sk_key *key); struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, unsigned int netid, mctp_eid_t local, mctp_eid_t peer, bool manual, u8 *tagp); /* routing <--> device interface */ unsigned int mctp_default_net(struct net *net); int mctp_default_net_set(struct net *net, unsigned int index); int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr); int mctp_route_remove_local(struct mctp_dev *mdev, mctp_eid_t addr); void mctp_route_remove_dev(struct mctp_dev *mdev); /* neighbour definitions */ enum mctp_neigh_source { MCTP_NEIGH_STATIC, MCTP_NEIGH_DISCOVER, }; struct mctp_neigh { struct mctp_dev *dev; mctp_eid_t eid; enum mctp_neigh_source source; unsigned char ha[MAX_ADDR_LEN]; struct list_head list; struct rcu_head rcu; }; int mctp_neigh_init(void); void mctp_neigh_exit(void); // ret_hwaddr may be NULL, otherwise must have space for MAX_ADDR_LEN int mctp_neigh_lookup(struct mctp_dev *dev, mctp_eid_t eid, void *ret_hwaddr); void mctp_neigh_remove_dev(struct mctp_dev *mdev); int mctp_routes_init(void); void mctp_routes_exit(void); int mctp_device_init(void); void mctp_device_exit(void); /* MCTP IDs and Codes from DMTF specification * "DSP0239 Management Component Transport Protocol (MCTP) IDs and Codes" * https://www.dmtf.org/sites/default/files/standards/documents/DSP0239_1.11.1.pdf */ enum mctp_phys_binding { MCTP_PHYS_BINDING_UNSPEC = 0x00, MCTP_PHYS_BINDING_SMBUS = 0x01, MCTP_PHYS_BINDING_PCIE_VDM = 0x02, MCTP_PHYS_BINDING_USB = 0x03, MCTP_PHYS_BINDING_KCS = 0x04, MCTP_PHYS_BINDING_SERIAL = 0x05, MCTP_PHYS_BINDING_I3C = 0x06, MCTP_PHYS_BINDING_MMBI = 0x07, MCTP_PHYS_BINDING_PCC = 0x08, MCTP_PHYS_BINDING_UCIE = 0x09, MCTP_PHYS_BINDING_VENDOR = 0xFF, }; #endif /* __NET_MCTP_H */
13279 13296 11611 11624 831 831 11 11 1174 11 11 11 562 378 377 562 561 11 560 558 550 9 560 557 11 561 562 562 562 9 11 11 561 558 562 559 11 561 602 601 602 591 567 44 589 592 11 589 11 32 32 589 591 11 591 590 24 242 577 4 577 591 587 258 259 136 256 3 3 588 577 59 59 588 404 1003 30 1159 1163 205 1161 14 1161 54 55 1162 975 158 104 1163 977 2 832 832 810 232 232 231 2 2 1 2 832 830 121 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 // SPDX-License-Identifier: GPL-2.0-only /* * Resizable, Scalable, Concurrent Hash Table * * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch> * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> * * Code partially derived from nft_hash * Rewritten with rehash code from br_multicast plus single list * pointer as suggested by Josh Triplett */ #include <linux/atomic.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/log2.h> #include <linux/sched.h> #include <linux/rculist.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/jhash.h> #include <linux/random.h> #include <linux/rhashtable.h> #include <linux/err.h> #include <linux/export.h> #define HASH_DEFAULT_SIZE 64UL #define HASH_MIN_SIZE 4U union nested_table { union nested_table __rcu *table; struct rhash_lock_head __rcu *bucket; }; static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he) { return rht_head_hashfn(ht, tbl, he, ht->p); } #ifdef CONFIG_PROVE_LOCKING #define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) int lockdep_rht_mutex_is_held(struct rhashtable *ht) { return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1; } EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) { if (!debug_locks) return 1; if (unlikely(tbl->nest)) return 1; return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]); } EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); #else #define ASSERT_RHT_MUTEX(HT) #endif static inline union nested_table *nested_table_top( const struct bucket_table *tbl) { /* The top-level bucket entry does not need RCU protection * because it's set at the same time as tbl->nest. */ return (void *)rcu_dereference_protected(tbl->buckets[0], 1); } static void nested_table_free(union nested_table *ntbl, unsigned int size) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); const unsigned int len = 1 << shift; unsigned int i; ntbl = rcu_dereference_protected(ntbl->table, 1); if (!ntbl) return; if (size > len) { size >>= shift; for (i = 0; i < len; i++) nested_table_free(ntbl + i, size); } kfree(ntbl); } static void nested_bucket_table_free(const struct bucket_table *tbl) { unsigned int size = tbl->size >> tbl->nest; unsigned int len = 1 << tbl->nest; union nested_table *ntbl; unsigned int i; ntbl = nested_table_top(tbl); for (i = 0; i < len; i++) nested_table_free(ntbl + i, size); kfree(ntbl); } static void bucket_table_free(const struct bucket_table *tbl) { if (tbl->nest) nested_bucket_table_free(tbl); kvfree(tbl); } static void bucket_table_free_rcu(struct rcu_head *head) { bucket_table_free(container_of(head, struct bucket_table, rcu)); } static union nested_table *nested_table_alloc(struct rhashtable *ht, union nested_table __rcu **prev, bool leaf) { union nested_table *ntbl; int i; ntbl = rcu_dereference(*prev); if (ntbl) return ntbl; ntbl = alloc_hooks_tag(ht->alloc_tag, kmalloc_noprof(PAGE_SIZE, GFP_ATOMIC|__GFP_ZERO)); if (ntbl && leaf) { for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++) INIT_RHT_NULLS_HEAD(ntbl[i].bucket); } if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL) return ntbl; /* Raced with another thread. */ kfree(ntbl); return rcu_dereference(*prev); } static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, size_t nbuckets, gfp_t gfp) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); struct bucket_table *tbl; size_t size; if (nbuckets < (1 << (shift + 1))) return NULL; size = sizeof(*tbl) + sizeof(tbl->buckets[0]); tbl = alloc_hooks_tag(ht->alloc_tag, kmalloc_noprof(size, gfp|__GFP_ZERO)); if (!tbl) return NULL; if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets, false)) { kfree(tbl); return NULL; } tbl->nest = (ilog2(nbuckets) - 1) % shift + 1; return tbl; } static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size_t nbuckets, gfp_t gfp) { struct bucket_table *tbl = NULL; size_t size; int i; static struct lock_class_key __key; tbl = alloc_hooks_tag(ht->alloc_tag, kvmalloc_node_align_noprof(struct_size(tbl, buckets, nbuckets), 1, gfp|__GFP_ZERO, NUMA_NO_NODE)); size = nbuckets; if (tbl == NULL && !gfpflags_allow_blocking(gfp)) { tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); nbuckets = 0; } if (tbl == NULL) return NULL; lockdep_init_map(&tbl->dep_map, "rhashtable_bucket", &__key, 0); tbl->size = size; rcu_head_init(&tbl->rcu); INIT_LIST_HEAD(&tbl->walkers); tbl->hash_rnd = get_random_u32(); for (i = 0; i < nbuckets; i++) INIT_RHT_NULLS_HEAD(tbl->buckets[i]); return tbl; } static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, struct bucket_table *tbl) { struct bucket_table *new_tbl; do { new_tbl = tbl; tbl = rht_dereference_rcu(tbl->future_tbl, ht); } while (tbl); return new_tbl; } static int rhashtable_rehash_one(struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl); int err = -EAGAIN; struct rhash_head *head, *next, *entry; struct rhash_head __rcu **pprev = NULL; unsigned int new_hash; unsigned long flags; if (new_tbl->nest) goto out; err = -ENOENT; rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash), old_tbl, old_hash) { err = 0; next = rht_dereference_bucket(entry->next, old_tbl, old_hash); if (rht_is_a_nulls(next)) break; pprev = &entry->next; } if (err) goto out; new_hash = head_hashfn(ht, new_tbl, entry); flags = rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash], SINGLE_DEPTH_NESTING); head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash); RCU_INIT_POINTER(entry->next, head); rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry, flags); if (pprev) rcu_assign_pointer(*pprev, next); else /* Need to preserved the bit lock. */ rht_assign_locked(bkt, next); out: return err; } static int rhashtable_rehash_chain(struct rhashtable *ht, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash); unsigned long flags; int err; if (!bkt) return 0; flags = rht_lock(old_tbl, bkt); while (!(err = rhashtable_rehash_one(ht, bkt, old_hash))) ; if (err == -ENOENT) err = 0; rht_unlock(old_tbl, bkt, flags); return err; } static int rhashtable_rehash_attach(struct rhashtable *ht, struct bucket_table *old_tbl, struct bucket_table *new_tbl) { /* Make insertions go into the new, empty table right away. Deletions * and lookups will be attempted in both tables until we synchronize. * As cmpxchg() provides strong barriers, we do not need * rcu_assign_pointer(). */ if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL, new_tbl) != NULL) return -EEXIST; return 0; } static int rhashtable_rehash_table(struct rhashtable *ht) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct bucket_table *new_tbl; struct rhashtable_walker *walker; unsigned int old_hash; int err; new_tbl = rht_dereference(old_tbl->future_tbl, ht); if (!new_tbl) return 0; for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { err = rhashtable_rehash_chain(ht, old_hash); if (err) return err; cond_resched(); } /* Publish the new table pointer. */ rcu_assign_pointer(ht->tbl, new_tbl); spin_lock(&ht->lock); list_for_each_entry(walker, &old_tbl->walkers, list) walker->tbl = NULL; /* Wait for readers. All new readers will see the new * table, and thus no references to the old table will * remain. * We do this inside the locked region so that * rhashtable_walk_stop() can use rcu_head_after_call_rcu() * to check if it should not re-link the table. */ call_rcu(&old_tbl->rcu, bucket_table_free_rcu); spin_unlock(&ht->lock); return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0; } static int rhashtable_rehash_alloc(struct rhashtable *ht, struct bucket_table *old_tbl, unsigned int size) __must_hold(&ht->mutex) { struct bucket_table *new_tbl; int err; ASSERT_RHT_MUTEX(ht); new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (new_tbl == NULL) return -ENOMEM; err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); if (err) bucket_table_free(new_tbl); return err; } /** * rhashtable_shrink - Shrink hash table while allowing concurrent lookups * @ht: the hash table to shrink * * This function shrinks the hash table to fit, i.e., the smallest * size would not cause it to expand right away automatically. * * The caller must ensure that no concurrent resizing occurs by holding * ht->mutex. * * The caller must ensure that no concurrent table mutations take place. * It is however valid to have concurrent lookups if they are RCU protected. * * It is valid to have concurrent insertions and deletions protected by per * bucket locks or concurrent RCU protected lookups and traversals. */ static int rhashtable_shrink(struct rhashtable *ht) __must_hold(&ht->mutex) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); unsigned int nelems = atomic_read(&ht->nelems); unsigned int size = 0; if (nelems) size = roundup_pow_of_two(nelems * 3 / 2); if (size < ht->p.min_size) size = ht->p.min_size; if (old_tbl->size <= size) return 0; if (rht_dereference(old_tbl->future_tbl, ht)) return -EEXIST; return rhashtable_rehash_alloc(ht, old_tbl, size); } static void rht_deferred_worker(struct work_struct *work) { struct rhashtable *ht; struct bucket_table *tbl; int err = 0; ht = container_of(work, struct rhashtable, run_work); mutex_lock(&ht->mutex); tbl = rht_dereference(ht->tbl, ht); tbl = rhashtable_last_table(ht, tbl); if (rht_grow_above_75(ht, tbl)) err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2); else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) err = rhashtable_shrink(ht); else if (tbl->nest) err = rhashtable_rehash_alloc(ht, tbl, tbl->size); if (!err || err == -EEXIST) { int nerr; nerr = rhashtable_rehash_table(ht); err = err ?: nerr; } mutex_unlock(&ht->mutex); if (err) schedule_work(&ht->run_work); } static int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl) { struct bucket_table *old_tbl; struct bucket_table *new_tbl; unsigned int size; int err; old_tbl = rht_dereference_rcu(ht->tbl, ht); size = tbl->size; err = -EBUSY; if (rht_grow_above_75(ht, tbl)) size *= 2; /* Do not schedule more than one rehash */ else if (old_tbl != tbl) goto fail; err = -ENOMEM; new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN); if (new_tbl == NULL) goto fail; err = rhashtable_rehash_attach(ht, tbl, new_tbl); if (err) { bucket_table_free(new_tbl); if (err == -EEXIST) err = 0; } else schedule_work(&ht->run_work); return err; fail: /* Do not fail the insert if someone else did a rehash. */ if (likely(rcu_access_pointer(tbl->future_tbl))) return 0; /* Schedule async rehash to retry allocation in process context. */ if (err == -ENOMEM) schedule_work(&ht->run_work); return err; } static void *rhashtable_lookup_one(struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, struct bucket_table *tbl, unsigned int hash, const void *key, struct rhash_head *obj) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_head __rcu **pprev = NULL; struct rhash_head *head; int elasticity; elasticity = RHT_ELASTICITY; rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *list; struct rhlist_head *plist; elasticity--; if (!key || (ht->p.obj_cmpfn ? ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : rhashtable_compare(&arg, rht_obj(ht, head)))) { pprev = &head->next; continue; } if (!ht->rhlist) return rht_obj(ht, head); list = container_of(obj, struct rhlist_head, rhead); plist = container_of(head, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, plist); head = rht_dereference_bucket(head->next, tbl, hash); RCU_INIT_POINTER(list->rhead.next, head); if (pprev) rcu_assign_pointer(*pprev, obj); else /* Need to preserve the bit lock */ rht_assign_locked(bkt, obj); return NULL; } if (elasticity <= 0) return ERR_PTR(-EAGAIN); return ERR_PTR(-ENOENT); } static struct bucket_table *rhashtable_insert_one( struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj, void *data) { struct bucket_table *new_tbl; struct rhash_head *head; if (!IS_ERR_OR_NULL(data)) return ERR_PTR(-EEXIST); if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) return ERR_CAST(data); new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (new_tbl) return new_tbl; if (PTR_ERR(data) != -ENOENT) return ERR_CAST(data); if (unlikely(rht_grow_above_max(ht, tbl))) return ERR_PTR(-E2BIG); if (unlikely(rht_grow_above_100(ht, tbl))) return ERR_PTR(-EAGAIN); head = rht_ptr(bkt, tbl, hash); RCU_INIT_POINTER(obj->next, head); if (ht->rhlist) { struct rhlist_head *list; list = container_of(obj, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, NULL); } /* bkt is always the head of the list, so it holds * the lock, which we need to preserve */ rht_assign_locked(bkt, obj); return NULL; } static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, struct rhash_head *obj) { struct bucket_table *new_tbl; struct bucket_table *tbl; struct rhash_lock_head __rcu **bkt; unsigned long flags; unsigned int hash; void *data; new_tbl = rcu_dereference(ht->tbl); do { tbl = new_tbl; hash = rht_head_hashfn(ht, tbl, obj, ht->p); if (rcu_access_pointer(tbl->future_tbl)) /* Failure is OK */ bkt = rht_bucket_var(tbl, hash); else bkt = rht_bucket_insert(ht, tbl, hash); if (bkt == NULL) { new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); data = ERR_PTR(-EAGAIN); } else { bool inserted; flags = rht_lock(tbl, bkt); data = rhashtable_lookup_one(ht, bkt, tbl, hash, key, obj); new_tbl = rhashtable_insert_one(ht, bkt, tbl, hash, obj, data); inserted = data && !new_tbl; if (inserted) atomic_inc(&ht->nelems); if (PTR_ERR(new_tbl) != -EEXIST) data = ERR_CAST(new_tbl); rht_unlock(tbl, bkt, flags); if (inserted && rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); } } while (!IS_ERR_OR_NULL(new_tbl)); if (PTR_ERR(data) == -EAGAIN) data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: -EAGAIN); return data; } void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj) { void *data; do { rcu_read_lock(); data = rhashtable_try_insert(ht, key, obj); rcu_read_unlock(); } while (PTR_ERR(data) == -EAGAIN); return data; } EXPORT_SYMBOL_GPL(rhashtable_insert_slow); /** * rhashtable_walk_enter - Initialise an iterator * @ht: Table to walk over * @iter: Hash table Iterator * * This function prepares a hash table walk. * * Note that if you restart a walk after rhashtable_walk_stop you * may see the same object twice. Also, you may miss objects if * there are removals in between rhashtable_walk_stop and the next * call to rhashtable_walk_start. * * For a completely stable walk you should construct your own data * structure outside the hash table. * * This function may be called from any process context, including * non-preemptible context, but cannot be called from softirq or * hardirq context. * * You must call rhashtable_walk_exit after this function returns. */ void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter) { iter->ht = ht; iter->p = NULL; iter->slot = 0; iter->skip = 0; iter->end_of_table = 0; spin_lock(&ht->lock); iter->walker.tbl = rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); list_add(&iter->walker.list, &iter->walker.tbl->walkers); spin_unlock(&ht->lock); } EXPORT_SYMBOL_GPL(rhashtable_walk_enter); /** * rhashtable_walk_exit - Free an iterator * @iter: Hash table Iterator * * This function frees resources allocated by rhashtable_walk_enter. */ void rhashtable_walk_exit(struct rhashtable_iter *iter) { spin_lock(&iter->ht->lock); if (iter->walker.tbl) list_del(&iter->walker.list); spin_unlock(&iter->ht->lock); } EXPORT_SYMBOL_GPL(rhashtable_walk_exit); /** * rhashtable_walk_start_check - Start a hash table walk * @iter: Hash table iterator * * Start a hash table walk at the current iterator position. Note that we take * the RCU lock in all cases including when we return an error. So you must * always call rhashtable_walk_stop to clean up. * * Returns zero if successful. * * Returns -EAGAIN if resize event occurred. Note that the iterator * will rewind back to the beginning and you may use it immediately * by calling rhashtable_walk_next. * * rhashtable_walk_start is defined as an inline variant that returns * void. This is preferred in cases where the caller would ignore * resize events and always continue. */ int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires_shared(RCU) { struct rhashtable *ht = iter->ht; bool rhlist = ht->rhlist; rcu_read_lock(); spin_lock(&ht->lock); if (iter->walker.tbl) list_del(&iter->walker.list); spin_unlock(&ht->lock); if (iter->end_of_table) return 0; if (!iter->walker.tbl) { iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht); iter->slot = 0; iter->skip = 0; return -EAGAIN; } if (iter->p && !rhlist) { /* * We need to validate that 'p' is still in the table, and * if so, update 'skip' */ struct rhash_head *p; int skip = 0; rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { skip++; if (p == iter->p) { iter->skip = skip; goto found; } } iter->p = NULL; } else if (iter->p && rhlist) { /* Need to validate that 'list' is still in the table, and * if so, update 'skip' and 'p'. */ struct rhash_head *p; struct rhlist_head *list; int skip = 0; rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { for (list = container_of(p, struct rhlist_head, rhead); list; list = rcu_dereference(list->next)) { skip++; if (list == iter->list) { iter->p = p; iter->skip = skip; goto found; } } } iter->p = NULL; } found: return 0; } EXPORT_SYMBOL_GPL(rhashtable_walk_start_check); /** * __rhashtable_walk_find_next - Find the next element in a table (or the first * one in case of a new walk). * * @iter: Hash table iterator * * Returns the found object or NULL when the end of the table is reached. * * Returns -EAGAIN if resize event occurred. */ static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter) { struct bucket_table *tbl = iter->walker.tbl; struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; bool rhlist = ht->rhlist; if (!tbl) return NULL; for (; iter->slot < tbl->size; iter->slot++) { int skip = iter->skip; rht_for_each_rcu(p, tbl, iter->slot) { if (rhlist) { list = container_of(p, struct rhlist_head, rhead); do { if (!skip) goto next; skip--; list = rcu_dereference(list->next); } while (list); continue; } if (!skip) break; skip--; } next: if (!rht_is_a_nulls(p)) { iter->skip++; iter->p = p; iter->list = list; return rht_obj(ht, rhlist ? &list->rhead : p); } iter->skip = 0; } iter->p = NULL; /* Ensure we see any new tables. */ smp_rmb(); iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (iter->walker.tbl) { iter->slot = 0; iter->skip = 0; return ERR_PTR(-EAGAIN); } else { iter->end_of_table = true; } return NULL; } /** * rhashtable_walk_next - Return the next object and advance the iterator * @iter: Hash table iterator * * Note that you must call rhashtable_walk_stop when you are finished * with the walk. * * Returns the next object or NULL when the end of the table is reached. * * Returns -EAGAIN if resize event occurred. Note that the iterator * will rewind back to the beginning and you may continue to use it. */ void *rhashtable_walk_next(struct rhashtable_iter *iter) { struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; bool rhlist = ht->rhlist; if (p) { if (!rhlist || !(list = rcu_dereference(list->next))) { p = rcu_dereference(p->next); list = container_of(p, struct rhlist_head, rhead); } if (!rht_is_a_nulls(p)) { iter->skip++; iter->p = p; iter->list = list; return rht_obj(ht, rhlist ? &list->rhead : p); } /* At the end of this slot, switch to next one and then find * next entry from that point. */ iter->skip = 0; iter->slot++; } return __rhashtable_walk_find_next(iter); } EXPORT_SYMBOL_GPL(rhashtable_walk_next); /** * rhashtable_walk_peek - Return the next object but don't advance the iterator * @iter: Hash table iterator * * Returns the next object or NULL when the end of the table is reached. * * Returns -EAGAIN if resize event occurred. Note that the iterator * will rewind back to the beginning and you may continue to use it. */ void *rhashtable_walk_peek(struct rhashtable_iter *iter) { struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; if (p) return rht_obj(ht, ht->rhlist ? &list->rhead : p); /* No object found in current iter, find next one in the table. */ if (iter->skip) { /* A nonzero skip value points to the next entry in the table * beyond that last one that was found. Decrement skip so * we find the current value. __rhashtable_walk_find_next * will restore the original value of skip assuming that * the table hasn't changed. */ iter->skip--; } return __rhashtable_walk_find_next(iter); } EXPORT_SYMBOL_GPL(rhashtable_walk_peek); /** * rhashtable_walk_stop - Finish a hash table walk * @iter: Hash table iterator * * Finish a hash table walk. Does not reset the iterator to the start of the * hash table. */ void rhashtable_walk_stop(struct rhashtable_iter *iter) { struct rhashtable *ht; struct bucket_table *tbl = iter->walker.tbl; if (!tbl) goto out; ht = iter->ht; spin_lock(&ht->lock); if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu)) /* This bucket table is being freed, don't re-link it. */ iter->walker.tbl = NULL; else list_add(&iter->walker.list, &tbl->walkers); spin_unlock(&ht->lock); out: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rhashtable_walk_stop); static size_t rounded_hashtable_size(const struct rhashtable_params *params) { size_t retsize; if (params->nelem_hint) retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3), (unsigned long)params->min_size); else retsize = max(HASH_DEFAULT_SIZE, (unsigned long)params->min_size); return retsize; } static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) { return jhash2(key, length, seed); } /** * rhashtable_init - initialize a new hash table * @ht: hash table to be initialized * @params: configuration parameters * * Initializes a new hash table based on the provided configuration * parameters. A table can be configured either with a variable or * fixed length key: * * Configuration Example 1: Fixed length keys * struct test_obj { * int key; * void * my_member; * struct rhash_head node; * }; * * struct rhashtable_params params = { * .head_offset = offsetof(struct test_obj, node), * .key_offset = offsetof(struct test_obj, key), * .key_len = sizeof(int), * .hashfn = jhash, * }; * * Configuration Example 2: Variable length keys * struct test_obj { * [...] * struct rhash_head node; * }; * * u32 my_hash_fn(const void *data, u32 len, u32 seed) * { * struct test_obj *obj = data; * * return [... hash ...]; * } * * struct rhashtable_params params = { * .head_offset = offsetof(struct test_obj, node), * .hashfn = jhash, * .obj_hashfn = my_hash_fn, * }; */ int rhashtable_init_noprof(struct rhashtable *ht, const struct rhashtable_params *params) { struct bucket_table *tbl; size_t size; if ((!params->key_len && !params->obj_hashfn) || (params->obj_hashfn && !params->obj_cmpfn)) return -EINVAL; memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); spin_lock_init(&ht->lock); memcpy(&ht->p, params, sizeof(*params)); alloc_tag_record(ht->alloc_tag); if (params->min_size) ht->p.min_size = roundup_pow_of_two(params->min_size); /* Cap total entries at 2^31 to avoid nelems overflow. */ ht->max_elems = 1u << 31; if (params->max_size) { ht->p.max_size = rounddown_pow_of_two(params->max_size); if (ht->p.max_size < ht->max_elems / 2) ht->max_elems = ht->p.max_size * 2; } ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); size = rounded_hashtable_size(&ht->p); ht->key_len = ht->p.key_len; if (!params->hashfn) { ht->p.hashfn = jhash; if (!(ht->key_len & (sizeof(u32) - 1))) { ht->key_len /= sizeof(u32); ht->p.hashfn = rhashtable_jhash2; } } /* * This is api initialization and thus we need to guarantee the * initial rhashtable allocation. Upon failure, retry with the * smallest possible size with __GFP_NOFAIL semantics. */ tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (unlikely(tbl == NULL)) { size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL); } atomic_set(&ht->nelems, 0); RCU_INIT_POINTER(ht->tbl, tbl); INIT_WORK(&ht->run_work, rht_deferred_worker); return 0; } EXPORT_SYMBOL_GPL(rhashtable_init_noprof); /** * rhltable_init - initialize a new hash list table * @hlt: hash list table to be initialized * @params: configuration parameters * * Initializes a new hash list table. * * See documentation for rhashtable_init. */ int rhltable_init_noprof(struct rhltable *hlt, const struct rhashtable_params *params) { int err; err = rhashtable_init_noprof(&hlt->ht, params); hlt->ht.rhlist = true; return err; } EXPORT_SYMBOL_GPL(rhltable_init_noprof); static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, void (*free_fn)(void *ptr, void *arg), void *arg) { struct rhlist_head *list; if (!ht->rhlist) { free_fn(rht_obj(ht, obj), arg); return; } list = container_of(obj, struct rhlist_head, rhead); do { obj = &list->rhead; list = rht_dereference(list->next, ht); free_fn(rht_obj(ht, obj), arg); } while (list); } /** * rhashtable_free_and_destroy - free elements and destroy hash table * @ht: the hash table to destroy * @free_fn: callback to release resources of element * @arg: pointer passed to free_fn * * Stops an eventual async resize. If defined, invokes free_fn for each * element to releasal resources. Please note that RCU protected * readers may still be accessing the elements. Releasing of resources * must occur in a compatible manner. Then frees the bucket array. * * This function will eventually sleep to wait for an async resize * to complete. The caller is responsible that no further write operations * occurs in parallel. */ void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), void *arg) { struct bucket_table *tbl, *next_tbl; unsigned int i; cancel_work_sync(&ht->run_work); mutex_lock(&ht->mutex); tbl = rht_dereference(ht->tbl, ht); restart: if (free_fn) { for (i = 0; i < tbl->size; i++) { struct rhash_head *pos, *next; cond_resched(); for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)), next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL; !rht_is_a_nulls(pos); pos = next, next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL) rhashtable_free_one(ht, pos, free_fn, arg); } } next_tbl = rht_dereference(tbl->future_tbl, ht); bucket_table_free(tbl); if (next_tbl) { tbl = next_tbl; goto restart; } mutex_unlock(&ht->mutex); } EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy); void rhashtable_destroy(struct rhashtable *ht) { return rhashtable_free_and_destroy(ht, NULL, NULL); } EXPORT_SYMBOL_GPL(rhashtable_destroy); struct rhash_lock_head __rcu **__rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); unsigned int index = hash & ((1 << tbl->nest) - 1); unsigned int size = tbl->size >> tbl->nest; unsigned int subhash = hash; union nested_table *ntbl; ntbl = nested_table_top(tbl); ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); subhash >>= tbl->nest; while (ntbl && size > (1 << shift)) { index = subhash & ((1 << shift) - 1); ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); size >>= shift; subhash >>= shift; } if (!ntbl) return NULL; return &ntbl[subhash].bucket; } EXPORT_SYMBOL_GPL(__rht_bucket_nested); struct rhash_lock_head __rcu **rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash) { static struct rhash_lock_head __rcu *rhnull; if (!rhnull) INIT_RHT_NULLS_HEAD(rhnull); return __rht_bucket_nested(tbl, hash) ?: &rhnull; } EXPORT_SYMBOL_GPL(rht_bucket_nested); struct rhash_lock_head __rcu **rht_bucket_nested_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); unsigned int index = hash & ((1 << tbl->nest) - 1); unsigned int size = tbl->size >> tbl->nest; union nested_table *ntbl; ntbl = nested_table_top(tbl); hash >>= tbl->nest; ntbl = nested_table_alloc(ht, &ntbl[index].table, size <= (1 << shift)); while (ntbl && size > (1 << shift)) { index = hash & ((1 << shift) - 1); size >>= shift; hash >>= shift; ntbl = nested_table_alloc(ht, &ntbl[index].table, size <= (1 << shift)); } if (!ntbl) return NULL; return &ntbl[hash].bucket; } EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);
18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 /* * Mapping of UID/GIDs to name and vice versa. * * Copyright (c) 2002, 2003 The Regents of the University of * Michigan. All rights reserved. * * Marius Aamodt Eriksen <marius@umich.edu> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <linux/module.h> #include <linux/seq_file.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/sunrpc/svc_xprt.h> #include <net/net_namespace.h> #include "idmap.h" #include "nfsd.h" #include "netns.h" #include "vfs.h" /* * Turn off idmapping when using AUTH_SYS. */ static bool nfs4_disable_idmapping = true; module_param(nfs4_disable_idmapping, bool, 0644); MODULE_PARM_DESC(nfs4_disable_idmapping, "Turn off server's NFSv4 idmapping when using 'sec=sys'"); /* * Cache entry */ /* * XXX we know that IDMAP_NAMESZ < PAGE_SIZE, but it's ugly to rely on * that. */ struct ent { struct cache_head h; int type; /* User / Group */ u32 id; char name[IDMAP_NAMESZ]; char authname[IDMAP_NAMESZ]; struct rcu_head rcu_head; }; /* Common entry handling */ #define ENT_HASHBITS 8 #define ENT_HASHMAX (1 << ENT_HASHBITS) static void ent_init(struct cache_head *cnew, struct cache_head *citm) { struct ent *new = container_of(cnew, struct ent, h); struct ent *itm = container_of(citm, struct ent, h); new->id = itm->id; new->type = itm->type; strscpy(new->name, itm->name, sizeof(new->name)); strscpy(new->authname, itm->authname, sizeof(new->authname)); } static void ent_put(struct kref *ref) { struct ent *map = container_of(ref, struct ent, h.ref); kfree_rcu(map, rcu_head); } static struct cache_head * ent_alloc(void) { struct ent *e = kmalloc_obj(*e); if (e) return &e->h; else return NULL; } /* * ID -> Name cache */ static uint32_t idtoname_hash(struct ent *ent) { uint32_t hash; hash = hash_str(ent->authname, ENT_HASHBITS); hash = hash_long(hash ^ ent->id, ENT_HASHBITS); /* Flip LSB for user/group */ if (ent->type == IDMAP_TYPE_GROUP) hash ^= 1; return hash; } static int idtoname_upcall(struct cache_detail *cd, struct cache_head *h) { return sunrpc_cache_pipe_upcall_timeout(cd, h); } static void idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, int *blen) { struct ent *ent = container_of(ch, struct ent, h); char idstr[11]; qword_add(bpp, blen, ent->authname); snprintf(idstr, sizeof(idstr), "%u", ent->id); qword_add(bpp, blen, ent->type == IDMAP_TYPE_GROUP ? "group" : "user"); qword_add(bpp, blen, idstr); (*bpp)[-1] = '\n'; } static int idtoname_match(struct cache_head *ca, struct cache_head *cb) { struct ent *a = container_of(ca, struct ent, h); struct ent *b = container_of(cb, struct ent, h); return (a->id == b->id && a->type == b->type && strcmp(a->authname, b->authname) == 0); } static int idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { struct ent *ent; if (h == NULL) { seq_puts(m, "#domain type id [name]\n"); return 0; } ent = container_of(h, struct ent, h); seq_printf(m, "%s %s %u", ent->authname, ent->type == IDMAP_TYPE_GROUP ? "group" : "user", ent->id); if (test_bit(CACHE_VALID, &h->flags)) seq_printf(m, " %s", ent->name); seq_putc(m, '\n'); return 0; } static void warn_no_idmapd(struct cache_detail *detail, int has_died) { printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n", has_died ? "died" : "not been started"); } static int idtoname_parse(struct cache_detail *, char *, int); static struct ent *idtoname_lookup(struct cache_detail *, struct ent *); static struct ent *idtoname_update(struct cache_detail *, struct ent *, struct ent *); static const struct cache_detail idtoname_cache_template = { .owner = THIS_MODULE, .hash_size = ENT_HASHMAX, .name = "nfs4.idtoname", .cache_put = ent_put, .cache_upcall = idtoname_upcall, .cache_request = idtoname_request, .cache_parse = idtoname_parse, .cache_show = idtoname_show, .warn_no_listener = warn_no_idmapd, .match = idtoname_match, .init = ent_init, .update = ent_init, .alloc = ent_alloc, }; static int idtoname_parse(struct cache_detail *cd, char *buf, int buflen) { struct ent ent, *res; char *buf1, *bp; int len; int error = -EINVAL; if (buf[buflen - 1] != '\n') return (-EINVAL); buf[buflen - 1]= '\0'; buf1 = kmalloc(PAGE_SIZE, GFP_KERNEL); if (buf1 == NULL) return (-ENOMEM); memset(&ent, 0, sizeof(ent)); /* Authentication name */ len = qword_get(&buf, buf1, PAGE_SIZE); if (len <= 0 || len >= IDMAP_NAMESZ) goto out; memcpy(ent.authname, buf1, sizeof(ent.authname)); /* Type */ if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) goto out; ent.type = strcmp(buf1, "user") == 0 ? IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; /* ID */ if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) goto out; ent.id = simple_strtoul(buf1, &bp, 10); if (bp == buf1) goto out; /* expiry */ error = get_expiry(&buf, &ent.h.expiry_time); if (error) goto out; error = -ENOMEM; res = idtoname_lookup(cd, &ent); if (!res) goto out; /* Name */ error = -EINVAL; len = qword_get(&buf, buf1, PAGE_SIZE); if (len < 0 || len >= IDMAP_NAMESZ) goto out; if (len == 0) set_bit(CACHE_NEGATIVE, &ent.h.flags); else memcpy(ent.name, buf1, sizeof(ent.name)); error = -ENOMEM; res = idtoname_update(cd, &ent, res); if (res == NULL) goto out; cache_put(&res->h, cd); error = 0; out: kfree(buf1); return error; } static struct ent * idtoname_lookup(struct cache_detail *cd, struct ent *item) { struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h, idtoname_hash(item)); if (ch) return container_of(ch, struct ent, h); else return NULL; } static struct ent * idtoname_update(struct cache_detail *cd, struct ent *new, struct ent *old) { struct cache_head *ch = sunrpc_cache_update(cd, &new->h, &old->h, idtoname_hash(new)); if (ch) return container_of(ch, struct ent, h); else return NULL; } /* * Name -> ID cache */ static inline int nametoid_hash(struct ent *ent) { return hash_str(ent->name, ENT_HASHBITS); } static int nametoid_upcall(struct cache_detail *cd, struct cache_head *h) { return sunrpc_cache_pipe_upcall_timeout(cd, h); } static void nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, int *blen) { struct ent *ent = container_of(ch, struct ent, h); qword_add(bpp, blen, ent->authname); qword_add(bpp, blen, ent->type == IDMAP_TYPE_GROUP ? "group" : "user"); qword_add(bpp, blen, ent->name); (*bpp)[-1] = '\n'; } static int nametoid_match(struct cache_head *ca, struct cache_head *cb) { struct ent *a = container_of(ca, struct ent, h); struct ent *b = container_of(cb, struct ent, h); return (a->type == b->type && strcmp(a->name, b->name) == 0 && strcmp(a->authname, b->authname) == 0); } static int nametoid_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { struct ent *ent; if (h == NULL) { seq_puts(m, "#domain type name [id]\n"); return 0; } ent = container_of(h, struct ent, h); seq_printf(m, "%s %s %s", ent->authname, ent->type == IDMAP_TYPE_GROUP ? "group" : "user", ent->name); if (test_bit(CACHE_VALID, &h->flags)) seq_printf(m, " %u", ent->id); seq_putc(m, '\n'); return 0; } static struct ent *nametoid_lookup(struct cache_detail *, struct ent *); static struct ent *nametoid_update(struct cache_detail *, struct ent *, struct ent *); static int nametoid_parse(struct cache_detail *, char *, int); static const struct cache_detail nametoid_cache_template = { .owner = THIS_MODULE, .hash_size = ENT_HASHMAX, .name = "nfs4.nametoid", .cache_put = ent_put, .cache_upcall = nametoid_upcall, .cache_request = nametoid_request, .cache_parse = nametoid_parse, .cache_show = nametoid_show, .warn_no_listener = warn_no_idmapd, .match = nametoid_match, .init = ent_init, .update = ent_init, .alloc = ent_alloc, }; static int nametoid_parse(struct cache_detail *cd, char *buf, int buflen) { struct ent ent, *res; char *buf1; int len, error = -EINVAL; if (buf[buflen - 1] != '\n') return (-EINVAL); buf[buflen - 1]= '\0'; buf1 = kmalloc(PAGE_SIZE, GFP_KERNEL); if (buf1 == NULL) return (-ENOMEM); memset(&ent, 0, sizeof(ent)); /* Authentication name */ len = qword_get(&buf, buf1, PAGE_SIZE); if (len <= 0 || len >= IDMAP_NAMESZ) goto out; memcpy(ent.authname, buf1, sizeof(ent.authname)); /* Type */ if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) goto out; ent.type = strcmp(buf1, "user") == 0 ? IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; /* Name */ len = qword_get(&buf, buf1, PAGE_SIZE); if (len <= 0 || len >= IDMAP_NAMESZ) goto out; memcpy(ent.name, buf1, sizeof(ent.name)); /* expiry */ error = get_expiry(&buf, &ent.h.expiry_time); if (error) goto out; /* ID */ error = get_int(&buf, &ent.id); if (error == -EINVAL) goto out; if (error == -ENOENT) set_bit(CACHE_NEGATIVE, &ent.h.flags); error = -ENOMEM; res = nametoid_lookup(cd, &ent); if (res == NULL) goto out; res = nametoid_update(cd, &ent, res); if (res == NULL) goto out; cache_put(&res->h, cd); error = 0; out: kfree(buf1); return (error); } static struct ent * nametoid_lookup(struct cache_detail *cd, struct ent *item) { struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h, nametoid_hash(item)); if (ch) return container_of(ch, struct ent, h); else return NULL; } static struct ent * nametoid_update(struct cache_detail *cd, struct ent *new, struct ent *old) { struct cache_head *ch = sunrpc_cache_update(cd, &new->h, &old->h, nametoid_hash(new)); if (ch) return container_of(ch, struct ent, h); else return NULL; } /* * Exported API */ int nfsd_idmap_init(struct net *net) { int rv; struct nfsd_net *nn = net_generic(net, nfsd_net_id); nn->idtoname_cache = cache_create_net(&idtoname_cache_template, net); if (IS_ERR(nn->idtoname_cache)) return PTR_ERR(nn->idtoname_cache); rv = cache_register_net(nn->idtoname_cache, net); if (rv) goto destroy_idtoname_cache; nn->nametoid_cache = cache_create_net(&nametoid_cache_template, net); if (IS_ERR(nn->nametoid_cache)) { rv = PTR_ERR(nn->nametoid_cache); goto unregister_idtoname_cache; } rv = cache_register_net(nn->nametoid_cache, net); if (rv) goto destroy_nametoid_cache; return 0; destroy_nametoid_cache: cache_destroy_net(nn->nametoid_cache, net); unregister_idtoname_cache: cache_unregister_net(nn->idtoname_cache, net); destroy_idtoname_cache: cache_destroy_net(nn->idtoname_cache, net); return rv; } void nfsd_idmap_shutdown(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); cache_unregister_net(nn->idtoname_cache, net); cache_unregister_net(nn->nametoid_cache, net); cache_destroy_net(nn->idtoname_cache, net); cache_destroy_net(nn->nametoid_cache, net); } static int idmap_lookup(struct svc_rqst *rqstp, struct ent *(*lookup_fn)(struct cache_detail *, struct ent *), struct ent *key, struct cache_detail *detail, struct ent **item) { int ret; *item = lookup_fn(detail, key); if (!*item) return -ENOMEM; retry: ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle); if (ret == -ETIMEDOUT) { struct ent *prev_item = *item; *item = lookup_fn(detail, key); if (*item != prev_item) goto retry; cache_put(&(*item)->h, detail); } return ret; } static char * rqst_authname(struct svc_rqst *rqstp) { struct auth_domain *clp; clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client; return clp->name; } static __be32 idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id) { struct ent *item, key = { .type = type, }; int ret; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (namelen + 1 > sizeof(key.name)) return nfserr_badowner; memcpy(key.name, name, namelen); key.name[namelen] = '\0'; strscpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, nametoid_lookup, &key, nn->nametoid_cache, &item); if (ret == -ENOENT) return nfserr_badowner; if (ret) return nfserrno(ret); *id = item->id; cache_put(&item->h, nn->nametoid_cache); return 0; } static __be32 encode_ascii_id(struct xdr_stream *xdr, u32 id) { char buf[11]; int len; __be32 *p; len = sprintf(buf, "%u", id); p = xdr_reserve_space(xdr, len + 4); if (!p) return nfserr_resource; p = xdr_encode_opaque(p, buf, len); return 0; } static __be32 idmap_id_to_name(struct xdr_stream *xdr, struct svc_rqst *rqstp, int type, u32 id) { struct ent *item, key = { .id = id, .type = type, }; __be32 status = nfs_ok; __be32 *p; int ret; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); strscpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item); if (ret == -ENOENT) return encode_ascii_id(xdr, id); if (ret) return nfserrno(ret); ret = strlen(item->name); WARN_ON_ONCE(ret > IDMAP_NAMESZ); p = xdr_reserve_space(xdr, ret + 4); if (unlikely(!p)) { status = nfserr_resource; goto out_put; } xdr_encode_opaque(p, item->name, ret); out_put: cache_put(&item->h, nn->idtoname_cache); return status; } static bool numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id) { int ret; char buf[11]; if (namelen + 1 > sizeof(buf)) /* too long to represent a 32-bit id: */ return false; /* Just to make sure it's null-terminated: */ memcpy(buf, name, namelen); buf[namelen] = '\0'; ret = kstrtouint(buf, 10, id); return ret == 0; } static __be32 do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id) { if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) if (numeric_name_to_id(rqstp, type, name, namelen, id)) return 0; /* * otherwise, fall through and try idmapping, for * backwards compatibility with clients sending names: */ return idmap_name_to_id(rqstp, type, name, namelen, id); } static __be32 encode_name_from_id(struct xdr_stream *xdr, struct svc_rqst *rqstp, int type, u32 id) { if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) return encode_ascii_id(xdr, id); return idmap_id_to_name(xdr, rqstp, type, id); } /** * nfsd_map_name_to_uid - Map user@domain to local UID * @rqstp: RPC execution context * @name: user@domain name to be mapped * @namelen: length of name, in bytes * @uid: OUT: mapped local UID value * * Returns nfs_ok on success or an NFSv4 status code on failure. */ __be32 nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, kuid_t *uid) { __be32 status; u32 id = -1; /* * The idmap lookup below triggers an upcall that invokes * cache_check(). RQ_USEDEFERRAL must be clear to prevent * cache_check() from setting RQ_DROPME via svc_defer(). * NFSv4 servers are not permitted to drop requests. Also * RQ_DROPME will force NFSv4.1 session slot processing to * be skipped. */ WARN_ON_ONCE(test_bit(RQ_USEDEFERRAL, &rqstp->rq_flags)); if (name == NULL || namelen == 0) return nfserr_inval; status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id); if (status) return status; *uid = make_kuid(nfsd_user_namespace(rqstp), id); if (!uid_valid(*uid)) status = nfserr_badowner; return status; } /** * nfsd_map_name_to_gid - Map user@domain to local GID * @rqstp: RPC execution context * @name: user@domain name to be mapped * @namelen: length of name, in bytes * @gid: OUT: mapped local GID value * * Returns nfs_ok on success or an NFSv4 status code on failure. */ __be32 nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, kgid_t *gid) { __be32 status; u32 id = -1; /* * The idmap lookup below triggers an upcall that invokes * cache_check(). RQ_USEDEFERRAL must be clear to prevent * cache_check() from setting RQ_DROPME via svc_defer(). * NFSv4 servers are not permitted to drop requests. Also * RQ_DROPME will force NFSv4.1 session slot processing to * be skipped. */ WARN_ON_ONCE(test_bit(RQ_USEDEFERRAL, &rqstp->rq_flags)); if (name == NULL || namelen == 0) return nfserr_inval; status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id); if (status) return status; *gid = make_kgid(nfsd_user_namespace(rqstp), id); if (!gid_valid(*gid)) status = nfserr_badowner; return status; } __be32 nfsd4_encode_user(struct xdr_stream *xdr, struct svc_rqst *rqstp, kuid_t uid) { u32 id = from_kuid_munged(nfsd_user_namespace(rqstp), uid); return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_USER, id); } __be32 nfsd4_encode_group(struct xdr_stream *xdr, struct svc_rqst *rqstp, kgid_t gid) { u32 id = from_kgid_munged(nfsd_user_namespace(rqstp), gid); return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_GROUP, id); }
11 11 11 11 11 1 1 1 1 1 1 1 1 1 11 11 11 9 1 1 8 18 18 13 12 10 3 5 1 3 1 5 3 1 2 8 2 1 1 4 3 6 1 3 2 1 1 4 3 7 11 4 18 18 2 16 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 /* * Copyright (C) 2017 Netronome Systems, Inc. * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this * source tree. * * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. */ #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/mutex.h> #include <linux/rtnetlink.h> #include <net/pkt_cls.h> #include "netdevsim.h" #define pr_vlog(env, fmt, ...) \ bpf_verifier_log_write(env, "[netdevsim] " fmt, ##__VA_ARGS__) struct nsim_bpf_bound_prog { struct nsim_dev *nsim_dev; struct bpf_prog *prog; struct dentry *ddir; const char *state; bool is_loaded; struct list_head l; }; #define NSIM_BPF_MAX_KEYS 2 struct nsim_bpf_bound_map { struct netdevsim *ns; struct bpf_offloaded_map *map; struct mutex mutex; struct nsim_map_entry { void *key; void *value; } entry[NSIM_BPF_MAX_KEYS]; struct list_head l; }; static int nsim_bpf_string_show(struct seq_file *file, void *data) { const char **str = file->private; if (*str) seq_printf(file, "%s\n", *str); return 0; } DEFINE_SHOW_ATTRIBUTE(nsim_bpf_string); static int nsim_bpf_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn) { struct nsim_bpf_bound_prog *state; int ret = 0; state = env->prog->aux->offload->dev_priv; if (state->nsim_dev->bpf_bind_verifier_delay && !insn_idx) msleep(state->nsim_dev->bpf_bind_verifier_delay); if (insn_idx == env->prog->len - 1) { pr_vlog(env, "Hello from netdevsim!\n"); if (!state->nsim_dev->bpf_bind_verifier_accept) ret = -EOPNOTSUPP; } return ret; } static int nsim_bpf_finalize(struct bpf_verifier_env *env) { return 0; } static bool nsim_xdp_offload_active(struct netdevsim *ns) { return ns->xdp_hw.prog; } static void nsim_prog_set_loaded(struct bpf_prog *prog, bool loaded) { struct nsim_bpf_bound_prog *state; if (!prog || !bpf_prog_is_offloaded(prog->aux)) return; state = prog->aux->offload->dev_priv; state->is_loaded = loaded; } static int nsim_bpf_offload(struct netdevsim *ns, struct bpf_prog *prog, bool oldprog) { nsim_prog_set_loaded(ns->bpf_offloaded, false); WARN(!!ns->bpf_offloaded != oldprog, "bad offload state, expected offload %sto be active", oldprog ? "" : "not "); ns->bpf_offloaded = prog; ns->bpf_offloaded_id = prog ? prog->aux->id : 0; nsim_prog_set_loaded(prog, true); return 0; } int nsim_bpf_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) { struct tc_cls_bpf_offload *cls_bpf = type_data; struct bpf_prog *prog = cls_bpf->prog; struct netdevsim *ns = cb_priv; struct bpf_prog *oldprog; if (type != TC_SETUP_CLSBPF) { NSIM_EA(cls_bpf->common.extack, "only offload of BPF classifiers supported"); return -EOPNOTSUPP; } if (!tc_cls_can_offload_and_chain0(ns->netdev, &cls_bpf->common)) return -EOPNOTSUPP; if (cls_bpf->common.protocol != htons(ETH_P_ALL)) { NSIM_EA(cls_bpf->common.extack, "only ETH_P_ALL supported as filter protocol"); return -EOPNOTSUPP; } if (!ns->bpf_tc_accept) { NSIM_EA(cls_bpf->common.extack, "netdevsim configured to reject BPF TC offload"); return -EOPNOTSUPP; } /* Note: progs without skip_sw will probably not be dev bound */ if (prog && !prog->aux->offload && !ns->bpf_tc_non_bound_accept) { NSIM_EA(cls_bpf->common.extack, "netdevsim configured to reject unbound programs"); return -EOPNOTSUPP; } if (cls_bpf->command != TC_CLSBPF_OFFLOAD) return -EOPNOTSUPP; oldprog = cls_bpf->oldprog; /* Don't remove if oldprog doesn't match driver's state */ if (ns->bpf_offloaded != oldprog) { oldprog = NULL; if (!cls_bpf->prog) return 0; if (ns->bpf_offloaded) { NSIM_EA(cls_bpf->common.extack, "driver and netdev offload states mismatch"); return -EBUSY; } } return nsim_bpf_offload(ns, cls_bpf->prog, oldprog); } int nsim_bpf_disable_tc(struct netdevsim *ns) { if (ns->bpf_offloaded && !nsim_xdp_offload_active(ns)) return -EBUSY; return 0; } static int nsim_xdp_offload_prog(struct netdevsim *ns, struct netdev_bpf *bpf) { if (!nsim_xdp_offload_active(ns) && !bpf->prog) return 0; if (!nsim_xdp_offload_active(ns) && bpf->prog && ns->bpf_offloaded) { NSIM_EA(bpf->extack, "TC program is already loaded"); return -EBUSY; } return nsim_bpf_offload(ns, bpf->prog, nsim_xdp_offload_active(ns)); } static int nsim_xdp_set_prog(struct netdevsim *ns, struct netdev_bpf *bpf, struct xdp_attachment_info *xdp) { int err; if (bpf->command == XDP_SETUP_PROG && !ns->bpf_xdpdrv_accept) { NSIM_EA(bpf->extack, "driver XDP disabled in DebugFS"); return -EOPNOTSUPP; } if (bpf->command == XDP_SETUP_PROG_HW && !ns->bpf_xdpoffload_accept) { NSIM_EA(bpf->extack, "XDP offload disabled in DebugFS"); return -EOPNOTSUPP; } if (bpf->command == XDP_SETUP_PROG_HW) { err = nsim_xdp_offload_prog(ns, bpf); if (err) return err; } xdp_attachment_setup(xdp, bpf); return 0; } static int nsim_bpf_create_prog(struct nsim_dev *nsim_dev, struct bpf_prog *prog) { struct nsim_bpf_bound_prog *state; char name[16]; int ret; state = kzalloc_obj(*state); if (!state) return -ENOMEM; state->nsim_dev = nsim_dev; state->prog = prog; state->state = "verify"; /* Program id is not populated yet when we create the state. */ sprintf(name, "%u", nsim_dev->prog_id_gen++); state->ddir = debugfs_create_dir(name, nsim_dev->ddir_bpf_bound_progs); if (IS_ERR(state->ddir)) { ret = PTR_ERR(state->ddir); kfree(state); return ret; } debugfs_create_u32("id", 0400, state->ddir, &prog->aux->id); debugfs_create_file("state", 0400, state->ddir, &state->state, &nsim_bpf_string_fops); debugfs_create_bool("loaded", 0400, state->ddir, &state->is_loaded); mutex_lock(&nsim_dev->progs_list_lock); list_add_tail(&state->l, &nsim_dev->bpf_bound_progs); mutex_unlock(&nsim_dev->progs_list_lock); prog->aux->offload->dev_priv = state; return 0; } static int nsim_bpf_verifier_prep(struct bpf_prog *prog) { struct nsim_dev *nsim_dev = bpf_offload_dev_priv(prog->aux->offload->offdev); if (!nsim_dev->bpf_bind_accept) return -EOPNOTSUPP; return nsim_bpf_create_prog(nsim_dev, prog); } static int nsim_bpf_translate(struct bpf_prog *prog) { struct nsim_bpf_bound_prog *state = prog->aux->offload->dev_priv; state->state = "xlated"; return 0; } static void nsim_bpf_destroy_prog(struct bpf_prog *prog) { struct nsim_bpf_bound_prog *state; struct nsim_dev *nsim_dev; state = prog->aux->offload->dev_priv; nsim_dev = state->nsim_dev; WARN(state->is_loaded, "offload state destroyed while program still bound"); debugfs_remove_recursive(state->ddir); mutex_lock(&nsim_dev->progs_list_lock); list_del(&state->l); mutex_unlock(&nsim_dev->progs_list_lock); kfree(state); } static const struct bpf_prog_offload_ops nsim_bpf_dev_ops = { .insn_hook = nsim_bpf_verify_insn, .finalize = nsim_bpf_finalize, .prepare = nsim_bpf_verifier_prep, .translate = nsim_bpf_translate, .destroy = nsim_bpf_destroy_prog, }; static int nsim_setup_prog_checks(struct netdevsim *ns, struct netdev_bpf *bpf) { if (bpf->prog && bpf->prog->aux->offload) { NSIM_EA(bpf->extack, "attempt to load offloaded prog to drv"); return -EINVAL; } if (bpf->prog && !bpf->prog->aux->xdp_has_frags && ns->netdev->mtu > NSIM_XDP_MAX_MTU) { NSIM_EA(bpf->extack, "MTU too large w/ XDP enabled"); return -EINVAL; } return 0; } static int nsim_setup_prog_hw_checks(struct netdevsim *ns, struct netdev_bpf *bpf) { struct nsim_bpf_bound_prog *state; if (!bpf->prog) return 0; if (!bpf_prog_is_offloaded(bpf->prog->aux)) { NSIM_EA(bpf->extack, "xdpoffload of non-bound program"); return -EINVAL; } state = bpf->prog->aux->offload->dev_priv; if (WARN_ON(strcmp(state->state, "xlated"))) { NSIM_EA(bpf->extack, "offloading program in bad state"); return -EINVAL; } return 0; } static bool nsim_map_key_match(struct bpf_map *map, struct nsim_map_entry *e, void *key) { return e->key && !memcmp(key, e->key, map->key_size); } static int nsim_map_key_find(struct bpf_offloaded_map *offmap, void *key) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; unsigned int i; for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) if (nsim_map_key_match(&offmap->map, &nmap->entry[i], key)) return i; return -ENOENT; } static int nsim_map_alloc_elem(struct bpf_offloaded_map *offmap, unsigned int idx) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; nmap->entry[idx].key = kmalloc(offmap->map.key_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!nmap->entry[idx].key) return -ENOMEM; nmap->entry[idx].value = kmalloc(offmap->map.value_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!nmap->entry[idx].value) { kfree(nmap->entry[idx].key); nmap->entry[idx].key = NULL; return -ENOMEM; } return 0; } static int nsim_map_get_next_key(struct bpf_offloaded_map *offmap, void *key, void *next_key) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx = -ENOENT; mutex_lock(&nmap->mutex); if (key) idx = nsim_map_key_find(offmap, key); if (idx == -ENOENT) idx = 0; else idx++; for (; idx < ARRAY_SIZE(nmap->entry); idx++) { if (nmap->entry[idx].key) { memcpy(next_key, nmap->entry[idx].key, offmap->map.key_size); break; } } mutex_unlock(&nmap->mutex); if (idx == ARRAY_SIZE(nmap->entry)) return -ENOENT; return 0; } static int nsim_map_lookup_elem(struct bpf_offloaded_map *offmap, void *key, void *value) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx; mutex_lock(&nmap->mutex); idx = nsim_map_key_find(offmap, key); if (idx >= 0) memcpy(value, nmap->entry[idx].value, offmap->map.value_size); mutex_unlock(&nmap->mutex); return idx < 0 ? idx : 0; } static int nsim_map_update_elem(struct bpf_offloaded_map *offmap, void *key, void *value, u64 flags) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx, err = 0; mutex_lock(&nmap->mutex); idx = nsim_map_key_find(offmap, key); if (idx < 0 && flags == BPF_EXIST) { err = idx; goto exit_unlock; } if (idx >= 0 && flags == BPF_NOEXIST) { err = -EEXIST; goto exit_unlock; } if (idx < 0) { for (idx = 0; idx < ARRAY_SIZE(nmap->entry); idx++) if (!nmap->entry[idx].key) break; if (idx == ARRAY_SIZE(nmap->entry)) { err = -E2BIG; goto exit_unlock; } err = nsim_map_alloc_elem(offmap, idx); if (err) goto exit_unlock; } memcpy(nmap->entry[idx].key, key, offmap->map.key_size); memcpy(nmap->entry[idx].value, value, offmap->map.value_size); exit_unlock: mutex_unlock(&nmap->mutex); return err; } static int nsim_map_delete_elem(struct bpf_offloaded_map *offmap, void *key) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx; if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) return -EINVAL; mutex_lock(&nmap->mutex); idx = nsim_map_key_find(offmap, key); if (idx >= 0) { kfree(nmap->entry[idx].key); kfree(nmap->entry[idx].value); memset(&nmap->entry[idx], 0, sizeof(nmap->entry[idx])); } mutex_unlock(&nmap->mutex); return idx < 0 ? idx : 0; } static const struct bpf_map_dev_ops nsim_bpf_map_ops = { .map_get_next_key = nsim_map_get_next_key, .map_lookup_elem = nsim_map_lookup_elem, .map_update_elem = nsim_map_update_elem, .map_delete_elem = nsim_map_delete_elem, }; static int nsim_bpf_map_alloc(struct netdevsim *ns, struct bpf_offloaded_map *offmap) { struct nsim_bpf_bound_map *nmap; int i, err; if (WARN_ON(offmap->map.map_type != BPF_MAP_TYPE_ARRAY && offmap->map.map_type != BPF_MAP_TYPE_HASH)) return -EINVAL; if (offmap->map.max_entries > NSIM_BPF_MAX_KEYS) return -ENOMEM; if (offmap->map.map_flags) return -EINVAL; nmap = kzalloc_obj(*nmap, GFP_KERNEL_ACCOUNT); if (!nmap) return -ENOMEM; offmap->dev_priv = nmap; nmap->ns = ns; nmap->map = offmap; mutex_init(&nmap->mutex); if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) { for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) { u32 *key; err = nsim_map_alloc_elem(offmap, i); if (err) goto err_free; key = nmap->entry[i].key; *key = i; memset(nmap->entry[i].value, 0, offmap->map.value_size); } } offmap->dev_ops = &nsim_bpf_map_ops; list_add_tail(&nmap->l, &ns->nsim_dev->bpf_bound_maps); return 0; err_free: while (--i >= 0) { kfree(nmap->entry[i].key); kfree(nmap->entry[i].value); } kfree(nmap); return err; } static void nsim_bpf_map_free(struct bpf_offloaded_map *offmap) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; unsigned int i; for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) { kfree(nmap->entry[i].key); kfree(nmap->entry[i].value); } list_del_init(&nmap->l); mutex_destroy(&nmap->mutex); kfree(nmap); } int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) { struct netdevsim *ns = netdev_priv(dev); int err; ASSERT_RTNL(); switch (bpf->command) { case XDP_SETUP_PROG: err = nsim_setup_prog_checks(ns, bpf); if (err) return err; return nsim_xdp_set_prog(ns, bpf, &ns->xdp); case XDP_SETUP_PROG_HW: err = nsim_setup_prog_hw_checks(ns, bpf); if (err) return err; return nsim_xdp_set_prog(ns, bpf, &ns->xdp_hw); case BPF_OFFLOAD_MAP_ALLOC: if (!ns->bpf_map_accept) return -EOPNOTSUPP; return nsim_bpf_map_alloc(ns, bpf->offmap); case BPF_OFFLOAD_MAP_FREE: nsim_bpf_map_free(bpf->offmap); return 0; default: return -EINVAL; } } int nsim_bpf_dev_init(struct nsim_dev *nsim_dev) { int err; INIT_LIST_HEAD(&nsim_dev->bpf_bound_progs); INIT_LIST_HEAD(&nsim_dev->bpf_bound_maps); nsim_dev->ddir_bpf_bound_progs = debugfs_create_dir("bpf_bound_progs", nsim_dev->ddir); if (IS_ERR(nsim_dev->ddir_bpf_bound_progs)) return PTR_ERR(nsim_dev->ddir_bpf_bound_progs); nsim_dev->bpf_dev = bpf_offload_dev_create(&nsim_bpf_dev_ops, nsim_dev); err = PTR_ERR_OR_ZERO(nsim_dev->bpf_dev); if (err) return err; nsim_dev->bpf_bind_accept = true; debugfs_create_bool("bpf_bind_accept", 0600, nsim_dev->ddir, &nsim_dev->bpf_bind_accept); debugfs_create_u32("bpf_bind_verifier_delay", 0600, nsim_dev->ddir, &nsim_dev->bpf_bind_verifier_delay); nsim_dev->bpf_bind_verifier_accept = true; debugfs_create_bool("bpf_bind_verifier_accept", 0600, nsim_dev->ddir, &nsim_dev->bpf_bind_verifier_accept); return 0; } void nsim_bpf_dev_exit(struct nsim_dev *nsim_dev) { WARN_ON(!list_empty(&nsim_dev->bpf_bound_progs)); WARN_ON(!list_empty(&nsim_dev->bpf_bound_maps)); bpf_offload_dev_destroy(nsim_dev->bpf_dev); } int nsim_bpf_init(struct netdevsim *ns) { struct dentry *ddir = ns->nsim_dev_port->ddir; int err; err = bpf_offload_dev_netdev_register(ns->nsim_dev->bpf_dev, ns->netdev); if (err) return err; debugfs_create_u32("bpf_offloaded_id", 0400, ddir, &ns->bpf_offloaded_id); ns->bpf_tc_accept = true; debugfs_create_bool("bpf_tc_accept", 0600, ddir, &ns->bpf_tc_accept); debugfs_create_bool("bpf_tc_non_bound_accept", 0600, ddir, &ns->bpf_tc_non_bound_accept); ns->bpf_xdpdrv_accept = true; debugfs_create_bool("bpf_xdpdrv_accept", 0600, ddir, &ns->bpf_xdpdrv_accept); ns->bpf_xdpoffload_accept = true; debugfs_create_bool("bpf_xdpoffload_accept", 0600, ddir, &ns->bpf_xdpoffload_accept); ns->bpf_map_accept = true; debugfs_create_bool("bpf_map_accept", 0600, ddir, &ns->bpf_map_accept); return 0; } void nsim_bpf_uninit(struct netdevsim *ns) { WARN_ON(ns->xdp.prog); WARN_ON(ns->xdp_hw.prog); WARN_ON(ns->bpf_offloaded); bpf_offload_dev_netdev_unregister(ns->nsim_dev->bpf_dev, ns->netdev); }
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 // SPDX-License-Identifier: GPL-2.0-or-later /* * sctp_offload - GRO/GSO Offloading for SCTP * * Copyright (C) 2015, Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/kprobes.h> #include <linux/socket.h> #include <linux/sctp.h> #include <linux/proc_fs.h> #include <linux/vmalloc.h> #include <linux/module.h> #include <linux/kfifo.h> #include <linux/time.h> #include <net/net_namespace.h> #include <linux/skbuff.h> #include <net/sctp/sctp.h> #include <net/sctp/checksum.h> #include <net/protocol.h> #include <net/gso.h> static __le32 sctp_gso_make_checksum(struct sk_buff *skb) { skb->ip_summed = CHECKSUM_NONE; skb->csum_not_inet = 0; /* csum and csum_start in GSO CB may be needed to do the UDP * checksum when it's a UDP tunneling packet. */ SKB_GSO_CB(skb)->csum = (__force __wsum)~0; SKB_GSO_CB(skb)->csum_start = skb_headroom(skb) + skb->len; return sctp_compute_cksum(skb, skb_transport_offset(skb)); } static struct sk_buff *sctp_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct sctphdr *sh; if (!skb_is_gso_sctp(skb)) goto out; sh = sctp_hdr(skb); if (!pskb_may_pull(skb, sizeof(*sh))) goto out; __skb_pull(skb, sizeof(*sh)); if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. */ struct skb_shared_info *pinfo = skb_shinfo(skb); struct sk_buff *frag_iter; pinfo->gso_segs = 0; if (skb->len != skb->data_len) { /* Means we have chunks in here too */ pinfo->gso_segs++; } skb_walk_frags(skb, frag_iter) pinfo->gso_segs++; segs = NULL; goto out; } segs = skb_segment(skb, (features | NETIF_F_HW_CSUM) & ~NETIF_F_SG); if (IS_ERR(segs)) goto out; /* All that is left is update SCTP CRC if necessary */ if (!(features & NETIF_F_SCTP_CRC)) { for (skb = segs; skb; skb = skb->next) { if (skb->ip_summed == CHECKSUM_PARTIAL) { sh = sctp_hdr(skb); sh->checksum = sctp_gso_make_checksum(skb); } } } out: return segs; } static const struct net_offload sctp_offload = { .callbacks = { .gso_segment = sctp_gso_segment, }, }; static const struct net_offload sctp6_offload = { .callbacks = { .gso_segment = sctp_gso_segment, }, }; int __init sctp_offload_init(void) { int ret; ret = inet_add_offload(&sctp_offload, IPPROTO_SCTP); if (ret) goto out; ret = inet6_add_offload(&sctp6_offload, IPPROTO_SCTP); if (ret) goto ipv4; return ret; ipv4: inet_del_offload(&sctp_offload, IPPROTO_SCTP); out: return ret; }
11 18991 18902 333 816 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 /* SPDX-License-Identifier: GPL-2.0-only */ /* * kref.h - library routines for handling generic reference counted objects * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Corp. * * based on kobject.h which was: * Copyright (C) 2002-2003 Patrick Mochel <mochel@osdl.org> * Copyright (C) 2002-2003 Open Source Development Labs */ #ifndef _KREF_H_ #define _KREF_H_ #include <linux/spinlock.h> #include <linux/refcount.h> struct kref { refcount_t refcount; }; #define KREF_INIT(n) { .refcount = REFCOUNT_INIT(n), } /** * kref_init - initialize object. * @kref: object in question. */ static inline void kref_init(struct kref *kref) { refcount_set(&kref->refcount, 1); } static inline unsigned int kref_read(const struct kref *kref) { return refcount_read(&kref->refcount); } /** * kref_get - increment refcount for object. * @kref: object. */ static inline void kref_get(struct kref *kref) { refcount_inc(&kref->refcount); } /** * kref_put - Decrement refcount for object * @kref: Object * @release: Pointer to the function that will clean up the object when the * last reference to the object is released. * * Decrement the refcount, and if 0, call @release. The caller may not * pass NULL or kfree() as the release function. * * Return: 1 if this call removed the object, otherwise return 0. Beware, * if this function returns 0, another caller may have removed the object * by the time this function returns. The return value is only certain * if you want to see if the object is definitely released. */ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref)) { if (refcount_dec_and_test(&kref->refcount)) { release(kref); return 1; } return 0; } /** * kref_put_mutex - Decrement refcount for object * @kref: Object * @release: Pointer to the function that will clean up the object when the * last reference to the object is released. * @mutex: Mutex which protects the release function. * * This variant of kref_lock() calls the @release function with the @mutex * held. The @release function will release the mutex. */ static inline int kref_put_mutex(struct kref *kref, void (*release)(struct kref *kref), struct mutex *mutex) __cond_acquires(true, mutex) { if (refcount_dec_and_mutex_lock(&kref->refcount, mutex)) { release(kref); return 1; } return 0; } /** * kref_put_lock - Decrement refcount for object * @kref: Object * @release: Pointer to the function that will clean up the object when the * last reference to the object is released. * @lock: Spinlock which protects the release function. * * This variant of kref_lock() calls the @release function with the @lock * held. The @release function will release the lock. */ static inline int kref_put_lock(struct kref *kref, void (*release)(struct kref *kref), spinlock_t *lock) __cond_acquires(true, lock) { if (refcount_dec_and_lock(&kref->refcount, lock)) { release(kref); return 1; } return 0; } /** * kref_get_unless_zero - Increment refcount for object unless it is zero. * @kref: object. * * This function is intended to simplify locking around refcounting for * objects that can be looked up from a lookup structure, and which are * removed from that lookup structure in the object destructor. * Operations on such objects require at least a read lock around * lookup + kref_get, and a write lock around kref_put + remove from lookup * structure. Furthermore, RCU implementations become extremely tricky. * With a lookup followed by a kref_get_unless_zero *with return value check* * locking in the kref_put path can be deferred to the actual removal from * the lookup structure and RCU lookups become trivial. * * Return: non-zero if the increment succeeded. Otherwise return 0. */ static inline int __must_check kref_get_unless_zero(struct kref *kref) { return refcount_inc_not_zero(&kref->refcount); } #endif /* _KREF_H_ */
12 12 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 Richard Henderson * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org> * Copyright (C) 2024 Mike Rapoport IBM. */ #define pr_fmt(fmt) "execmem: " fmt #include <linux/mm.h> #include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/execmem.h> #include <linux/maple_tree.h> #include <linux/set_memory.h> #include <linux/moduleloader.h> #include <linux/text-patching.h> #include <asm/tlbflush.h> #include "internal.h" static struct execmem_info *execmem_info __ro_after_init; static struct execmem_info default_execmem_info __ro_after_init; #ifdef CONFIG_MMU static void *execmem_vmalloc(struct execmem_range *range, size_t size, pgprot_t pgprot, unsigned long vm_flags) { bool kasan = range->flags & EXECMEM_KASAN_SHADOW; gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN; unsigned int align = range->alignment; unsigned long start = range->start; unsigned long end = range->end; void *p; if (kasan) vm_flags |= VM_DEFER_KMEMLEAK; p = __vmalloc_node_range(size, align, start, end, gfp_flags, pgprot, vm_flags, NUMA_NO_NODE, __builtin_return_address(0)); if (!p && range->fallback_start) { start = range->fallback_start; end = range->fallback_end; p = __vmalloc_node_range(size, align, start, end, gfp_flags, pgprot, vm_flags, NUMA_NO_NODE, __builtin_return_address(0)); } if (!p) { pr_warn_ratelimited("unable to allocate memory\n"); return NULL; } if (kasan && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) { vfree(p); return NULL; } return p; } struct vm_struct *execmem_vmap(size_t size) { struct execmem_range *range = &execmem_info->ranges[EXECMEM_MODULE_DATA]; struct vm_struct *area; area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC, range->start, range->end, NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); if (!area && range->fallback_start) area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC, range->fallback_start, range->fallback_end, NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); return area; } #else static void *execmem_vmalloc(struct execmem_range *range, size_t size, pgprot_t pgprot, unsigned long vm_flags) { return vmalloc(size); } #endif /* CONFIG_MMU */ #ifdef CONFIG_ARCH_HAS_EXECMEM_ROX struct execmem_cache { struct mutex mutex; struct maple_tree busy_areas; struct maple_tree free_areas; unsigned int pending_free_cnt; /* protected by mutex */ }; /* delay to schedule asynchronous free if fast path free fails */ #define FREE_DELAY (msecs_to_jiffies(10)) /* mark entries in busy_areas that should be freed asynchronously */ #define PENDING_FREE_MASK (1 << (PAGE_SHIFT - 1)) static struct execmem_cache execmem_cache = { .mutex = __MUTEX_INITIALIZER(execmem_cache.mutex), .busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN, execmem_cache.mutex), .free_areas = MTREE_INIT_EXT(free_areas, MT_FLAGS_LOCK_EXTERN, execmem_cache.mutex), }; static inline unsigned long mas_range_len(struct ma_state *mas) { return mas->last - mas->index + 1; } static int execmem_set_direct_map_valid(struct vm_struct *vm, bool valid) { unsigned int nr = (1 << get_vm_area_page_order(vm)); unsigned int updated = 0; int err = 0; for (int i = 0; i < vm->nr_pages; i += nr) { err = set_direct_map_valid_noflush(vm->pages[i], nr, valid); if (err) goto err_restore; updated += nr; } return 0; err_restore: for (int i = 0; i < updated; i += nr) set_direct_map_valid_noflush(vm->pages[i], nr, !valid); return err; } static int execmem_force_rw(void *ptr, size_t size) { unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long addr = (unsigned long)ptr; int ret; ret = set_memory_nx(addr, nr); if (ret) return ret; return set_memory_rw(addr, nr); } int execmem_restore_rox(void *ptr, size_t size) { unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long addr = (unsigned long)ptr; return set_memory_rox(addr, nr); } static void execmem_cache_clean(struct work_struct *work) { struct maple_tree *free_areas = &execmem_cache.free_areas; struct mutex *mutex = &execmem_cache.mutex; MA_STATE(mas, free_areas, 0, ULONG_MAX); void *area; mutex_lock(mutex); mas_for_each(&mas, area, ULONG_MAX) { size_t size = mas_range_len(&mas); if (IS_ALIGNED(size, PMD_SIZE) && IS_ALIGNED(mas.index, PMD_SIZE)) { struct vm_struct *vm = find_vm_area(area); execmem_set_direct_map_valid(vm, true); mas_store_gfp(&mas, NULL, GFP_KERNEL); vfree(area); } } mutex_unlock(mutex); } static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean); static int execmem_cache_add_locked(void *ptr, size_t size, gfp_t gfp_mask) { struct maple_tree *free_areas = &execmem_cache.free_areas; unsigned long addr = (unsigned long)ptr; MA_STATE(mas, free_areas, addr - 1, addr + 1); unsigned long lower, upper; void *area = NULL; lower = addr; upper = addr + size - 1; area = mas_walk(&mas); if (area && mas.last == addr - 1) lower = mas.index; area = mas_next(&mas, ULONG_MAX); if (area && mas.index == addr + size) upper = mas.last; mas_set_range(&mas, lower, upper); return mas_store_gfp(&mas, (void *)lower, gfp_mask); } static int execmem_cache_add(void *ptr, size_t size, gfp_t gfp_mask) { guard(mutex)(&execmem_cache.mutex); return execmem_cache_add_locked(ptr, size, gfp_mask); } static bool within_range(struct execmem_range *range, struct ma_state *mas, size_t size) { unsigned long addr = mas->index; if (addr >= range->start && addr + size < range->end) return true; if (range->fallback_start && addr >= range->fallback_start && addr + size < range->fallback_end) return true; return false; } static void *__execmem_cache_alloc(struct execmem_range *range, size_t size) { struct maple_tree *free_areas = &execmem_cache.free_areas; struct maple_tree *busy_areas = &execmem_cache.busy_areas; MA_STATE(mas_free, free_areas, 0, ULONG_MAX); MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX); struct mutex *mutex = &execmem_cache.mutex; unsigned long addr, last, area_size = 0; void *area, *ptr = NULL; int err; mutex_lock(mutex); mas_for_each(&mas_free, area, ULONG_MAX) { area_size = mas_range_len(&mas_free); if (area_size >= size && within_range(range, &mas_free, size)) break; } if (area_size < size) goto out_unlock; addr = mas_free.index; last = mas_free.last; /* insert allocated size to busy_areas at range [addr, addr + size) */ mas_set_range(&mas_busy, addr, addr + size - 1); err = mas_store_gfp(&mas_busy, (void *)addr, GFP_KERNEL); if (err) goto out_unlock; mas_store_gfp(&mas_free, NULL, GFP_KERNEL); if (area_size > size) { void *ptr = (void *)(addr + size); /* * re-insert remaining free size to free_areas at range * [addr + size, last] */ mas_set_range(&mas_free, addr + size, last); err = mas_store_gfp(&mas_free, ptr, GFP_KERNEL); if (err) { mas_store_gfp(&mas_busy, NULL, GFP_KERNEL); goto out_unlock; } } ptr = (void *)addr; out_unlock: mutex_unlock(mutex); return ptr; } static int execmem_cache_populate(struct execmem_range *range, size_t size) { unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; struct vm_struct *vm; size_t alloc_size; int err = -ENOMEM; void *p; alloc_size = round_up(size, PMD_SIZE); p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags); if (!p) { alloc_size = size; p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags); } if (!p) return err; vm = find_vm_area(p); if (!vm) goto err_free_mem; /* fill memory with instructions that will trap */ execmem_fill_trapping_insns(p, alloc_size); err = set_memory_rox((unsigned long)p, vm->nr_pages); if (err) goto err_free_mem; err = execmem_cache_add(p, alloc_size, GFP_KERNEL); if (err) goto err_reset_direct_map; return 0; err_reset_direct_map: execmem_set_direct_map_valid(vm, true); err_free_mem: vfree(p); return err; } static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { void *p; int err; p = __execmem_cache_alloc(range, size); if (p) return p; err = execmem_cache_populate(range, size); if (err) return NULL; return __execmem_cache_alloc(range, size); } static inline bool is_pending_free(void *ptr) { return ((unsigned long)ptr & PENDING_FREE_MASK); } static inline void *pending_free_set(void *ptr) { return (void *)((unsigned long)ptr | PENDING_FREE_MASK); } static inline void *pending_free_clear(void *ptr) { return (void *)((unsigned long)ptr & ~PENDING_FREE_MASK); } static int __execmem_cache_free(struct ma_state *mas, void *ptr, gfp_t gfp_mask) { size_t size = mas_range_len(mas); int err; err = execmem_force_rw(ptr, size); if (err) return err; execmem_fill_trapping_insns(ptr, size); execmem_restore_rox(ptr, size); err = execmem_cache_add_locked(ptr, size, gfp_mask); if (err) return err; mas_store_gfp(mas, NULL, gfp_mask); return 0; } static void execmem_cache_free_slow(struct work_struct *work); static DECLARE_DELAYED_WORK(execmem_cache_free_work, execmem_cache_free_slow); static void execmem_cache_free_slow(struct work_struct *work) { struct maple_tree *busy_areas = &execmem_cache.busy_areas; MA_STATE(mas, busy_areas, 0, ULONG_MAX); void *area; guard(mutex)(&execmem_cache.mutex); if (!execmem_cache.pending_free_cnt) return; mas_for_each(&mas, area, ULONG_MAX) { if (!is_pending_free(area)) continue; area = pending_free_clear(area); if (__execmem_cache_free(&mas, area, GFP_KERNEL)) continue; execmem_cache.pending_free_cnt--; } if (execmem_cache.pending_free_cnt) schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY); else schedule_work(&execmem_cache_clean_work); } static bool execmem_cache_free(void *ptr) { struct maple_tree *busy_areas = &execmem_cache.busy_areas; unsigned long addr = (unsigned long)ptr; MA_STATE(mas, busy_areas, addr, addr); void *area; int err; guard(mutex)(&execmem_cache.mutex); area = mas_walk(&mas); if (!area) return false; err = __execmem_cache_free(&mas, area, GFP_KERNEL | __GFP_NORETRY); if (err) { /* * mas points to exact slot we've got the area from, nothing * else can modify the tree because of the mutex, so there * won't be any allocations in mas_store_gfp() and it will just * change the pointer. */ area = pending_free_set(area); mas_store_gfp(&mas, area, GFP_KERNEL); execmem_cache.pending_free_cnt++; schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY); return true; } schedule_work(&execmem_cache_clean_work); return true; } #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ /* * when ROX cache is not used the permissions defined by architectures for * execmem ranges that are updated before use (e.g. EXECMEM_MODULE_TEXT) must * be writable anyway */ static inline int execmem_force_rw(void *ptr, size_t size) { return 0; } static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { return NULL; } static bool execmem_cache_free(void *ptr) { return false; } #endif /* CONFIG_ARCH_HAS_EXECMEM_ROX */ void *execmem_alloc(enum execmem_type type, size_t size) { struct execmem_range *range = &execmem_info->ranges[type]; bool use_cache = range->flags & EXECMEM_ROX_CACHE; unsigned long vm_flags = VM_FLUSH_RESET_PERMS; pgprot_t pgprot = range->pgprot; void *p = NULL; size = PAGE_ALIGN(size); if (use_cache) p = execmem_cache_alloc(range, size); else p = execmem_vmalloc(range, size, pgprot, vm_flags); return kasan_reset_tag(p); } void *execmem_alloc_rw(enum execmem_type type, size_t size) { void *p __free(execmem) = execmem_alloc(type, size); int err; if (!p) return NULL; err = execmem_force_rw(p, size); if (err) return NULL; return no_free_ptr(p); } void execmem_free(void *ptr) { /* * This memory may be RO, and freeing RO memory in an interrupt is not * supported by vmalloc. */ WARN_ON(in_interrupt()); if (!execmem_cache_free(ptr)) vfree(ptr); } bool execmem_is_rox(enum execmem_type type) { return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE); } static bool execmem_validate(struct execmem_info *info) { struct execmem_range *r = &info->ranges[EXECMEM_DEFAULT]; if (!r->alignment || !r->start || !r->end || !pgprot_val(r->pgprot)) { pr_crit("Invalid parameters for execmem allocator, module loading will fail"); return false; } if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX)) { for (int i = EXECMEM_DEFAULT; i < EXECMEM_TYPE_MAX; i++) { r = &info->ranges[i]; if (r->flags & EXECMEM_ROX_CACHE) { pr_warn_once("ROX cache is not supported\n"); r->flags &= ~EXECMEM_ROX_CACHE; } } } return true; } static void execmem_init_missing(struct execmem_info *info) { struct execmem_range *default_range = &info->ranges[EXECMEM_DEFAULT]; for (int i = EXECMEM_DEFAULT + 1; i < EXECMEM_TYPE_MAX; i++) { struct execmem_range *r = &info->ranges[i]; if (!r->start) { if (i == EXECMEM_MODULE_DATA) r->pgprot = PAGE_KERNEL; else r->pgprot = default_range->pgprot; r->alignment = default_range->alignment; r->start = default_range->start; r->end = default_range->end; r->flags = default_range->flags; r->fallback_start = default_range->fallback_start; r->fallback_end = default_range->fallback_end; } } } struct execmem_info * __weak execmem_arch_setup(void) { return NULL; } static void __init __execmem_init(void) { struct execmem_info *info = execmem_arch_setup(); if (!info) { info = execmem_info = &default_execmem_info; info->ranges[EXECMEM_DEFAULT].start = VMALLOC_START; info->ranges[EXECMEM_DEFAULT].end = VMALLOC_END; info->ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL_EXEC; info->ranges[EXECMEM_DEFAULT].alignment = 1; } if (!execmem_validate(info)) return; execmem_init_missing(info); execmem_info = info; } #ifdef CONFIG_ARCH_WANTS_EXECMEM_LATE static int __init execmem_late_init(void) { __execmem_init(); return 0; } core_initcall(execmem_late_init); #else void __init execmem_init(void) { __execmem_init(); } #endif
3 3 3 6 3 5 5 5 5 5 5 9 9 3 1137 1141 1142 16 16 16 16 16 8 3 3 3 7 1 1 3 2 5 1 1 2 1 5 16 1 1 2 12 9 3 12 16 5 1 1 2 1 9 3 1 4 1 9 6 6 6 6 3 6 6 3 1 2 3 1 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 /* * net/tipc/bearer.c: TIPC bearer code * * Copyright (c) 1996-2006, 2013-2016, Ericsson AB * Copyright (c) 2004-2006, 2010-2013, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <net/sock.h> #include "core.h" #include "bearer.h" #include "link.h" #include "discover.h" #include "monitor.h" #include "bcast.h" #include "netlink.h" #include "udp_media.h" #include "trace.h" #include "crypto.h" #define MAX_ADDR_STR 60 static struct tipc_media * const media_info_array[] = { &eth_media_info, #ifdef CONFIG_TIPC_MEDIA_IB &ib_media_info, #endif #ifdef CONFIG_TIPC_MEDIA_UDP &udp_media_info, #endif NULL }; static struct tipc_bearer *bearer_get(struct net *net, int bearer_id) { struct tipc_net *tn = tipc_net(net); return rcu_dereference(tn->bearer_list[bearer_id]); } static void bearer_disable(struct net *net, struct tipc_bearer *b); static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); /** * tipc_media_find - locates specified media object by name * @name: name to locate */ struct tipc_media *tipc_media_find(const char *name) { u32 i; for (i = 0; media_info_array[i] != NULL; i++) { if (!strcmp(media_info_array[i]->name, name)) break; } return media_info_array[i]; } /** * media_find_id - locates specified media object by type identifier * @type: type identifier to locate */ static struct tipc_media *media_find_id(u8 type) { u32 i; for (i = 0; media_info_array[i] != NULL; i++) { if (media_info_array[i]->type_id == type) break; } return media_info_array[i]; } /** * tipc_media_addr_printf - record media address in print buffer * @buf: output buffer * @len: output buffer size remaining * @a: input media address */ int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a) { char addr_str[MAX_ADDR_STR]; struct tipc_media *m; int ret; m = media_find_id(a->media_id); if (m && !m->addr2str(a, addr_str, sizeof(addr_str))) ret = scnprintf(buf, len, "%s(%s)", m->name, addr_str); else { u32 i; ret = scnprintf(buf, len, "UNKNOWN(%u)", a->media_id); for (i = 0; i < sizeof(a->value); i++) ret += scnprintf(buf + ret, len - ret, "-%x", a->value[i]); } return ret; } /** * bearer_name_validate - validate & (optionally) deconstruct bearer name * @name: ptr to bearer name string * @name_parts: ptr to area for bearer name components (or NULL if not needed) * * Return: 1 if bearer name is valid, otherwise 0. */ static int bearer_name_validate(const char *name, struct tipc_bearer_names *name_parts) { char name_copy[TIPC_MAX_BEARER_NAME]; char *media_name; char *if_name; u32 media_len; u32 if_len; /* copy bearer name & ensure length is OK */ if (strscpy(name_copy, name, TIPC_MAX_BEARER_NAME) < 0) return 0; /* ensure all component parts of bearer name are present */ media_name = name_copy; if_name = strchr(media_name, ':'); if (if_name == NULL) return 0; *(if_name++) = 0; media_len = if_name - media_name; if_len = strlen(if_name) + 1; /* validate component parts of bearer name */ if ((media_len <= 1) || (media_len > TIPC_MAX_MEDIA_NAME) || (if_len <= 1) || (if_len > TIPC_MAX_IF_NAME)) return 0; /* return bearer name components, if necessary */ if (name_parts) { if (strscpy(name_parts->media_name, media_name, TIPC_MAX_MEDIA_NAME) < 0) return 0; if (strscpy(name_parts->if_name, if_name, TIPC_MAX_IF_NAME) < 0) return 0; } return 1; } /** * tipc_bearer_find - locates bearer object with matching bearer name * @net: the applicable net namespace * @name: bearer name to locate */ struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name) { struct tipc_net *tn = tipc_net(net); struct tipc_bearer *b; u32 i; for (i = 0; i < MAX_BEARERS; i++) { b = rtnl_dereference(tn->bearer_list[i]); if (b && (!strcmp(b->name, name))) return b; } return NULL; } /* tipc_bearer_get_name - get the bearer name from its id. * @net: network namespace * @name: a pointer to the buffer where the name will be stored. * @bearer_id: the id to get the name from. */ int tipc_bearer_get_name(struct net *net, char *name, u32 bearer_id) { struct tipc_net *tn = tipc_net(net); struct tipc_bearer *b; if (bearer_id >= MAX_BEARERS) return -EINVAL; b = rtnl_dereference(tn->bearer_list[bearer_id]); if (!b) return -EINVAL; strcpy(name, b->name); return 0; } void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest) { struct tipc_bearer *b; rcu_read_lock(); b = bearer_get(net, bearer_id); if (b) tipc_disc_add_dest(b->disc); rcu_read_unlock(); } void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest) { struct tipc_bearer *b; rcu_read_lock(); b = bearer_get(net, bearer_id); if (b) tipc_disc_remove_dest(b->disc); rcu_read_unlock(); } /** * tipc_enable_bearer - enable bearer with the given name * @net: the applicable net namespace * @name: bearer name to enable * @disc_domain: bearer domain * @prio: bearer priority * @attr: nlattr array * @extack: netlink extended ack */ static int tipc_enable_bearer(struct net *net, const char *name, u32 disc_domain, u32 prio, struct nlattr *attr[], struct netlink_ext_ack *extack) { struct tipc_net *tn = tipc_net(net); struct tipc_bearer_names b_names; int with_this_prio = 1; struct tipc_bearer *b; struct tipc_media *m; struct sk_buff *skb; int bearer_id = 0; int res = -EINVAL; char *errstr = ""; u32 i; if (!bearer_name_validate(name, &b_names)) { NL_SET_ERR_MSG(extack, "Illegal name"); return res; } if (prio > TIPC_MAX_LINK_PRI && prio != TIPC_MEDIA_LINK_PRI) { errstr = "illegal priority"; NL_SET_ERR_MSG(extack, "Illegal priority"); goto rejected; } m = tipc_media_find(b_names.media_name); if (!m) { errstr = "media not registered"; NL_SET_ERR_MSG(extack, "Media not registered"); goto rejected; } if (prio == TIPC_MEDIA_LINK_PRI) prio = m->priority; /* Check new bearer vs existing ones and find free bearer id if any */ bearer_id = MAX_BEARERS; i = MAX_BEARERS; while (i-- != 0) { b = rtnl_dereference(tn->bearer_list[i]); if (!b) { bearer_id = i; continue; } if (!strcmp(name, b->name)) { errstr = "already enabled"; NL_SET_ERR_MSG(extack, "Already enabled"); goto rejected; } if (b->priority == prio && (++with_this_prio > 2)) { pr_warn("Bearer <%s>: already 2 bearers with priority %u\n", name, prio); if (prio == TIPC_MIN_LINK_PRI) { errstr = "cannot adjust to lower"; NL_SET_ERR_MSG(extack, "Cannot adjust to lower"); goto rejected; } pr_warn("Bearer <%s>: trying with adjusted priority\n", name); prio--; bearer_id = MAX_BEARERS; i = MAX_BEARERS; with_this_prio = 1; } } if (bearer_id >= MAX_BEARERS) { errstr = "max 3 bearers permitted"; NL_SET_ERR_MSG(extack, "Max 3 bearers permitted"); goto rejected; } b = kzalloc_obj(*b, GFP_ATOMIC); if (!b) return -ENOMEM; strscpy(b->name, name); b->media = m; res = m->enable_media(net, b, attr); if (res) { kfree(b); errstr = "failed to enable media"; NL_SET_ERR_MSG(extack, "Failed to enable media"); goto rejected; } b->identity = bearer_id; b->tolerance = m->tolerance; b->min_win = m->min_win; b->max_win = m->max_win; b->domain = disc_domain; b->net_plane = bearer_id + 'A'; b->priority = prio; refcount_set(&b->refcnt, 1); res = tipc_disc_create(net, b, &b->bcast_addr, &skb); if (res) { bearer_disable(net, b); errstr = "failed to create discoverer"; NL_SET_ERR_MSG(extack, "Failed to create discoverer"); goto rejected; } /* Create monitoring data before accepting activate messages */ if (tipc_mon_create(net, bearer_id)) { bearer_disable(net, b); kfree_skb(skb); return -ENOMEM; } test_and_set_bit_lock(0, &b->up); rcu_assign_pointer(tn->bearer_list[bearer_id], b); if (skb) tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr); pr_info("Enabled bearer <%s>, priority %u\n", name, prio); return res; rejected: pr_warn("Enabling of bearer <%s> rejected, %s\n", name, errstr); return res; } /** * tipc_reset_bearer - Reset all links established over this bearer * @net: the applicable net namespace * @b: the target bearer */ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b) { pr_info("Resetting bearer <%s>\n", b->name); tipc_node_delete_links(net, b->identity); tipc_disc_reset(net, b); return 0; } bool tipc_bearer_hold(struct tipc_bearer *b) { return (b && refcount_inc_not_zero(&b->refcnt)); } void tipc_bearer_put(struct tipc_bearer *b) { if (b && refcount_dec_and_test(&b->refcnt)) kfree_rcu(b, rcu); } /** * bearer_disable - disable this bearer * @net: the applicable net namespace * @b: the bearer to disable * * Note: This routine assumes caller holds RTNL lock. */ static void bearer_disable(struct net *net, struct tipc_bearer *b) { struct tipc_net *tn = tipc_net(net); int bearer_id = b->identity; pr_info("Disabling bearer <%s>\n", b->name); clear_bit_unlock(0, &b->up); tipc_node_delete_links(net, bearer_id); b->media->disable_media(b); RCU_INIT_POINTER(b->media_ptr, NULL); if (b->disc) tipc_disc_delete(b->disc); RCU_INIT_POINTER(tn->bearer_list[bearer_id], NULL); tipc_bearer_put(b); tipc_mon_delete(net, bearer_id); } int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, struct nlattr *attr[]) { char *dev_name = strchr((const char *)b->name, ':') + 1; int hwaddr_len = b->media->hwaddr_len; u8 node_id[NODE_ID_LEN] = {0,}; struct net_device *dev; /* Find device with specified name */ dev = dev_get_by_name(net, dev_name); if (!dev) return -ENODEV; if (tipc_mtu_bad(dev)) { dev_put(dev); return -EINVAL; } if (dev == net->loopback_dev) { dev_put(dev); pr_info("Enabling <%s> not permitted\n", b->name); return -EINVAL; } /* Autoconfigure own node identity if needed */ if (!tipc_own_id(net) && hwaddr_len <= NODE_ID_LEN) { memcpy(node_id, dev->dev_addr, hwaddr_len); tipc_net_init(net, node_id, 0); } if (!tipc_own_id(net)) { dev_put(dev); pr_warn("Failed to obtain node identity\n"); return -EINVAL; } /* Associate TIPC bearer with L2 bearer */ rcu_assign_pointer(b->media_ptr, dev); b->pt.dev = dev; b->pt.type = htons(ETH_P_TIPC); b->pt.func = tipc_l2_rcv_msg; dev_add_pack(&b->pt); memset(&b->bcast_addr, 0, sizeof(b->bcast_addr)); memcpy(b->bcast_addr.value, dev->broadcast, hwaddr_len); b->bcast_addr.media_id = b->media->type_id; b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT; b->mtu = dev->mtu; b->media->raw2addr(b, &b->addr, (const char *)dev->dev_addr); rcu_assign_pointer(dev->tipc_ptr, b); return 0; } /* tipc_disable_l2_media - detach TIPC bearer from an L2 interface * @b: the target bearer * * Mark L2 bearer as inactive so that incoming buffers are thrown away */ void tipc_disable_l2_media(struct tipc_bearer *b) { struct net_device *dev; dev = (struct net_device *)rtnl_dereference(b->media_ptr); dev_remove_pack(&b->pt); RCU_INIT_POINTER(dev->tipc_ptr, NULL); synchronize_net(); dev_put(dev); } /** * tipc_l2_send_msg - send a TIPC packet out over an L2 interface * @net: the associated network namespace * @skb: the packet to be sent * @b: the bearer through which the packet is to be sent * @dest: peer destination address */ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb, struct tipc_bearer *b, struct tipc_media_addr *dest) { struct net_device *dev; int delta; dev = (struct net_device *)rcu_dereference(b->media_ptr); if (!dev) return 0; delta = SKB_DATA_ALIGN(dev->hard_header_len - skb_headroom(skb)); if ((delta > 0) && pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) { kfree_skb(skb); return 0; } skb_reset_network_header(skb); skb->dev = dev; skb->protocol = htons(ETH_P_TIPC); dev_hard_header(skb, dev, ETH_P_TIPC, dest->value, dev->dev_addr, skb->len); dev_queue_xmit(skb); return 0; } bool tipc_bearer_bcast_support(struct net *net, u32 bearer_id) { bool supp = false; struct tipc_bearer *b; rcu_read_lock(); b = bearer_get(net, bearer_id); if (b) supp = (b->bcast_addr.broadcast == TIPC_BROADCAST_SUPPORT); rcu_read_unlock(); return supp; } int tipc_bearer_mtu(struct net *net, u32 bearer_id) { int mtu = 0; struct tipc_bearer *b; rcu_read_lock(); b = bearer_get(net, bearer_id); if (b) mtu = b->mtu; rcu_read_unlock(); return mtu; } int tipc_bearer_min_mtu(struct net *net, u32 bearer_id) { int mtu = TIPC_MIN_BEARER_MTU; struct tipc_bearer *b; rcu_read_lock(); b = bearer_get(net, bearer_id); if (b) mtu += b->encap_hlen; rcu_read_unlock(); return mtu; } /* tipc_bearer_xmit_skb - sends buffer to destination over bearer */ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, struct sk_buff *skb, struct tipc_media_addr *dest) { struct tipc_msg *hdr = buf_msg(skb); struct tipc_bearer *b; rcu_read_lock(); b = bearer_get(net, bearer_id); if (likely(b && (test_bit(0, &b->up) || msg_is_reset(hdr)))) { #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_xmit(net, &skb, b, dest, NULL); if (skb) #endif b->media->send_msg(net, skb, b, dest); } else { kfree_skb(skb); } rcu_read_unlock(); } /* tipc_bearer_xmit() -send buffer to destination over bearer */ void tipc_bearer_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq, struct tipc_media_addr *dst, struct tipc_node *__dnode) { struct tipc_bearer *b; struct sk_buff *skb, *tmp; if (skb_queue_empty(xmitq)) return; rcu_read_lock(); b = bearer_get(net, bearer_id); if (unlikely(!b)) __skb_queue_purge(xmitq); skb_queue_walk_safe(xmitq, skb, tmp) { __skb_dequeue(xmitq); if (likely(test_bit(0, &b->up) || msg_is_reset(buf_msg(skb)))) { #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_xmit(net, &skb, b, dst, __dnode); if (skb) #endif b->media->send_msg(net, skb, b, dst); } else { kfree_skb(skb); } } rcu_read_unlock(); } /* tipc_bearer_bc_xmit() - broadcast buffers to all destinations */ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq) { struct tipc_net *tn = tipc_net(net); struct tipc_media_addr *dst; int net_id = tn->net_id; struct tipc_bearer *b; struct sk_buff *skb, *tmp; struct tipc_msg *hdr; rcu_read_lock(); b = bearer_get(net, bearer_id); if (unlikely(!b || !test_bit(0, &b->up))) __skb_queue_purge(xmitq); skb_queue_walk_safe(xmitq, skb, tmp) { hdr = buf_msg(skb); msg_set_non_seq(hdr, 1); msg_set_mc_netid(hdr, net_id); __skb_dequeue(xmitq); dst = &b->bcast_addr; #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_xmit(net, &skb, b, dst, NULL); if (skb) #endif b->media->send_msg(net, skb, b, dst); } rcu_read_unlock(); } /** * tipc_l2_rcv_msg - handle incoming TIPC message from an interface * @skb: the received message * @dev: the net device that the packet was received on * @pt: the packet_type structure which was used to register this handler * @orig_dev: the original receive net device in case the device is a bond * * Accept only packets explicitly sent to this node, or broadcast packets; * ignores packets sent using interface multicast, and traffic sent to other * nodes (which can happen if interface is running in promiscuous mode). */ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct tipc_bearer *b; rcu_read_lock(); b = rcu_dereference(dev->tipc_ptr) ?: rcu_dereference(orig_dev->tipc_ptr); if (likely(b && test_bit(0, &b->up) && (skb->pkt_type <= PACKET_MULTICAST))) { skb_mark_not_on_list(skb); TIPC_SKB_CB(skb)->flags = 0; tipc_rcv(dev_net(b->pt.dev), skb, b); rcu_read_unlock(); return NET_RX_SUCCESS; } rcu_read_unlock(); kfree_skb(skb); return NET_RX_DROP; } /** * tipc_l2_device_event - handle device events from network device * @nb: the context of the notification * @evt: the type of event * @ptr: the net device that the event was on * * This function is called by the Ethernet driver in case of link * change event. */ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct tipc_bearer *b; b = rtnl_dereference(dev->tipc_ptr); if (!b) return NOTIFY_DONE; trace_tipc_l2_device_event(dev, b, evt); switch (evt) { case NETDEV_CHANGE: if (netif_carrier_ok(dev) && netif_oper_up(dev)) { test_and_set_bit_lock(0, &b->up); break; } fallthrough; case NETDEV_GOING_DOWN: clear_bit_unlock(0, &b->up); tipc_reset_bearer(net, b); break; case NETDEV_UP: test_and_set_bit_lock(0, &b->up); break; case NETDEV_CHANGEMTU: if (tipc_mtu_bad(dev)) { bearer_disable(net, b); break; } b->mtu = dev->mtu; tipc_reset_bearer(net, b); break; case NETDEV_CHANGEADDR: b->media->raw2addr(b, &b->addr, (const char *)dev->dev_addr); tipc_reset_bearer(net, b); break; case NETDEV_UNREGISTER: case NETDEV_CHANGENAME: bearer_disable(net, b); break; } return NOTIFY_OK; } static struct notifier_block notifier = { .notifier_call = tipc_l2_device_event, .priority = 0, }; int tipc_bearer_setup(void) { return register_netdevice_notifier(&notifier); } void tipc_bearer_cleanup(void) { unregister_netdevice_notifier(&notifier); } void tipc_bearer_stop(struct net *net) { struct tipc_net *tn = tipc_net(net); struct tipc_bearer *b; u32 i; for (i = 0; i < MAX_BEARERS; i++) { b = rtnl_dereference(tn->bearer_list[i]); if (b) { bearer_disable(net, b); tn->bearer_list[i] = NULL; } } } void tipc_clone_to_loopback(struct net *net, struct sk_buff_head *pkts) { struct net_device *dev = net->loopback_dev; struct sk_buff *skb, *_skb; int exp; skb_queue_walk(pkts, _skb) { skb = pskb_copy(_skb, GFP_ATOMIC); if (!skb) continue; exp = SKB_DATA_ALIGN(dev->hard_header_len - skb_headroom(skb)); if (exp > 0 && pskb_expand_head(skb, exp, 0, GFP_ATOMIC)) { kfree_skb(skb); continue; } skb_reset_network_header(skb); dev_hard_header(skb, dev, ETH_P_TIPC, dev->dev_addr, dev->dev_addr, skb->len); skb->dev = dev; skb->pkt_type = PACKET_HOST; skb->ip_summed = CHECKSUM_UNNECESSARY; skb->protocol = eth_type_trans(skb, dev); netif_rx(skb); } } static int tipc_loopback_rcv_pkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *od) { consume_skb(skb); return NET_RX_SUCCESS; } int tipc_attach_loopback(struct net *net) { struct net_device *dev = net->loopback_dev; struct tipc_net *tn = tipc_net(net); if (!dev) return -ENODEV; netdev_hold(dev, &tn->loopback_pt.dev_tracker, GFP_KERNEL); tn->loopback_pt.dev = dev; tn->loopback_pt.type = htons(ETH_P_TIPC); tn->loopback_pt.func = tipc_loopback_rcv_pkt; dev_add_pack(&tn->loopback_pt); return 0; } void tipc_detach_loopback(struct net *net) { struct tipc_net *tn = tipc_net(net); dev_remove_pack(&tn->loopback_pt); netdev_put(net->loopback_dev, &tn->loopback_pt.dev_tracker); } /* Caller should hold rtnl_lock to protect the bearer */ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg, struct tipc_bearer *bearer, int nlflags) { void *hdr; struct nlattr *attrs; struct nlattr *prop; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, nlflags, TIPC_NL_BEARER_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_BEARER); if (!attrs) goto msg_full; if (nla_put_string(msg->skb, TIPC_NLA_BEARER_NAME, bearer->name)) goto attr_msg_full; prop = nla_nest_start_noflag(msg->skb, TIPC_NLA_BEARER_PROP); if (!prop) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, bearer->priority)) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, bearer->tolerance)) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->max_win)) goto prop_msg_full; if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP) if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, bearer->mtu)) goto prop_msg_full; nla_nest_end(msg->skb, prop); #ifdef CONFIG_TIPC_MEDIA_UDP if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP) { if (tipc_udp_nl_add_bearer_data(msg, bearer)) goto attr_msg_full; } #endif nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; prop_msg_full: nla_nest_cancel(msg->skb, prop); attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb) { int err; int i = cb->args[0]; struct tipc_bearer *bearer; struct tipc_nl_msg msg; struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); if (i == MAX_BEARERS) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rtnl_lock(); for (i = 0; i < MAX_BEARERS; i++) { bearer = rtnl_dereference(tn->bearer_list[i]); if (!bearer) continue; err = __tipc_nl_add_bearer(&msg, bearer, NLM_F_MULTI); if (err) break; } rtnl_unlock(); cb->args[0] = i; return skb->len; } int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info) { int err; char *name; struct sk_buff *rep; struct tipc_bearer *bearer; struct tipc_nl_msg msg; struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; struct net *net = genl_info_net(info); if (!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_BEARER_MAX, info->attrs[TIPC_NLA_BEARER], tipc_nl_bearer_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_BEARER_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_BEARER_NAME]); rep = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!rep) return -ENOMEM; msg.skb = rep; msg.portid = info->snd_portid; msg.seq = info->snd_seq; rtnl_lock(); bearer = tipc_bearer_find(net, name); if (!bearer) { err = -EINVAL; NL_SET_ERR_MSG(info->extack, "Bearer not found"); goto err_out; } err = __tipc_nl_add_bearer(&msg, bearer, 0); if (err) goto err_out; rtnl_unlock(); return genlmsg_reply(rep, info); err_out: rtnl_unlock(); nlmsg_free(rep); return err; } int __tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info) { int err; char *name; struct tipc_bearer *bearer; struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; struct net *net = sock_net(skb->sk); if (!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_BEARER_MAX, info->attrs[TIPC_NLA_BEARER], tipc_nl_bearer_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_BEARER_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_BEARER_NAME]); bearer = tipc_bearer_find(net, name); if (!bearer) { NL_SET_ERR_MSG(info->extack, "Bearer not found"); return -EINVAL; } bearer_disable(net, bearer); return 0; } int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_bearer_disable(skb, info); rtnl_unlock(); return err; } int __tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) { int err; char *bearer; struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; struct net *net = sock_net(skb->sk); u32 domain = 0; u32 prio; prio = TIPC_MEDIA_LINK_PRI; if (!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_BEARER_MAX, info->attrs[TIPC_NLA_BEARER], tipc_nl_bearer_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_BEARER_NAME]) return -EINVAL; bearer = nla_data(attrs[TIPC_NLA_BEARER_NAME]); if (attrs[TIPC_NLA_BEARER_DOMAIN]) domain = nla_get_u32(attrs[TIPC_NLA_BEARER_DOMAIN]); if (attrs[TIPC_NLA_BEARER_PROP]) { struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_BEARER_PROP], props); if (err) return err; if (props[TIPC_NLA_PROP_PRIO]) prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); } return tipc_enable_bearer(net, bearer, domain, prio, attrs, info->extack); } int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_bearer_enable(skb, info); rtnl_unlock(); return err; } int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info) { int err; char *name; struct tipc_bearer *b; struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; struct net *net = sock_net(skb->sk); if (!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_BEARER_MAX, info->attrs[TIPC_NLA_BEARER], tipc_nl_bearer_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_BEARER_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_BEARER_NAME]); rtnl_lock(); b = tipc_bearer_find(net, name); if (!b) { NL_SET_ERR_MSG(info->extack, "Bearer not found"); err = -EINVAL; goto out; } #ifdef CONFIG_TIPC_MEDIA_UDP if (attrs[TIPC_NLA_BEARER_UDP_OPTS]) { if (b->media->type_id != TIPC_MEDIA_TYPE_UDP) { NL_SET_ERR_MSG(info->extack, "UDP option is unsupported"); err = -EINVAL; goto out; } err = tipc_udp_nl_bearer_add(b, attrs[TIPC_NLA_BEARER_UDP_OPTS]); } #endif out: rtnl_unlock(); return err; } int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) { struct tipc_bearer *b; struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; struct net *net = sock_net(skb->sk); char *name; int err; if (!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_BEARER_MAX, info->attrs[TIPC_NLA_BEARER], tipc_nl_bearer_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_BEARER_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_BEARER_NAME]); b = tipc_bearer_find(net, name); if (!b) { NL_SET_ERR_MSG(info->extack, "Bearer not found"); return -EINVAL; } if (attrs[TIPC_NLA_BEARER_PROP]) { struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_BEARER_PROP], props); if (err) return err; if (props[TIPC_NLA_PROP_TOL]) { b->tolerance = nla_get_u32(props[TIPC_NLA_PROP_TOL]); tipc_node_apply_property(net, b, TIPC_NLA_PROP_TOL); } if (props[TIPC_NLA_PROP_PRIO]) b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) b->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); if (props[TIPC_NLA_PROP_MTU]) { if (b->media->type_id != TIPC_MEDIA_TYPE_UDP) { NL_SET_ERR_MSG(info->extack, "MTU property is unsupported"); return -EINVAL; } #ifdef CONFIG_TIPC_MEDIA_UDP if (nla_get_u32(props[TIPC_NLA_PROP_MTU]) < b->encap_hlen + TIPC_MIN_BEARER_MTU) { NL_SET_ERR_MSG(info->extack, "MTU value is out-of-range"); return -EINVAL; } b->mtu = nla_get_u32(props[TIPC_NLA_PROP_MTU]); tipc_node_apply_property(net, b, TIPC_NLA_PROP_MTU); #endif } } return 0; } int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_bearer_set(skb, info); rtnl_unlock(); return err; } static int __tipc_nl_add_media(struct tipc_nl_msg *msg, struct tipc_media *media, int nlflags) { void *hdr; struct nlattr *attrs; struct nlattr *prop; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, nlflags, TIPC_NL_MEDIA_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MEDIA); if (!attrs) goto msg_full; if (nla_put_string(msg->skb, TIPC_NLA_MEDIA_NAME, media->name)) goto attr_msg_full; prop = nla_nest_start_noflag(msg->skb, TIPC_NLA_MEDIA_PROP); if (!prop) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, media->priority)) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, media->tolerance)) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->max_win)) goto prop_msg_full; if (media->type_id == TIPC_MEDIA_TYPE_UDP) if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, media->mtu)) goto prop_msg_full; nla_nest_end(msg->skb, prop); nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; prop_msg_full: nla_nest_cancel(msg->skb, prop); attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb) { int err; int i = cb->args[0]; struct tipc_nl_msg msg; if (i == MAX_MEDIA) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rtnl_lock(); for (; media_info_array[i] != NULL; i++) { err = __tipc_nl_add_media(&msg, media_info_array[i], NLM_F_MULTI); if (err) break; } rtnl_unlock(); cb->args[0] = i; return skb->len; } int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info) { int err; char *name; struct tipc_nl_msg msg; struct tipc_media *media; struct sk_buff *rep; struct nlattr *attrs[TIPC_NLA_MEDIA_MAX + 1]; if (!info->attrs[TIPC_NLA_MEDIA]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_MEDIA_MAX, info->attrs[TIPC_NLA_MEDIA], tipc_nl_media_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_MEDIA_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_MEDIA_NAME]); rep = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!rep) return -ENOMEM; msg.skb = rep; msg.portid = info->snd_portid; msg.seq = info->snd_seq; rtnl_lock(); media = tipc_media_find(name); if (!media) { NL_SET_ERR_MSG(info->extack, "Media not found"); err = -EINVAL; goto err_out; } err = __tipc_nl_add_media(&msg, media, 0); if (err) goto err_out; rtnl_unlock(); return genlmsg_reply(rep, info); err_out: rtnl_unlock(); nlmsg_free(rep); return err; } int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) { int err; char *name; struct tipc_media *m; struct nlattr *attrs[TIPC_NLA_MEDIA_MAX + 1]; if (!info->attrs[TIPC_NLA_MEDIA]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_MEDIA_MAX, info->attrs[TIPC_NLA_MEDIA], tipc_nl_media_policy, info->extack); if (!attrs[TIPC_NLA_MEDIA_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_MEDIA_NAME]); m = tipc_media_find(name); if (!m) { NL_SET_ERR_MSG(info->extack, "Media not found"); return -EINVAL; } if (attrs[TIPC_NLA_MEDIA_PROP]) { struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_MEDIA_PROP], props); if (err) return err; if (props[TIPC_NLA_PROP_TOL]) m->tolerance = nla_get_u32(props[TIPC_NLA_PROP_TOL]); if (props[TIPC_NLA_PROP_PRIO]) m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) m->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); if (props[TIPC_NLA_PROP_MTU]) { if (m->type_id != TIPC_MEDIA_TYPE_UDP) { NL_SET_ERR_MSG(info->extack, "MTU property is unsupported"); return -EINVAL; } #ifdef CONFIG_TIPC_MEDIA_UDP if (tipc_udp_mtu_bad(nla_get_u32 (props[TIPC_NLA_PROP_MTU]))) { NL_SET_ERR_MSG(info->extack, "MTU value is out-of-range"); return -EINVAL; } m->mtu = nla_get_u32(props[TIPC_NLA_PROP_MTU]); #endif } } return 0; } int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_media_set(skb, info); rtnl_unlock(); return err; }
21 21 83 280 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2017 * * This file is part of the SCTP kernel implementation * * These functions implement sctp stream message interleaving, mostly * including I-DATA and I-FORWARD-TSN chunks process. * * Please send any bug reports or fixes you make to the * email addresched(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Xin Long <lucien.xin@gmail.com> */ #include <net/busy_poll.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/ulpevent.h> #include <linux/sctp.h> static struct sctp_chunk *sctp_make_idatafrag_empty( const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, int len, __u8 flags, gfp_t gfp) { struct sctp_chunk *retval; struct sctp_idatahdr dp; memset(&dp, 0, sizeof(dp)); dp.stream = htons(sinfo->sinfo_stream); if (sinfo->sinfo_flags & SCTP_UNORDERED) flags |= SCTP_DATA_UNORDERED; retval = sctp_make_idata(asoc, flags, sizeof(dp) + len, gfp); if (!retval) return NULL; retval->subh.idata_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); return retval; } static void sctp_chunk_assign_mid(struct sctp_chunk *chunk) { struct sctp_stream *stream; struct sctp_chunk *lchunk; __u32 cfsn = 0; __u16 sid; if (chunk->has_mid) return; sid = sctp_chunk_stream_no(chunk); stream = &chunk->asoc->stream; list_for_each_entry(lchunk, &chunk->msg->chunks, frag_list) { struct sctp_idatahdr *hdr; __u32 mid; lchunk->has_mid = 1; hdr = lchunk->subh.idata_hdr; if (lchunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG) hdr->ppid = lchunk->sinfo.sinfo_ppid; else hdr->fsn = htonl(cfsn++); if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ? sctp_mid_uo_next(stream, out, sid) : sctp_mid_uo_peek(stream, out, sid); } else { mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ? sctp_mid_next(stream, out, sid) : sctp_mid_peek(stream, out, sid); } hdr->mid = htonl(mid); } } static bool sctp_validate_data(struct sctp_chunk *chunk) { struct sctp_stream *stream; __u16 sid, ssn; if (chunk->chunk_hdr->type != SCTP_CID_DATA) return false; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) return true; stream = &chunk->asoc->stream; sid = sctp_chunk_stream_no(chunk); ssn = ntohs(chunk->subh.data_hdr->ssn); return !SSN_lt(ssn, sctp_ssn_peek(stream, in, sid)); } static bool sctp_validate_idata(struct sctp_chunk *chunk) { struct sctp_stream *stream; __u32 mid; __u16 sid; if (chunk->chunk_hdr->type != SCTP_CID_I_DATA) return false; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) return true; stream = &chunk->asoc->stream; sid = sctp_chunk_stream_no(chunk); mid = ntohl(chunk->subh.idata_hdr->mid); return !MID_lt(mid, sctp_mid_peek(stream, in, sid)); } static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->reasm); if (!pos) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } cevent = sctp_skb2event(pos); if (event->stream == cevent->stream && event->mid == cevent->mid && (cevent->msg_flags & SCTP_DATA_FIRST_FRAG || (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) && event->fsn > cevent->fsn))) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } if ((event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) || event->stream > cevent->stream) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } loc = NULL; skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); if (event->stream < cevent->stream || (event->stream == cevent->stream && MID_lt(event->mid, cevent->mid))) { loc = pos; break; } if (event->stream == cevent->stream && event->mid == cevent->mid && !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && (event->msg_flags & SCTP_DATA_FIRST_FRAG || event->fsn < cevent->fsn)) { loc = pos; break; } } if (!loc) __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); else __skb_queue_before(&ulpq->reasm, loc, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_intl_retrieve_partial( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sctp_stream_in *sin; struct sk_buff *pos; __u32 next_fsn = 0; int is_last = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream || cevent->mid != sin->mid) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: goto out; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn) { first_frag = pos; last_frag = pos; next_fsn = cevent->fsn + 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn++; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn) { first_frag = pos; last_frag = pos; next_fsn = 0; is_last = 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn = 0; is_last = 1; } goto out; default: goto out; } } out: if (!first_frag) return NULL; retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval) { sin->fsn = next_fsn; if (is_last) { retval->msg_flags |= MSG_EOR; sin->pd_mode = 0; } } return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_reassembled( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_association *asoc = ulpq->asoc; struct sk_buff *pos, *first_frag = NULL; struct sctp_ulpevent *retval = NULL; struct sk_buff *pd_first = NULL; struct sk_buff *pd_last = NULL; struct sctp_stream_in *sin; __u32 next_fsn = 0; __u32 pd_point = 0; __u32 pd_len = 0; __u32 mid = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, event->mid)) continue; if (MID_lt(event->mid, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (cevent->mid == sin->mid) { pd_first = pos; pd_last = pos; pd_len = pos->len; } first_frag = pos; next_fsn = 0; mid = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) { next_fsn++; if (pd_first) { pd_last = pos; pd_len += pos->len; } } else { first_frag = NULL; } break; case SCTP_DATA_LAST_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) goto found; else first_frag = NULL; break; } } if (!pd_first) goto out; pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, pd_first, pd_last); if (retval) { sin->fsn = next_fsn; sin->pd_mode = 1; } } goto out; found: retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; out: return retval; } static struct sctp_ulpevent *sctp_intl_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *retval = NULL; struct sctp_stream_in *sin; if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { event->msg_flags |= MSG_EOR; return event; } sctp_intl_store_reasm(ulpq, event); sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); if (sin->pd_mode && event->mid == sin->mid && event->fsn == sin->fsn) retval = sctp_intl_retrieve_partial(ulpq, event); if (!retval) retval = sctp_intl_retrieve_reassembled(ulpq, event); return retval; } static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->lobby); if (!pos) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } cevent = (struct sctp_ulpevent *)pos->cb; if (event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } if (event->stream > cevent->stream) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } loc = NULL; skb_queue_walk(&ulpq->lobby, pos) { cevent = (struct sctp_ulpevent *)pos->cb; if (cevent->stream > event->stream) { loc = pos; break; } if (cevent->stream == event->stream && MID_lt(event->mid, cevent->mid)) { loc = pos; break; } } if (!loc) __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); else __skb_queue_before(&ulpq->lobby, loc, sctp_event2skb(event)); } static void sctp_intl_retrieve_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head *event_list; struct sctp_stream *stream; struct sk_buff *pos, *tmp; __u16 sid = event->stream; stream = &ulpq->asoc->stream; event_list = (struct sk_buff_head *)sctp_event2skb(event)->prev; sctp_skb_for_each(pos, &ulpq->lobby, tmp) { struct sctp_ulpevent *cevent = (struct sctp_ulpevent *)pos->cb; if (cevent->stream > sid) break; if (cevent->stream < sid) continue; if (cevent->mid != sctp_mid_peek(stream, in, sid)) break; sctp_mid_next(stream, in, sid); __skb_unlink(pos, &ulpq->lobby); __skb_queue_tail(event_list, pos); } } static struct sctp_ulpevent *sctp_intl_order(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_stream *stream; __u16 sid; stream = &ulpq->asoc->stream; sid = event->stream; if (event->mid != sctp_mid_peek(stream, in, sid)) { sctp_intl_store_ordered(ulpq, event); return NULL; } sctp_mid_next(stream, in, sid); sctp_intl_retrieve_ordered(ulpq, event); return event; } static int sctp_enqueue_event(struct sctp_ulpq *ulpq, struct sk_buff_head *skb_list) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_sock *sp = sctp_sk(sk); struct sctp_ulpevent *event; struct sk_buff *skb; skb = __skb_peek(skb_list); event = sctp_skb2event(skb); if (sk->sk_shutdown & RCV_SHUTDOWN && (sk->sk_shutdown & SEND_SHUTDOWN || !sctp_ulpevent_is_notification(event))) goto out_free; if (!sctp_ulpevent_is_notification(event)) { sk_mark_napi_id(sk, skb); sk_incoming_cpu_update(sk); } if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe)) goto out_free; skb_queue_splice_tail_init(skb_list, &sk->sk_receive_queue); if (!sp->data_ready_signalled) { sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } return 1; out_free: sctp_queue_purge_ulpevents(skb_list); return 0; } static void sctp_intl_store_reasm_uo(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos; pos = skb_peek_tail(&ulpq->reasm_uo); if (!pos) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } cevent = sctp_skb2event(pos); if (event->stream == cevent->stream && event->mid == cevent->mid && (cevent->msg_flags & SCTP_DATA_FIRST_FRAG || (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) && event->fsn > cevent->fsn))) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } if ((event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) || event->stream > cevent->stream) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } skb_queue_walk(&ulpq->reasm_uo, pos) { cevent = sctp_skb2event(pos); if (event->stream < cevent->stream || (event->stream == cevent->stream && MID_lt(event->mid, cevent->mid))) break; if (event->stream == cevent->stream && event->mid == cevent->mid && !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && (event->msg_flags & SCTP_DATA_FIRST_FRAG || event->fsn < cevent->fsn)) break; } __skb_queue_before(&ulpq->reasm_uo, pos, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_intl_retrieve_partial_uo( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sctp_stream_in *sin; struct sk_buff *pos; __u32 next_fsn = 0; int is_last = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, sin->mid_uo)) continue; if (MID_lt(sin->mid_uo, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: goto out; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn_uo) { first_frag = pos; last_frag = pos; next_fsn = cevent->fsn + 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn++; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn_uo) { first_frag = pos; last_frag = pos; next_fsn = 0; is_last = 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn = 0; is_last = 1; } goto out; default: goto out; } } out: if (!first_frag) return NULL; retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { sin->fsn_uo = next_fsn; if (is_last) { retval->msg_flags |= MSG_EOR; sin->pd_mode_uo = 0; } } return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_association *asoc = ulpq->asoc; struct sk_buff *pos, *first_frag = NULL; struct sctp_ulpevent *retval = NULL; struct sk_buff *pd_first = NULL; struct sk_buff *pd_last = NULL; struct sctp_stream_in *sin; __u32 next_fsn = 0; __u32 pd_point = 0; __u32 pd_len = 0; __u32 mid = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, event->mid)) continue; if (MID_lt(event->mid, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (!sin->pd_mode_uo) { sin->mid_uo = cevent->mid; pd_first = pos; pd_last = pos; pd_len = pos->len; } first_frag = pos; next_fsn = 0; mid = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) { next_fsn++; if (pd_first) { pd_last = pos; pd_len += pos->len; } } else { first_frag = NULL; } break; case SCTP_DATA_LAST_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) goto found; else first_frag = NULL; break; } } if (!pd_first) goto out; pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, pd_first, pd_last); if (retval) { sin->fsn_uo = next_fsn; sin->pd_mode_uo = 1; } } goto out; found: retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; out: return retval; } static struct sctp_ulpevent *sctp_intl_reasm_uo(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *retval = NULL; struct sctp_stream_in *sin; if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { event->msg_flags |= MSG_EOR; return event; } sctp_intl_store_reasm_uo(ulpq, event); sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); if (sin->pd_mode_uo && event->mid == sin->mid_uo && event->fsn == sin->fsn_uo) retval = sctp_intl_retrieve_partial_uo(ulpq, event); if (!retval) retval = sctp_intl_retrieve_reassembled_uo(ulpq, event); return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq) { struct sctp_stream_in *csin, *sin = NULL; struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sk_buff *pos; __u32 next_fsn = 0; __u16 sid = 0; skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream); if (csin->pd_mode_uo) continue; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (first_frag) goto out; first_frag = pos; last_frag = pos; next_fsn = 0; sin = csin; sid = cevent->stream; sin->mid_uo = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) break; if (cevent->stream == sid && cevent->mid == sin->mid_uo && cevent->fsn == next_fsn) { next_fsn++; last_frag = pos; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (first_frag) goto out; break; default: break; } } if (!first_frag) return NULL; out: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { sin->fsn_uo = next_fsn; sin->pd_mode_uo = 1; } return retval; } static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_ulpevent *event; struct sk_buff_head temp; int event_eor = 0; event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp); if (!event) return -ENOMEM; event->mid = ntohl(chunk->subh.idata_hdr->mid); if (event->msg_flags & SCTP_DATA_FIRST_FRAG) event->ppid = chunk->subh.idata_hdr->ppid; else event->fsn = ntohl(chunk->subh.idata_hdr->fsn); if (!(event->msg_flags & SCTP_DATA_UNORDERED)) { event = sctp_intl_reasm(ulpq, event); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); if (event->msg_flags & MSG_EOR) event = sctp_intl_order(ulpq, event); } } else { event = sctp_intl_reasm_uo(ulpq, event); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); } } if (event) { event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0; sctp_enqueue_event(ulpq, &temp); } return event_eor; } static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq) { struct sctp_stream_in *csin, *sin = NULL; struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sk_buff *pos; __u32 next_fsn = 0; __u16 sid = 0; skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream); if (csin->pd_mode) continue; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (first_frag) goto out; if (cevent->mid == csin->mid) { first_frag = pos; last_frag = pos; next_fsn = 0; sin = csin; sid = cevent->stream; } break; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) break; if (cevent->stream == sid && cevent->mid == sin->mid && cevent->fsn == next_fsn) { next_fsn++; last_frag = pos; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (first_frag) goto out; break; default: break; } } if (!first_frag) return NULL; out: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval) { sin->fsn = next_fsn; sin->pd_mode = 1; } return retval; } static void sctp_intl_start_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_ulpevent *event; struct sk_buff_head temp; if (!skb_queue_empty(&ulpq->reasm)) { do { event = sctp_intl_retrieve_first(ulpq); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); sctp_enqueue_event(ulpq, &temp); } } while (event); } if (!skb_queue_empty(&ulpq->reasm_uo)) { do { event = sctp_intl_retrieve_first_uo(ulpq); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); sctp_enqueue_event(ulpq, &temp); } } while (event); } } static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_association *asoc = ulpq->asoc; __u32 freed = 0; __u16 needed; needed = ntohs(chunk->chunk_hdr->length) - sizeof(struct sctp_idata_chunk); if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) { freed = sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed); if (freed < needed) freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm, needed); if (freed < needed) freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm_uo, needed); } if (freed >= needed && sctp_ulpevent_idata(ulpq, chunk, gfp) <= 0) sctp_intl_start_pd(ulpq, gfp); } static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, __u16 flags, gfp_t gfp) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_ulpevent *ev = NULL; if (!sctp_ulpevent_type_enabled(ulpq->asoc->subscribe, SCTP_PARTIAL_DELIVERY_EVENT)) return; ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, sid, mid, flags, gfp); if (ev) { struct sctp_sock *sp = sctp_sk(sk); __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); if (!sp->data_ready_signalled) { sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } } } static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) { struct sctp_stream *stream = &ulpq->asoc->stream; struct sctp_ulpevent *cevent, *event = NULL; struct sk_buff_head *lobby = &ulpq->lobby; struct sk_buff *pos, *tmp; struct sk_buff_head temp; __u16 csid; __u32 cmid; skb_queue_head_init(&temp); sctp_skb_for_each(pos, lobby, tmp) { cevent = (struct sctp_ulpevent *)pos->cb; csid = cevent->stream; cmid = cevent->mid; if (csid > sid) break; if (csid < sid) continue; if (!MID_lt(cmid, sctp_mid_peek(stream, in, csid))) break; __skb_unlink(pos, lobby); if (!event) event = sctp_skb2event(pos); __skb_queue_tail(&temp, pos); } if (!event && pos != (struct sk_buff *)lobby) { cevent = (struct sctp_ulpevent *)pos->cb; csid = cevent->stream; cmid = cevent->mid; if (csid == sid && cmid == sctp_mid_peek(stream, in, csid)) { sctp_mid_next(stream, in, csid); __skb_unlink(pos, lobby); __skb_queue_tail(&temp, pos); event = sctp_skb2event(pos); } } if (event) { sctp_intl_retrieve_ordered(ulpq, event); sctp_enqueue_event(ulpq, &temp); } } static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_stream *stream = &ulpq->asoc->stream; __u16 sid; for (sid = 0; sid < stream->incnt; sid++) { struct sctp_stream_in *sin = SCTP_SI(stream, sid); __u32 mid; if (sin->pd_mode_uo) { sin->pd_mode_uo = 0; mid = sin->mid_uo; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, gfp); } if (sin->pd_mode) { sin->pd_mode = 0; mid = sin->mid; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0, gfp); sctp_mid_skip(stream, in, sid, mid); sctp_intl_reap_ordered(ulpq, sid); } } /* intl abort pd happens only when all data needs to be cleaned */ sctp_ulpq_flush(ulpq); } static inline int sctp_get_skip_pos(struct sctp_ifwdtsn_skip *skiplist, int nskips, __be16 stream, __u8 flags) { int i; for (i = 0; i < nskips; i++) if (skiplist[i].stream == stream && skiplist[i].flags == flags) return i; return i; } #define SCTP_FTSN_U_BIT 0x1 static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn) { struct sctp_ifwdtsn_skip ftsn_skip_arr[10]; struct sctp_association *asoc = q->asoc; struct sctp_chunk *ftsn_chunk = NULL; struct list_head *lchunk, *temp; int nskips = 0, skip_pos; struct sctp_chunk *chunk; __u32 tsn; if (!asoc->peer.prsctp_capable) return; if (TSN_lt(asoc->adv_peer_ack_point, ctsn)) asoc->adv_peer_ack_point = ctsn; list_for_each_safe(lchunk, temp, &q->abandoned) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); tsn = ntohl(chunk->subh.data_hdr->tsn); if (TSN_lte(tsn, ctsn)) { list_del_init(lchunk); sctp_chunk_free(chunk); } else if (TSN_lte(tsn, asoc->adv_peer_ack_point + 1)) { __be16 sid = chunk->subh.idata_hdr->stream; __be32 mid = chunk->subh.idata_hdr->mid; __u8 flags = 0; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) flags |= SCTP_FTSN_U_BIT; asoc->adv_peer_ack_point = tsn; skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], nskips, sid, flags); ftsn_skip_arr[skip_pos].stream = sid; ftsn_skip_arr[skip_pos].reserved = 0; ftsn_skip_arr[skip_pos].flags = flags; ftsn_skip_arr[skip_pos].mid = mid; if (skip_pos == nskips) nskips++; if (nskips == 10) break; } else { break; } } if (asoc->adv_peer_ack_point > ctsn) ftsn_chunk = sctp_make_ifwdtsn(asoc, asoc->adv_peer_ack_point, nskips, &ftsn_skip_arr[0]); if (ftsn_chunk) { list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); } } #define _sctp_walk_ifwdtsn(pos, chunk, end) \ for (pos = (void *)(chunk->subh.ifwdtsn_hdr + 1); \ (void *)pos <= (void *)(chunk->subh.ifwdtsn_hdr + 1) + (end) - \ sizeof(struct sctp_ifwdtsn_skip); pos++) #define sctp_walk_ifwdtsn(pos, ch) \ _sctp_walk_ifwdtsn((pos), (ch), ntohs((ch)->chunk_hdr->length) - \ sizeof(struct sctp_ifwdtsn_chunk)) static bool sctp_validate_fwdtsn(struct sctp_chunk *chunk) { struct sctp_fwdtsn_skip *skip; __u16 incnt; if (chunk->chunk_hdr->type != SCTP_CID_FWD_TSN) return false; incnt = chunk->asoc->stream.incnt; sctp_walk_fwdtsn(skip, chunk) if (ntohs(skip->stream) >= incnt) return false; return true; } static bool sctp_validate_iftsn(struct sctp_chunk *chunk) { struct sctp_ifwdtsn_skip *skip; __u16 incnt; if (chunk->chunk_hdr->type != SCTP_CID_I_FWD_TSN) return false; incnt = chunk->asoc->stream.incnt; sctp_walk_ifwdtsn(skip, chunk) if (ntohs(skip->stream) >= incnt) return false; return true; } static void sctp_report_fwdtsn(struct sctp_ulpq *ulpq, __u32 ftsn) { /* Move the Cumulattive TSN Ack ahead. */ sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn); /* purge the fragmentation queue */ sctp_ulpq_reasm_flushtsn(ulpq, ftsn); /* Abort any in progress partial delivery. */ sctp_ulpq_abort_pd(ulpq, GFP_ATOMIC); } static void sctp_intl_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 ftsn) { struct sk_buff *pos, *tmp; skb_queue_walk_safe(&ulpq->reasm, pos, tmp) { struct sctp_ulpevent *event = sctp_skb2event(pos); __u32 tsn = event->tsn; if (TSN_lte(tsn, ftsn)) { __skb_unlink(pos, &ulpq->reasm); sctp_ulpevent_free(event); } } skb_queue_walk_safe(&ulpq->reasm_uo, pos, tmp) { struct sctp_ulpevent *event = sctp_skb2event(pos); __u32 tsn = event->tsn; if (TSN_lte(tsn, ftsn)) { __skb_unlink(pos, &ulpq->reasm_uo); sctp_ulpevent_free(event); } } } static void sctp_report_iftsn(struct sctp_ulpq *ulpq, __u32 ftsn) { /* Move the Cumulattive TSN Ack ahead. */ sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn); /* purge the fragmentation queue */ sctp_intl_reasm_flushtsn(ulpq, ftsn); /* abort only when it's for all data */ if (ftsn == sctp_tsnmap_get_max_tsn_seen(&ulpq->asoc->peer.tsn_map)) sctp_intl_abort_pd(ulpq, GFP_ATOMIC); } static void sctp_handle_fwdtsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) { struct sctp_fwdtsn_skip *skip; /* Walk through all the skipped SSNs */ sctp_walk_fwdtsn(skip, chunk) sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn)); } static void sctp_intl_skip(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, __u8 flags) { struct sctp_stream_in *sin = sctp_stream_in(&ulpq->asoc->stream, sid); struct sctp_stream *stream = &ulpq->asoc->stream; if (flags & SCTP_FTSN_U_BIT) { if (sin->pd_mode_uo && MID_lt(sin->mid_uo, mid)) { sin->pd_mode_uo = 0; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, GFP_ATOMIC); } return; } if (MID_lt(mid, sctp_mid_peek(stream, in, sid))) return; if (sin->pd_mode) { sin->pd_mode = 0; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x0, GFP_ATOMIC); } sctp_mid_skip(stream, in, sid, mid); sctp_intl_reap_ordered(ulpq, sid); } static void sctp_handle_iftsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) { struct sctp_ifwdtsn_skip *skip; /* Walk through all the skipped MIDs and abort stream pd if possible */ sctp_walk_ifwdtsn(skip, chunk) sctp_intl_skip(ulpq, ntohs(skip->stream), ntohl(skip->mid), skip->flags); } static int do_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head temp; skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); return sctp_ulpq_tail_event(ulpq, &temp); } static struct sctp_stream_interleave sctp_stream_interleave_0 = { .data_chunk_len = sizeof(struct sctp_data_chunk), .ftsn_chunk_len = sizeof(struct sctp_fwdtsn_chunk), /* DATA process functions */ .make_datafrag = sctp_make_datafrag_empty, .assign_number = sctp_chunk_assign_ssn, .validate_data = sctp_validate_data, .ulpevent_data = sctp_ulpq_tail_data, .enqueue_event = do_ulpq_tail_event, .renege_events = sctp_ulpq_renege, .start_pd = sctp_ulpq_partial_delivery, .abort_pd = sctp_ulpq_abort_pd, /* FORWARD-TSN process functions */ .generate_ftsn = sctp_generate_fwdtsn, .validate_ftsn = sctp_validate_fwdtsn, .report_ftsn = sctp_report_fwdtsn, .handle_ftsn = sctp_handle_fwdtsn, }; static int do_sctp_enqueue_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head temp; skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); return sctp_enqueue_event(ulpq, &temp); } static struct sctp_stream_interleave sctp_stream_interleave_1 = { .data_chunk_len = sizeof(struct sctp_idata_chunk), .ftsn_chunk_len = sizeof(struct sctp_ifwdtsn_chunk), /* I-DATA process functions */ .make_datafrag = sctp_make_idatafrag_empty, .assign_number = sctp_chunk_assign_mid, .validate_data = sctp_validate_idata, .ulpevent_data = sctp_ulpevent_idata, .enqueue_event = do_sctp_enqueue_event, .renege_events = sctp_renege_events, .start_pd = sctp_intl_start_pd, .abort_pd = sctp_intl_abort_pd, /* I-FORWARD-TSN process functions */ .generate_ftsn = sctp_generate_iftsn, .validate_ftsn = sctp_validate_iftsn, .report_ftsn = sctp_report_iftsn, .handle_ftsn = sctp_handle_iftsn, }; void sctp_stream_interleave_init(struct sctp_stream *stream) { struct sctp_association *asoc; asoc = container_of(stream, struct sctp_association, stream); stream->si = asoc->peer.intl_capable ? &sctp_stream_interleave_1 : &sctp_stream_interleave_0; }
8 8 6 7 6 6 6 7 6 3 3 18 4 7 7 6 3 7 2 18 18 9 9 8 1 9 9 4 4 4 8 5 5 5 4 1 10 10 8 4 4 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Linus Lüssing */ #include "multicast.h" #include "main.h" #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/icmpv6.h> #include <linux/if_bridge.h> #include <linux/if_ether.h> #include <linux/igmp.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/jiffies.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/sprintf.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/addrconf.h> #include <net/genetlink.h> #include <net/if_inet6.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "netlink.h" #include "send.h" #include "translation-table.h" #include "tvlv.h" static void batadv_mcast_mla_update(struct work_struct *work); /** * batadv_mcast_start_timer() - schedule the multicast periodic worker * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_mcast_start_timer(struct batadv_priv *bat_priv) { queue_delayed_work(batadv_event_workqueue, &bat_priv->mcast.work, msecs_to_jiffies(BATADV_MCAST_WORK_PERIOD)); } /** * batadv_mcast_get_bridge() - get the bridge on top of the meshif if it exists * @mesh_iface: netdev struct of the mesh interface * * If the given mesh interface has a bridge on top then the refcount * of the according net device is increased. * * Return: NULL if no such bridge exists. Otherwise the net device of the * bridge. */ static struct net_device *batadv_mcast_get_bridge(struct net_device *mesh_iface) { struct net_device *upper = mesh_iface; rcu_read_lock(); do { upper = netdev_master_upper_dev_get_rcu(upper); } while (upper && !netif_is_bridge_master(upper)); dev_hold(upper); rcu_read_unlock(); return upper; } /** * batadv_mcast_mla_rtr_flags_meshif_get_ipv4() - get mcast router flags from * node for IPv4 * @dev: the interface to check * * Checks the presence of an IPv4 multicast router on this node. * * Caller needs to hold rcu read lock. * * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR4 otherwise. */ static u8 batadv_mcast_mla_rtr_flags_meshif_get_ipv4(struct net_device *dev) { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev && IN_DEV_MFORWARD(in_dev)) return BATADV_NO_FLAGS; else return BATADV_MCAST_WANT_NO_RTR4; } /** * batadv_mcast_mla_rtr_flags_meshif_get_ipv6() - get mcast router flags from * node for IPv6 * @dev: the interface to check * * Checks the presence of an IPv6 multicast router on this node. * * Caller needs to hold rcu read lock. * * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR6 otherwise. */ #if IS_ENABLED(CONFIG_IPV6_MROUTE) static u8 batadv_mcast_mla_rtr_flags_meshif_get_ipv6(struct net_device *dev) { struct inet6_dev *in6_dev = __in6_dev_get(dev); if (in6_dev && atomic_read(&in6_dev->cnf.mc_forwarding)) return BATADV_NO_FLAGS; else return BATADV_MCAST_WANT_NO_RTR6; } #else static inline u8 batadv_mcast_mla_rtr_flags_meshif_get_ipv6(struct net_device *dev) { return BATADV_MCAST_WANT_NO_RTR6; } #endif /** * batadv_mcast_mla_rtr_flags_meshif_get() - get mcast router flags from node * @bat_priv: the bat priv with all the mesh interface information * @bridge: bridge interface on top of the mesh_iface if present, * otherwise pass NULL * * Checks the presence of IPv4 and IPv6 multicast routers on this * node. * * Return: * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ static u8 batadv_mcast_mla_rtr_flags_meshif_get(struct batadv_priv *bat_priv, struct net_device *bridge) { struct net_device *dev = bridge ? bridge : bat_priv->mesh_iface; u8 flags = BATADV_NO_FLAGS; rcu_read_lock(); flags |= batadv_mcast_mla_rtr_flags_meshif_get_ipv4(dev); flags |= batadv_mcast_mla_rtr_flags_meshif_get_ipv6(dev); rcu_read_unlock(); return flags; } /** * batadv_mcast_mla_rtr_flags_bridge_get() - get mcast router flags from bridge * @bat_priv: the bat priv with all the mesh interface information * @bridge: bridge interface on top of the mesh_iface if present, * otherwise pass NULL * * Checks the presence of IPv4 and IPv6 multicast routers behind a bridge. * * Return: * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, struct net_device *bridge) { struct net_device *dev = bat_priv->mesh_iface; u8 flags = BATADV_NO_FLAGS; if (!bridge) return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; if (!br_multicast_has_router_adjacent(dev, ETH_P_IP)) flags |= BATADV_MCAST_WANT_NO_RTR4; if (!br_multicast_has_router_adjacent(dev, ETH_P_IPV6)) flags |= BATADV_MCAST_WANT_NO_RTR6; return flags; } /** * batadv_mcast_mla_rtr_flags_get() - get multicast router flags * @bat_priv: the bat priv with all the mesh interface information * @bridge: bridge interface on top of the mesh_iface if present, * otherwise pass NULL * * Checks the presence of IPv4 and IPv6 multicast routers on this * node or behind its bridge. * * Return: * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ static u8 batadv_mcast_mla_rtr_flags_get(struct batadv_priv *bat_priv, struct net_device *bridge) { u8 flags = BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; flags &= batadv_mcast_mla_rtr_flags_meshif_get(bat_priv, bridge); flags &= batadv_mcast_mla_rtr_flags_bridge_get(bat_priv, bridge); return flags; } /** * batadv_mcast_mla_forw_flags_get() - get multicast forwarding flags * @bat_priv: the bat priv with all the mesh interface information * * Checks if all active hard interfaces have an MTU larger or equal to 1280 * bytes (IPv6 minimum MTU). * * Return: BATADV_MCAST_HAVE_MC_PTYPE_CAPA if yes, BATADV_NO_FLAGS otherwise. */ static u8 batadv_mcast_mla_forw_flags_get(struct batadv_priv *bat_priv) { const struct batadv_hard_iface *hard_iface; struct list_head *iter; rcu_read_lock(); netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (hard_iface->net_dev->mtu < IPV6_MIN_MTU) { rcu_read_unlock(); return BATADV_NO_FLAGS; } } rcu_read_unlock(); return BATADV_MCAST_HAVE_MC_PTYPE_CAPA; } /** * batadv_mcast_mla_flags_get() - get the new multicast flags * @bat_priv: the bat priv with all the mesh interface information * * Return: A set of flags for the current/next TVLV, querier and * bridge state. */ static struct batadv_mcast_mla_flags batadv_mcast_mla_flags_get(struct batadv_priv *bat_priv) { struct net_device *dev = bat_priv->mesh_iface; struct batadv_mcast_querier_state *qr4, *qr6; struct batadv_mcast_mla_flags mla_flags; struct net_device *bridge; bridge = batadv_mcast_get_bridge(dev); memset(&mla_flags, 0, sizeof(mla_flags)); mla_flags.enabled = 1; mla_flags.tvlv_flags |= batadv_mcast_mla_rtr_flags_get(bat_priv, bridge); mla_flags.tvlv_flags |= batadv_mcast_mla_forw_flags_get(bat_priv); if (!bridge) return mla_flags; dev_put(bridge); mla_flags.bridged = 1; qr4 = &mla_flags.querier_ipv4; qr6 = &mla_flags.querier_ipv6; if (!IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING)) pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n"); qr4->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP); qr4->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP); qr6->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IPV6); qr6->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IPV6); mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_UNSNOOPABLES; /* 1) If no querier exists at all, then multicast listeners on * our local TT clients behind the bridge will keep silent. * 2) If the selected querier is on one of our local TT clients, * behind the bridge, then this querier might shadow multicast * listeners on our local TT clients, behind this bridge. * * In both cases, we will signalize other batman nodes that * we need all multicast traffic of the according protocol. */ if (!qr4->exists || qr4->shadowing) { mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV4; mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR4; } if (!qr6->exists || qr6->shadowing) { mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV6; mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR6; } return mla_flags; } /** * batadv_mcast_mla_is_duplicate() - check whether an address is in a list * @mcast_addr: the multicast address to check * @mcast_list: the list with multicast addresses to search in * * Return: true if the given address is already in the given list. * Otherwise returns false. */ static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; hlist_for_each_entry(mcast_entry, mcast_list, list) if (batadv_compare_eth(mcast_entry->addr, mcast_addr)) return true; return false; } /** * batadv_mcast_mla_meshif_get_ipv4() - get meshif IPv4 multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of IPv4 multicast listeners residing * on this kernel on the given mesh interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ static int batadv_mcast_mla_meshif_get_ipv4(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct batadv_hw_addr *new; struct in_device *in_dev; u8 mcast_addr[ETH_ALEN]; struct ip_mc_list *pmc; int ret = 0; if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4) return 0; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return 0; } for (pmc = rcu_dereference(in_dev->mc_list); pmc; pmc = rcu_dereference(pmc->next_rcu)) { if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv4_is_local_multicast(pmc->multiaddr)) continue; if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) && !ipv4_is_local_multicast(pmc->multiaddr)) continue; ip_eth_mc_map(pmc->multiaddr, mcast_addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; new = kmalloc_obj(*new, GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; } ether_addr_copy(new->addr, mcast_addr); hlist_add_head(&new->list, mcast_list); ret++; } rcu_read_unlock(); return ret; } /** * batadv_mcast_mla_meshif_get_ipv6() - get meshif IPv6 multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of IPv6 multicast listeners residing * on this kernel on the given mesh interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ #if IS_ENABLED(CONFIG_IPV6) static int batadv_mcast_mla_meshif_get_ipv6(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct batadv_hw_addr *new; struct inet6_dev *in6_dev; u8 mcast_addr[ETH_ALEN]; struct ifmcaddr6 *pmc6; int ret = 0; if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6) return 0; rcu_read_lock(); in6_dev = __in6_dev_get(dev); if (!in6_dev) { rcu_read_unlock(); return 0; } for (pmc6 = rcu_dereference(in6_dev->mc_list); pmc6; pmc6 = rcu_dereference(pmc6->next)) { if (IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) continue; if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv6_addr_is_ll_all_nodes(&pmc6->mca_addr)) continue; if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) && IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) > IPV6_ADDR_SCOPE_LINKLOCAL) continue; ipv6_eth_mc_map(&pmc6->mca_addr, mcast_addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; new = kmalloc_obj(*new, GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; } ether_addr_copy(new->addr, mcast_addr); hlist_add_head(&new->list, mcast_list); ret++; } rcu_read_unlock(); return ret; } #else static inline int batadv_mcast_mla_meshif_get_ipv6(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { return 0; } #endif /** * batadv_mcast_mla_meshif_get() - get meshif multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of multicast listeners residing * on this kernel on the given mesh interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * * If there is a bridge interface on top of dev, collect from that one * instead. Just like with IP addresses and routes, multicast listeners * will(/should) register to the bridge interface instead of an * enslaved bat0. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ static int batadv_mcast_mla_meshif_get(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct net_device *bridge = batadv_mcast_get_bridge(dev); int ret4, ret6 = 0; if (bridge) dev = bridge; ret4 = batadv_mcast_mla_meshif_get_ipv4(dev, mcast_list, flags); if (ret4 < 0) goto out; ret6 = batadv_mcast_mla_meshif_get_ipv6(dev, mcast_list, flags); if (ret6 < 0) { ret4 = 0; goto out; } out: dev_put(bridge); return ret4 + ret6; } /** * batadv_mcast_mla_br_addr_cpy() - copy a bridge multicast address * @dst: destination to write to - a multicast MAC address * @src: source to read from - a multicast IP address * * Converts a given multicast IPv4/IPv6 address from a bridge * to its matching multicast MAC address and copies it into the given * destination buffer. * * Caller needs to make sure the destination buffer can hold * at least ETH_ALEN bytes. */ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) { if (src->proto == htons(ETH_P_IP)) ip_eth_mc_map(src->dst.ip4, dst); #if IS_ENABLED(CONFIG_IPV6) else if (src->proto == htons(ETH_P_IPV6)) ipv6_eth_mc_map(&src->dst.ip6, dst); #endif else eth_zero_addr(dst); } /** * batadv_mcast_mla_bridge_get() - get bridged-in multicast listeners * @dev: a bridge slave whose bridge to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of multicast listeners residing * on foreign, non-mesh devices which we gave access to our mesh via * a bridge on top of the given mesh interface, dev, in the given * mcast_list. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ static int batadv_mcast_mla_bridge_get(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list); struct br_ip_list *br_ip_entry, *tmp; u8 tvlv_flags = flags->tvlv_flags; struct batadv_hw_addr *new; u8 mcast_addr[ETH_ALEN]; int ret; /* we don't need to detect these devices/listeners, the IGMP/MLD * snooping code of the Linux bridge already does that for us */ ret = br_multicast_list_adjacent(dev, &bridge_mcast_list); if (ret < 0) goto out; list_for_each_entry(br_ip_entry, &bridge_mcast_list, list) { if (br_ip_entry->addr.proto == htons(ETH_P_IP)) { if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4) continue; if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv4_is_local_multicast(br_ip_entry->addr.dst.ip4)) continue; if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) && !ipv4_is_local_multicast(br_ip_entry->addr.dst.ip4)) continue; } #if IS_ENABLED(CONFIG_IPV6) if (br_ip_entry->addr.proto == htons(ETH_P_IPV6)) { if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6) continue; if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv6_addr_is_ll_all_nodes(&br_ip_entry->addr.dst.ip6)) continue; if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) && IPV6_ADDR_MC_SCOPE(&br_ip_entry->addr.dst.ip6) > IPV6_ADDR_SCOPE_LINKLOCAL) continue; } #endif batadv_mcast_mla_br_addr_cpy(mcast_addr, &br_ip_entry->addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; new = kmalloc_obj(*new, GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; } ether_addr_copy(new->addr, mcast_addr); hlist_add_head(&new->list, mcast_list); } out: list_for_each_entry_safe(br_ip_entry, tmp, &bridge_mcast_list, list) { list_del(&br_ip_entry->list); kfree(br_ip_entry); } return ret; } /** * batadv_mcast_mla_list_free() - free a list of multicast addresses * @mcast_list: the list to free * * Removes and frees all items in the given mcast_list. */ static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) { hlist_del(&mcast_entry->list); kfree(mcast_entry); } } /** * batadv_mcast_mla_tt_retract() - clean up multicast listener announcements * @bat_priv: the bat priv with all the mesh interface information * @mcast_list: a list of addresses which should _not_ be removed * * Retracts the announcement of any multicast listener from the * translation table except the ones listed in the given mcast_list. * * If mcast_list is NULL then all are retracted. */ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list, list) { if (mcast_list && batadv_mcast_mla_is_duplicate(mcast_entry->addr, mcast_list)) continue; batadv_tt_local_remove(bat_priv, mcast_entry->addr, BATADV_NO_FLAGS, "mcast TT outdated", false); hlist_del(&mcast_entry->list); kfree(mcast_entry); } } /** * batadv_mcast_mla_tt_add() - add multicast listener announcements * @bat_priv: the bat priv with all the mesh interface information * @mcast_list: a list of addresses which are going to get added * * Adds multicast listener announcements from the given mcast_list to the * translation table if they have not been added yet. */ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; if (!mcast_list) return; hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) { if (batadv_mcast_mla_is_duplicate(mcast_entry->addr, &bat_priv->mcast.mla_list)) continue; if (!batadv_tt_local_add(bat_priv->mesh_iface, mcast_entry->addr, BATADV_NO_FLAGS, BATADV_NULL_IFINDEX, BATADV_NO_MARK)) continue; hlist_del(&mcast_entry->list); hlist_add_head(&mcast_entry->list, &bat_priv->mcast.mla_list); } } /** * batadv_mcast_querier_log() - debug output regarding the querier status on * link * @bat_priv: the bat priv with all the mesh interface information * @str_proto: a string for the querier protocol (e.g. "IGMP" or "MLD") * @old_state: the previous querier state on our link * @new_state: the new querier state on our link * * Outputs debug messages to the logging facility with log level 'mcast' * regarding changes to the querier status on the link which are relevant * to our multicast optimizations. * * Usually this is about whether a querier appeared or vanished in * our mesh or whether the querier is in the suboptimal position of being * behind our local bridge segment: Snooping switches will directly * forward listener reports to the querier, therefore batman-adv and * the bridge will potentially not see these listeners - the querier is * potentially shadowing listeners from us then. * * This is only interesting for nodes with a bridge on top of their * mesh interface. */ static void batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, struct batadv_mcast_querier_state *old_state, struct batadv_mcast_querier_state *new_state) { if (!old_state->exists && new_state->exists) batadv_info(bat_priv->mesh_iface, "%s Querier appeared\n", str_proto); else if (old_state->exists && !new_state->exists) batadv_info(bat_priv->mesh_iface, "%s Querier disappeared - multicast optimizations disabled\n", str_proto); else if (!bat_priv->mcast.mla_flags.bridged && !new_state->exists) batadv_info(bat_priv->mesh_iface, "No %s Querier present - multicast optimizations disabled\n", str_proto); if (new_state->exists) { if ((!old_state->shadowing && new_state->shadowing) || (!old_state->exists && new_state->shadowing)) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "%s Querier is behind our bridged segment: Might shadow listeners\n", str_proto); else if (old_state->shadowing && !new_state->shadowing) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "%s Querier is not behind our bridged segment\n", str_proto); } } /** * batadv_mcast_bridge_log() - debug output for topology changes in bridged * setups * @bat_priv: the bat priv with all the mesh interface information * @new_flags: flags indicating the new multicast state * * If no bridges are ever used on this node, then this function does nothing. * * Otherwise this function outputs debug information to the 'mcast' log level * which might be relevant to our multicast optimizations. * * More precisely, it outputs information when a bridge interface is added or * removed from a mesh interface. And when a bridge is present, it further * outputs information about the querier state which is relevant for the * multicast flags this node is going to set. */ static void batadv_mcast_bridge_log(struct batadv_priv *bat_priv, struct batadv_mcast_mla_flags *new_flags) { struct batadv_mcast_mla_flags *old_flags = &bat_priv->mcast.mla_flags; if (!old_flags->bridged && new_flags->bridged) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Bridge added: Setting Unsnoopables(U)-flag\n"); else if (old_flags->bridged && !new_flags->bridged) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Bridge removed: Unsetting Unsnoopables(U)-flag\n"); if (new_flags->bridged) { batadv_mcast_querier_log(bat_priv, "IGMP", &old_flags->querier_ipv4, &new_flags->querier_ipv4); batadv_mcast_querier_log(bat_priv, "MLD", &old_flags->querier_ipv6, &new_flags->querier_ipv6); } } /** * batadv_mcast_flags_log() - output debug information about mcast flag changes * @bat_priv: the bat priv with all the mesh interface information * @flags: TVLV flags indicating the new multicast state * * Whenever the multicast TVLV flags this node announces change, this function * should be used to notify userspace about the change. */ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) { bool old_enabled = bat_priv->mcast.mla_flags.enabled; u8 old_flags = bat_priv->mcast.mla_flags.tvlv_flags; char str_old_flags[] = "[.... . .]"; sprintf(str_old_flags, "[%c%c%c%s%s%c]", (old_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (old_flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', !(old_flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", !(old_flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ", !(old_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) ? 'P' : '.'); batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Changing multicast flags from '%s' to '[%c%c%c%s%s%c]'\n", old_enabled ? str_old_flags : "<undefined>", (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ", !(flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) ? 'P' : '.'); } /** * batadv_mcast_mla_flags_update() - update multicast flags * @bat_priv: the bat priv with all the mesh interface information * @flags: flags indicating the new multicast state * * Updates the own multicast tvlv with our current multicast related settings, * capabilities and inabilities. */ static void batadv_mcast_mla_flags_update(struct batadv_priv *bat_priv, struct batadv_mcast_mla_flags *flags) { struct batadv_tvlv_mcast_data mcast_data; if (!memcmp(flags, &bat_priv->mcast.mla_flags, sizeof(*flags))) return; batadv_mcast_bridge_log(bat_priv, flags); batadv_mcast_flags_log(bat_priv, flags->tvlv_flags); mcast_data.flags = flags->tvlv_flags; memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved)); batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2, &mcast_data, sizeof(mcast_data)); bat_priv->mcast.mla_flags = *flags; } /** * __batadv_mcast_mla_update() - update the own MLAs * @bat_priv: the bat priv with all the mesh interface information * * Updates the own multicast listener announcements in the translation * table as well as the own, announced multicast tvlv container. * * Note that non-conflicting reads and writes to bat_priv->mcast.mla_list * in batadv_mcast_mla_tt_retract() and batadv_mcast_mla_tt_add() are * ensured by the non-parallel execution of the worker this function * belongs to. */ static void __batadv_mcast_mla_update(struct batadv_priv *bat_priv) { struct net_device *mesh_iface = bat_priv->mesh_iface; struct hlist_head mcast_list = HLIST_HEAD_INIT; struct batadv_mcast_mla_flags flags; int ret; flags = batadv_mcast_mla_flags_get(bat_priv); ret = batadv_mcast_mla_meshif_get(mesh_iface, &mcast_list, &flags); if (ret < 0) goto out; ret = batadv_mcast_mla_bridge_get(mesh_iface, &mcast_list, &flags); if (ret < 0) goto out; spin_lock(&bat_priv->mcast.mla_lock); batadv_mcast_mla_tt_retract(bat_priv, &mcast_list); batadv_mcast_mla_tt_add(bat_priv, &mcast_list); batadv_mcast_mla_flags_update(bat_priv, &flags); spin_unlock(&bat_priv->mcast.mla_lock); out: batadv_mcast_mla_list_free(&mcast_list); } /** * batadv_mcast_mla_update() - update the own MLAs * @work: kernel work struct * * Updates the own multicast listener announcements in the translation * table as well as the own, announced multicast tvlv container. * * In the end, reschedules the work timer. */ static void batadv_mcast_mla_update(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_priv_mcast *priv_mcast; struct batadv_priv *bat_priv; delayed_work = to_delayed_work(work); priv_mcast = container_of(delayed_work, struct batadv_priv_mcast, work); bat_priv = container_of(priv_mcast, struct batadv_priv, mcast); __batadv_mcast_mla_update(bat_priv); batadv_mcast_start_timer(bat_priv); } /** * batadv_mcast_is_report_ipv4() - check for IGMP reports * @skb: the ethernet frame destined for the mesh * * This call might reallocate skb data. * * Checks whether the given frame is a valid IGMP report. * * Return: If so then true, otherwise false. */ static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb) { if (ip_mc_check_igmp(skb) < 0) return false; switch (igmp_hdr(skb)->type) { case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: case IGMPV3_HOST_MEMBERSHIP_REPORT: return true; } return false; } /** * batadv_mcast_forw_mode_check_ipv4() - check for optimized forwarding * potential * @bat_priv: the bat priv with all the mesh interface information * @skb: the IPv4 packet to check * @is_unsnoopable: stores whether the destination is snoopable * @is_routable: stores whether the destination is routable * * Checks whether the given IPv4 packet has the potential to be forwarded with a * mode more optimal than classic flooding. * * Return: If so then 0. Otherwise -EINVAL or -ENOMEM in case of memory * allocation failure. */ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, struct sk_buff *skb, bool *is_unsnoopable, int *is_routable) { struct iphdr *iphdr; /* We might fail due to out-of-memory -> drop it */ if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*iphdr))) return -ENOMEM; if (batadv_mcast_is_report_ipv4(skb)) return -EINVAL; iphdr = ip_hdr(skb); /* link-local multicast listeners behind a bridge are * not snoopable (see RFC4541, section 2.1.2.2) */ if (ipv4_is_local_multicast(iphdr->daddr)) *is_unsnoopable = true; else *is_routable = ETH_P_IP; return 0; } /** * batadv_mcast_is_report_ipv6() - check for MLD reports * @skb: the ethernet frame destined for the mesh * * This call might reallocate skb data. * * Checks whether the given frame is a valid MLD report. * * Return: If so then true, otherwise false. */ static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb) { if (ipv6_mc_check_mld(skb) < 0) return false; switch (icmp6_hdr(skb)->icmp6_type) { case ICMPV6_MGM_REPORT: case ICMPV6_MLD2_REPORT: return true; } return false; } /** * batadv_mcast_forw_mode_check_ipv6() - check for optimized forwarding * potential * @bat_priv: the bat priv with all the mesh interface information * @skb: the IPv6 packet to check * @is_unsnoopable: stores whether the destination is snoopable * @is_routable: stores whether the destination is routable * * Checks whether the given IPv6 packet has the potential to be forwarded with a * mode more optimal than classic flooding. * * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory */ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, struct sk_buff *skb, bool *is_unsnoopable, int *is_routable) { struct ipv6hdr *ip6hdr; /* We might fail due to out-of-memory -> drop it */ if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*ip6hdr))) return -ENOMEM; if (batadv_mcast_is_report_ipv6(skb)) return -EINVAL; ip6hdr = ipv6_hdr(skb); if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) < IPV6_ADDR_SCOPE_LINKLOCAL) return -EINVAL; /* link-local-all-nodes multicast listeners behind a bridge are * not snoopable (see RFC4541, section 3, paragraph 3) */ if (ipv6_addr_is_ll_all_nodes(&ip6hdr->daddr)) *is_unsnoopable = true; else if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) > IPV6_ADDR_SCOPE_LINKLOCAL) *is_routable = ETH_P_IPV6; return 0; } /** * batadv_mcast_forw_mode_check() - check for optimized forwarding potential * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast frame to check * @is_unsnoopable: stores whether the destination is snoopable * @is_routable: stores whether the destination is routable * * Checks whether the given multicast ethernet frame has the potential to be * forwarded with a mode more optimal than classic flooding. * * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory */ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, struct sk_buff *skb, bool *is_unsnoopable, int *is_routable) { struct ethhdr *ethhdr = eth_hdr(skb); if (!atomic_read(&bat_priv->multicast_mode)) return -EINVAL; switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb, is_unsnoopable, is_routable); case ETH_P_IPV6: if (!IS_ENABLED(CONFIG_IPV6)) return -EINVAL; return batadv_mcast_forw_mode_check_ipv6(bat_priv, skb, is_unsnoopable, is_routable); default: return -EINVAL; } } /** * batadv_mcast_forw_want_all_ip_count() - count nodes with unspecific mcast * interest * @bat_priv: the bat priv with all the mesh interface information * @ethhdr: ethernet header of a packet * * Return: the number of nodes which want all IPv4 multicast traffic if the * given ethhdr is from an IPv4 packet or the number of nodes which want all * IPv6 traffic if it matches an IPv6 packet. */ static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv, struct ethhdr *ethhdr) { switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: return atomic_read(&bat_priv->mcast.num_want_all_ipv4); case ETH_P_IPV6: return atomic_read(&bat_priv->mcast.num_want_all_ipv6); default: /* we shouldn't be here... */ return 0; } } /** * batadv_mcast_forw_rtr_count() - count nodes with a multicast router * @bat_priv: the bat priv with all the mesh interface information * @protocol: the ethernet protocol type to count multicast routers for * * Return: the number of nodes which want all routable IPv4 multicast traffic * if the protocol is ETH_P_IP or the number of nodes which want all routable * IPv6 traffic if the protocol is ETH_P_IPV6. Otherwise returns 0. */ static int batadv_mcast_forw_rtr_count(struct batadv_priv *bat_priv, int protocol) { switch (protocol) { case ETH_P_IP: return atomic_read(&bat_priv->mcast.num_want_all_rtr4); case ETH_P_IPV6: return atomic_read(&bat_priv->mcast.num_want_all_rtr6); default: return 0; } } /** * batadv_mcast_forw_mode_by_count() - get forwarding mode by count * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to check * @vid: the vlan identifier * @is_routable: stores whether the destination is routable * @count: the number of originators the multicast packet need to be sent to * * For a multicast packet with multiple destination originators, checks which * mode to use. For BATADV_FORW_MCAST it also encapsulates the packet with a * complete batman-adv multicast header. * * Return: * BATADV_FORW_MCAST: If all nodes have multicast packet routing * capabilities and an MTU >= 1280 on all hard interfaces (including us) * and the encapsulated multicast packet with all destination addresses * would still fit into an 1280 bytes batman-adv multicast packet * (excluding the outer ethernet frame) and we could successfully push * the full batman-adv multicast packet header. * BATADV_FORW_UCASTS: If the packet cannot be sent in a batman-adv * multicast packet and the amount of batman-adv unicast packets needed * is smaller or equal to the configured multicast fanout. * BATADV_FORW_BCAST: Otherwise. */ static enum batadv_forw_mode batadv_mcast_forw_mode_by_count(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int is_routable, int count) { unsigned int mcast_hdrlen = batadv_mcast_forw_packet_hdrlen(count); u8 own_tvlv_flags = bat_priv->mcast.mla_flags.tvlv_flags; if (!atomic_read(&bat_priv->mcast.num_no_mc_ptype_capa) && own_tvlv_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA && skb->len + mcast_hdrlen <= IPV6_MIN_MTU && batadv_mcast_forw_push(bat_priv, skb, vid, is_routable, count)) return BATADV_FORW_MCAST; if (count <= atomic_read(&bat_priv->multicast_fanout)) return BATADV_FORW_UCASTS; return BATADV_FORW_BCAST; } /** * batadv_mcast_forw_mode() - check on how to forward a multicast packet * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to check * @vid: the vlan identifier * @is_routable: stores whether the destination is routable * * Return: The forwarding mode as enum batadv_forw_mode. */ enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int *is_routable) { int ret, tt_count, ip_count, unsnoop_count, total_count; bool is_unsnoopable = false; struct ethhdr *ethhdr; int rtr_count = 0; ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable, is_routable); if (ret == -ENOMEM) return BATADV_FORW_NONE; else if (ret < 0) return BATADV_FORW_BCAST; ethhdr = eth_hdr(skb); tt_count = batadv_tt_global_hash_count(bat_priv, ethhdr->h_dest, BATADV_NO_FLAGS); ip_count = batadv_mcast_forw_want_all_ip_count(bat_priv, ethhdr); unsnoop_count = !is_unsnoopable ? 0 : atomic_read(&bat_priv->mcast.num_want_all_unsnoopables); rtr_count = batadv_mcast_forw_rtr_count(bat_priv, *is_routable); total_count = tt_count + ip_count + unsnoop_count + rtr_count; if (!total_count) return BATADV_FORW_NONE; else if (unsnoop_count) return BATADV_FORW_BCAST; return batadv_mcast_forw_mode_by_count(bat_priv, skb, vid, *is_routable, total_count); } /** * batadv_mcast_forw_send_orig() - send a multicast packet to an originator * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to send * @vid: the vlan identifier * @orig_node: the originator to send the packet to * * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ static int batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, struct batadv_orig_node *orig_node) { /* Avoid sending multicast-in-unicast packets to other BLA * gateways - they already got the frame from the LAN side * we share with them. * TODO: Refactor to take BLA into account earlier, to avoid * reducing the mcast_fanout count. */ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid)) { dev_kfree_skb(skb); return NET_XMIT_SUCCESS; } return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST, 0, orig_node, vid); } /** * batadv_mcast_forw_tt() - forwards a packet to multicast listeners * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any multicast * listener registered in the translation table. A transmission is performed * via a batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_tt(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; struct batadv_tt_orig_list_entry *orig_entry; struct batadv_tt_global_entry *tt_global; const u8 *addr = eth_hdr(skb)->h_dest; tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt_global) goto out; rcu_read_lock(); hlist_for_each_entry_rcu(orig_entry, &tt_global->orig_list, list) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_entry->orig_node); } rcu_read_unlock(); batadv_tt_global_entry_put(tt_global); out: return ret; } /** * batadv_mcast_forw_want_all_ipv4() - forward to nodes with want-all-ipv4 * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_ALL_IPV4 flag set. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_want_all_ipv4(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_ipv4_list, mcast_want_all_ipv4_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_all_ipv6() - forward to nodes with want-all-ipv6 * @bat_priv: the bat priv with all the mesh interface information * @skb: The multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_ALL_IPV6 flag set. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_want_all_ipv6(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_ipv6_list, mcast_want_all_ipv6_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_all() - forward packet to nodes in a want-all list * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_ALL_IPV4 or BATADV_MCAST_WANT_ALL_IPV6 flag set. A * transmission is performed via a batman-adv unicast packet for each such * destination node. * * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. */ static int batadv_mcast_forw_want_all(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { switch (ntohs(eth_hdr(skb)->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_want_all_ipv4(bat_priv, skb, vid); case ETH_P_IPV6: return batadv_mcast_forw_want_all_ipv6(bat_priv, skb, vid); default: /* we shouldn't be here... */ return NET_XMIT_DROP; } } /** * batadv_mcast_forw_want_all_rtr4() - forward to nodes with want-all-rtr4 * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_NO_RTR4 flag unset. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_want_all_rtr4(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_rtr4_list, mcast_want_all_rtr4_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_all_rtr6() - forward to nodes with want-all-rtr6 * @bat_priv: the bat priv with all the mesh interface information * @skb: The multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_NO_RTR6 flag unset. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_want_all_rtr6(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_rtr6_list, mcast_want_all_rtr6_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_rtr() - forward packet to nodes in a want-all-rtr list * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_NO_RTR4 or BATADV_MCAST_WANT_NO_RTR6 flag unset. A * transmission is performed via a batman-adv unicast packet for each such * destination node. * * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. */ static int batadv_mcast_forw_want_rtr(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { switch (ntohs(eth_hdr(skb)->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_want_all_rtr4(bat_priv, skb, vid); case ETH_P_IPV6: return batadv_mcast_forw_want_all_rtr6(bat_priv, skb, vid); default: /* we shouldn't be here... */ return NET_XMIT_DROP; } } /** * batadv_mcast_forw_send() - send packet to any detected multicast recipient * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * @is_routable: stores whether the destination is routable * * Sends copies of a frame with multicast destination to any node that signaled * interest in it, that is either via the translation table or the according * want-all flags. A transmission is performed via a batman-adv unicast packet * for each such destination node. * * The given skb is consumed/freed. * * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. */ int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int is_routable) { int ret; ret = batadv_mcast_forw_tt(bat_priv, skb, vid); if (ret != NET_XMIT_SUCCESS) { kfree_skb(skb); return ret; } ret = batadv_mcast_forw_want_all(bat_priv, skb, vid); if (ret != NET_XMIT_SUCCESS) { kfree_skb(skb); return ret; } if (!is_routable) goto skip_mc_router; ret = batadv_mcast_forw_want_rtr(bat_priv, skb, vid); if (ret != NET_XMIT_SUCCESS) { kfree_skb(skb); return ret; } skip_mc_router: consume_skb(skb); return ret; } /** * batadv_mcast_want_unsnoop_update() - update unsnoop counter and list * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag of this originator, * orig, has toggled then this method updates the counter and the list * accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_unsnoopables_node; struct hlist_head *head = &bat_priv->mcast.want_all_unsnoopables_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES)) { atomic_inc(&bat_priv->mcast.num_want_all_unsnoopables); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) && orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) { atomic_dec(&bat_priv->mcast.num_want_all_unsnoopables); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_ipv4_update() - update want-all-ipv4 counter and list * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV4 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_ipv4_node; struct hlist_head *head = &bat_priv->mcast.want_all_ipv4_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4 && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4)) { atomic_inc(&bat_priv->mcast.num_want_all_ipv4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) && orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) { atomic_dec(&bat_priv->mcast.num_want_all_ipv4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_ipv6_update() - update want-all-ipv6 counter and list * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV6 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_ipv6_node; struct hlist_head *head = &bat_priv->mcast.want_all_ipv6_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6 && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6)) { atomic_inc(&bat_priv->mcast.num_want_all_ipv6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) && orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) { atomic_dec(&bat_priv->mcast.num_want_all_ipv6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_rtr4_update() - update want-all-rtr4 counter and list * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR4 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_rtr4_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_rtr4_node; struct hlist_head *head = &bat_priv->mcast.want_all_rtr4_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag set to unset */ if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR4) && orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4) { atomic_inc(&bat_priv->mcast.num_want_all_rtr4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag unset to set */ } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR4 && !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4)) { atomic_dec(&bat_priv->mcast.num_want_all_rtr4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_rtr6_update() - update want-all-rtr6 counter and list * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR6 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_rtr6_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_rtr6_node; struct hlist_head *head = &bat_priv->mcast.want_all_rtr6_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag set to unset */ if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR6) && orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6) { atomic_inc(&bat_priv->mcast.num_want_all_rtr6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag unset to set */ } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR6 && !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6)) { atomic_dec(&bat_priv->mcast.num_want_all_rtr6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_have_mc_ptype_update() - update multicast packet type counter * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_HAVE_MC_PTYPE_CAPA flag of this originator, orig, has * toggled then this method updates the counter accordingly. */ static void batadv_mcast_have_mc_ptype_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag set to unset */ if (!(mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) && orig->mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) atomic_inc(&bat_priv->mcast.num_no_mc_ptype_capa); /* switched from flag unset to set */ else if (mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA && !(orig->mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA)) atomic_dec(&bat_priv->mcast.num_no_mc_ptype_capa); } /** * batadv_mcast_tvlv_flags_get() - get multicast flags from an OGM TVLV * @enabled: whether the originator has multicast TVLV support enabled * @tvlv_value: tvlv buffer containing the multicast flags * @tvlv_value_len: tvlv buffer length * * Return: multicast flags for the given tvlv buffer */ static u8 batadv_mcast_tvlv_flags_get(bool enabled, void *tvlv_value, u16 tvlv_value_len) { u8 mcast_flags = BATADV_NO_FLAGS; if (enabled && tvlv_value && tvlv_value_len >= sizeof(mcast_flags)) mcast_flags = *(u8 *)tvlv_value; if (!enabled) { mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4; mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6; } /* remove redundant flags to avoid sending duplicate packets later */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) mcast_flags |= BATADV_MCAST_WANT_NO_RTR4; if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) mcast_flags |= BATADV_MCAST_WANT_NO_RTR6; return mcast_flags; } /** * batadv_mcast_tvlv_ogm_handler() - process incoming multicast tvlv container * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the multicast data * @tvlv_value_len: tvlv buffer length */ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 flags, void *tvlv_value, u16 tvlv_value_len) { bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND); u8 mcast_flags; mcast_flags = batadv_mcast_tvlv_flags_get(orig_mcast_enabled, tvlv_value, tvlv_value_len); spin_lock_bh(&orig->mcast_handler_lock); if (orig_mcast_enabled && !test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) { set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); } else if (!orig_mcast_enabled && test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) { clear_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); } set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized); batadv_mcast_want_unsnoop_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv4_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags); batadv_mcast_want_rtr4_update(bat_priv, orig, mcast_flags); batadv_mcast_want_rtr6_update(bat_priv, orig, mcast_flags); batadv_mcast_have_mc_ptype_update(bat_priv, orig, mcast_flags); orig->mcast_flags = mcast_flags; spin_unlock_bh(&orig->mcast_handler_lock); } /** * batadv_mcast_init() - initialize the multicast optimizations structures * @bat_priv: the bat priv with all the mesh interface information */ void batadv_mcast_init(struct batadv_priv *bat_priv) { batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler, NULL, NULL, BATADV_TVLV_MCAST, 2, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); batadv_tvlv_handler_register(bat_priv, NULL, NULL, batadv_mcast_forw_tracker_tvlv_handler, BATADV_TVLV_MCAST_TRACKER, 1, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); INIT_DELAYED_WORK(&bat_priv->mcast.work, batadv_mcast_mla_update); batadv_mcast_start_timer(bat_priv); } /** * batadv_mcast_mesh_info_put() - put multicast info into a netlink message * @msg: buffer for the message * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 or error code. */ int batadv_mcast_mesh_info_put(struct sk_buff *msg, struct batadv_priv *bat_priv) { u32 flags = bat_priv->mcast.mla_flags.tvlv_flags; u32 flags_priv = BATADV_NO_FLAGS; if (bat_priv->mcast.mla_flags.bridged) { flags_priv |= BATADV_MCAST_FLAGS_BRIDGED; if (bat_priv->mcast.mla_flags.querier_ipv4.exists) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS; if (bat_priv->mcast.mla_flags.querier_ipv6.exists) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS; if (bat_priv->mcast.mla_flags.querier_ipv4.shadowing) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING; if (bat_priv->mcast.mla_flags.querier_ipv6.shadowing) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING; } if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS, flags) || nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS_PRIV, flags_priv)) return -EMSGSIZE; return 0; } /** * batadv_mcast_flags_dump_entry() - dump one entry of the multicast flags table * to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @orig_node: originator to dump the multicast flags of * * Return: 0 or error code. */ static int batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_orig_node *orig_node) { void *hdr; hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_MCAST_FLAGS); if (!hdr) return -ENOBUFS; genl_dump_check_consistent(cb, hdr); if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig)) { genlmsg_cancel(msg, hdr); return -EMSGSIZE; } if (test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig_node->capabilities)) { if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS, orig_node->mcast_flags)) { genlmsg_cancel(msg, hdr); return -EMSGSIZE; } } genlmsg_end(msg, hdr); return 0; } /** * batadv_mcast_flags_dump_bucket() - dump one bucket of the multicast flags * table to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @hash: hash to dump * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: 0 or error code. */ static int batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_hashtable *hash, unsigned int bucket, long *idx_skip) { struct batadv_orig_node *orig_node; long idx = 0; spin_lock_bh(&hash->list_locks[bucket]); cb->seq = atomic_read(&hash->generation) << 1 | 1; hlist_for_each_entry(orig_node, &hash->table[bucket], hash_entry) { if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig_node->capa_initialized)) continue; if (idx < *idx_skip) goto skip; if (batadv_mcast_flags_dump_entry(msg, portid, cb, orig_node)) { spin_unlock_bh(&hash->list_locks[bucket]); *idx_skip = idx; return -EMSGSIZE; } skip: idx++; } spin_unlock_bh(&hash->list_locks[bucket]); return 0; } /** * __batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @bat_priv: the bat priv with all the mesh interface information * @bucket: current bucket to dump * @idx: index in current bucket to the next entry to dump * * Return: 0 or error code. */ static int __batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, long *bucket, long *idx) { struct batadv_hashtable *hash = bat_priv->orig_hash; long bucket_tmp = *bucket; long idx_tmp = *idx; while (bucket_tmp < hash->size) { if (batadv_mcast_flags_dump_bucket(msg, portid, cb, hash, bucket_tmp, &idx_tmp)) break; bucket_tmp++; idx_tmp = 0; } *bucket = bucket_tmp; *idx = idx_tmp; return msg->len; } /** * batadv_mcast_netlink_get_primary() - get primary interface from netlink * callback * @cb: netlink callback structure * @primary_if: the primary interface pointer to return the result in * * Return: 0 or error code. */ static int batadv_mcast_netlink_get_primary(struct netlink_callback *cb, struct batadv_hard_iface **primary_if) { struct batadv_hard_iface *hard_iface = NULL; struct net_device *mesh_iface; struct batadv_priv *bat_priv; int ret = 0; mesh_iface = batadv_netlink_get_meshif(cb); if (IS_ERR(mesh_iface)) return PTR_ERR(mesh_iface); bat_priv = netdev_priv(mesh_iface); hard_iface = batadv_primary_if_get_selected(bat_priv); if (!hard_iface || hard_iface->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } out: dev_put(mesh_iface); if (!ret && primary_if) *primary_if = hard_iface; else batadv_hardif_put(hard_iface); return ret; } /** * batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket * @msg: buffer for the message * @cb: callback structure containing arguments * * Return: message length. */ int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; int portid = NETLINK_CB(cb->skb).portid; struct batadv_priv *bat_priv; long *bucket = &cb->args[0]; long *idx = &cb->args[1]; int ret; ret = batadv_mcast_netlink_get_primary(cb, &primary_if); if (ret) return ret; bat_priv = netdev_priv(primary_if->mesh_iface); ret = __batadv_mcast_flags_dump(msg, portid, cb, bat_priv, bucket, idx); batadv_hardif_put(primary_if); return ret; } /** * batadv_mcast_free() - free the multicast optimizations structures * @bat_priv: the bat priv with all the mesh interface information */ void batadv_mcast_free(struct batadv_priv *bat_priv) { cancel_delayed_work_sync(&bat_priv->mcast.work); batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 2); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST_TRACKER, 1); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 2); /* safely calling outside of worker, as worker was canceled above */ batadv_mcast_mla_tt_retract(bat_priv, NULL); } /** * batadv_mcast_purge_orig() - reset originator global mcast state modifications * @orig: the originator which is going to get purged */ void batadv_mcast_purge_orig(struct batadv_orig_node *orig) { struct batadv_priv *bat_priv = orig->bat_priv; spin_lock_bh(&orig->mcast_handler_lock); batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_rtr4_update(bat_priv, orig, BATADV_MCAST_WANT_NO_RTR4); batadv_mcast_want_rtr6_update(bat_priv, orig, BATADV_MCAST_WANT_NO_RTR6); batadv_mcast_have_mc_ptype_update(bat_priv, orig, BATADV_MCAST_HAVE_MC_PTYPE_CAPA); spin_unlock_bh(&orig->mcast_handler_lock); }
16 5 10 1 3 8 8 4 4 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "netlink.h" #include "device.h" #include "peer.h" #include "socket.h" #include "queueing.h" #include "messages.h" #include "generated/netlink.h" #include <uapi/linux/wireguard.h> #include <linux/if.h> #include <net/genetlink.h> #include <net/sock.h> #include <crypto/utils.h> static struct genl_family genl_family; static struct wg_device *lookup_interface(struct nlattr **attrs, struct sk_buff *skb) { struct net_device *dev = NULL; if (!attrs[WGDEVICE_A_IFINDEX] == !attrs[WGDEVICE_A_IFNAME]) return ERR_PTR(-EBADR); if (attrs[WGDEVICE_A_IFINDEX]) dev = dev_get_by_index(sock_net(skb->sk), nla_get_u32(attrs[WGDEVICE_A_IFINDEX])); else if (attrs[WGDEVICE_A_IFNAME]) dev = dev_get_by_name(sock_net(skb->sk), nla_data(attrs[WGDEVICE_A_IFNAME])); if (!dev) return ERR_PTR(-ENODEV); if (!dev->rtnl_link_ops || !dev->rtnl_link_ops->kind || strcmp(dev->rtnl_link_ops->kind, KBUILD_MODNAME)) { dev_put(dev); return ERR_PTR(-EOPNOTSUPP); } return netdev_priv(dev); } static int get_allowedips(struct sk_buff *skb, const u8 *ip, u8 cidr, int family) { struct nlattr *allowedip_nest; allowedip_nest = nla_nest_start(skb, 0); if (!allowedip_nest) return -EMSGSIZE; if (nla_put_u8(skb, WGALLOWEDIP_A_CIDR_MASK, cidr) || nla_put_u16(skb, WGALLOWEDIP_A_FAMILY, family) || nla_put(skb, WGALLOWEDIP_A_IPADDR, family == AF_INET6 ? sizeof(struct in6_addr) : sizeof(struct in_addr), ip)) { nla_nest_cancel(skb, allowedip_nest); return -EMSGSIZE; } nla_nest_end(skb, allowedip_nest); return 0; } struct dump_ctx { struct wg_device *wg; struct wg_peer *next_peer; u64 allowedips_seq; struct allowedips_node *next_allowedip; }; #define DUMP_CTX(cb) ((struct dump_ctx *)(cb)->args) static int get_peer(struct wg_peer *peer, struct sk_buff *skb, struct dump_ctx *ctx) { struct nlattr *allowedips_nest, *peer_nest = nla_nest_start(skb, 0); struct allowedips_node *allowedips_node = ctx->next_allowedip; bool fail; if (!peer_nest) return -EMSGSIZE; down_read(&peer->handshake.lock); fail = nla_put(skb, WGPEER_A_PUBLIC_KEY, NOISE_PUBLIC_KEY_LEN, peer->handshake.remote_static); up_read(&peer->handshake.lock); if (fail) goto err; if (!allowedips_node) { const struct __kernel_timespec last_handshake = { .tv_sec = peer->walltime_last_handshake.tv_sec, .tv_nsec = peer->walltime_last_handshake.tv_nsec }; down_read(&peer->handshake.lock); fail = nla_put(skb, WGPEER_A_PRESHARED_KEY, NOISE_SYMMETRIC_KEY_LEN, peer->handshake.preshared_key); up_read(&peer->handshake.lock); if (fail) goto err; if (nla_put(skb, WGPEER_A_LAST_HANDSHAKE_TIME, sizeof(last_handshake), &last_handshake) || nla_put_u16(skb, WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL, peer->persistent_keepalive_interval) || nla_put_u64_64bit(skb, WGPEER_A_TX_BYTES, peer->tx_bytes, WGPEER_A_UNSPEC) || nla_put_u64_64bit(skb, WGPEER_A_RX_BYTES, peer->rx_bytes, WGPEER_A_UNSPEC) || nla_put_u32(skb, WGPEER_A_PROTOCOL_VERSION, 1)) goto err; read_lock_bh(&peer->endpoint_lock); if (peer->endpoint.addr.sa_family == AF_INET) fail = nla_put(skb, WGPEER_A_ENDPOINT, sizeof(peer->endpoint.addr4), &peer->endpoint.addr4); else if (peer->endpoint.addr.sa_family == AF_INET6) fail = nla_put(skb, WGPEER_A_ENDPOINT, sizeof(peer->endpoint.addr6), &peer->endpoint.addr6); read_unlock_bh(&peer->endpoint_lock); if (fail) goto err; allowedips_node = list_first_entry_or_null(&peer->allowedips_list, struct allowedips_node, peer_list); } if (!allowedips_node) goto no_allowedips; if (!ctx->allowedips_seq) ctx->allowedips_seq = ctx->wg->peer_allowedips.seq; else if (ctx->allowedips_seq != ctx->wg->peer_allowedips.seq) goto no_allowedips; allowedips_nest = nla_nest_start(skb, WGPEER_A_ALLOWEDIPS); if (!allowedips_nest) goto err; list_for_each_entry_from(allowedips_node, &peer->allowedips_list, peer_list) { u8 cidr, ip[16] __aligned(__alignof(u64)); int family; family = wg_allowedips_read_node(allowedips_node, ip, &cidr); if (get_allowedips(skb, ip, cidr, family)) { nla_nest_end(skb, allowedips_nest); nla_nest_end(skb, peer_nest); ctx->next_allowedip = allowedips_node; return -EMSGSIZE; } } nla_nest_end(skb, allowedips_nest); no_allowedips: nla_nest_end(skb, peer_nest); ctx->next_allowedip = NULL; ctx->allowedips_seq = 0; return 0; err: nla_nest_cancel(skb, peer_nest); return -EMSGSIZE; } int wg_get_device_start(struct netlink_callback *cb) { struct wg_device *wg; wg = lookup_interface(genl_info_dump(cb)->attrs, cb->skb); if (IS_ERR(wg)) return PTR_ERR(wg); DUMP_CTX(cb)->wg = wg; return 0; } int wg_get_device_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct wg_peer *peer, *next_peer_cursor; struct dump_ctx *ctx = DUMP_CTX(cb); struct wg_device *wg = ctx->wg; struct nlattr *peers_nest; int ret = -EMSGSIZE; bool done = true; void *hdr; rtnl_lock(); mutex_lock(&wg->device_update_lock); cb->seq = wg->device_update_gen; next_peer_cursor = ctx->next_peer; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &genl_family, NLM_F_MULTI, WG_CMD_GET_DEVICE); if (!hdr) goto out; genl_dump_check_consistent(cb, hdr); if (!ctx->next_peer) { if (nla_put_u16(skb, WGDEVICE_A_LISTEN_PORT, wg->incoming_port) || nla_put_u32(skb, WGDEVICE_A_FWMARK, wg->fwmark) || nla_put_u32(skb, WGDEVICE_A_IFINDEX, wg->dev->ifindex) || nla_put_string(skb, WGDEVICE_A_IFNAME, wg->dev->name)) goto out; down_read(&wg->static_identity.lock); if (wg->static_identity.has_identity) { if (nla_put(skb, WGDEVICE_A_PRIVATE_KEY, NOISE_PUBLIC_KEY_LEN, wg->static_identity.static_private) || nla_put(skb, WGDEVICE_A_PUBLIC_KEY, NOISE_PUBLIC_KEY_LEN, wg->static_identity.static_public)) { up_read(&wg->static_identity.lock); goto out; } } up_read(&wg->static_identity.lock); } peers_nest = nla_nest_start(skb, WGDEVICE_A_PEERS); if (!peers_nest) goto out; ret = 0; lockdep_assert_held(&wg->device_update_lock); /* If the last cursor was removed in peer_remove or peer_remove_all, then * we just treat this the same as there being no more peers left. The * reason is that seq_nr should indicate to userspace that this isn't a * coherent dump anyway, so they'll try again. */ if (list_empty(&wg->peer_list) || (ctx->next_peer && ctx->next_peer->is_dead)) { nla_nest_cancel(skb, peers_nest); goto out; } peer = list_prepare_entry(ctx->next_peer, &wg->peer_list, peer_list); list_for_each_entry_continue(peer, &wg->peer_list, peer_list) { if (get_peer(peer, skb, ctx)) { done = false; break; } next_peer_cursor = peer; } nla_nest_end(skb, peers_nest); out: if (!ret && !done && next_peer_cursor) wg_peer_get(next_peer_cursor); wg_peer_put(ctx->next_peer); mutex_unlock(&wg->device_update_lock); rtnl_unlock(); if (ret) { genlmsg_cancel(skb, hdr); return ret; } genlmsg_end(skb, hdr); if (done) { ctx->next_peer = NULL; return 0; } ctx->next_peer = next_peer_cursor; return skb->len; /* At this point, we can't really deal ourselves with safely zeroing out * the private key material after usage. This will need an additional API * in the kernel for marking skbs as zero_on_free. */ } int wg_get_device_done(struct netlink_callback *cb) { struct dump_ctx *ctx = DUMP_CTX(cb); if (ctx->wg) dev_put(ctx->wg->dev); wg_peer_put(ctx->next_peer); return 0; } static int set_port(struct wg_device *wg, u16 port) { struct wg_peer *peer; if (wg->incoming_port == port) return 0; list_for_each_entry(peer, &wg->peer_list, peer_list) wg_socket_clear_peer_endpoint_src(peer); if (!netif_running(wg->dev)) { wg->incoming_port = port; return 0; } return wg_socket_init(wg, port); } static int set_allowedip(struct wg_peer *peer, struct nlattr **attrs) { int ret = -EINVAL; u32 flags = 0; u16 family; u8 cidr; if (!attrs[WGALLOWEDIP_A_FAMILY] || !attrs[WGALLOWEDIP_A_IPADDR] || !attrs[WGALLOWEDIP_A_CIDR_MASK]) return ret; family = nla_get_u16(attrs[WGALLOWEDIP_A_FAMILY]); cidr = nla_get_u8(attrs[WGALLOWEDIP_A_CIDR_MASK]); if (attrs[WGALLOWEDIP_A_FLAGS]) flags = nla_get_u32(attrs[WGALLOWEDIP_A_FLAGS]); if (family == AF_INET && cidr <= 32 && nla_len(attrs[WGALLOWEDIP_A_IPADDR]) == sizeof(struct in_addr)) { if (flags & WGALLOWEDIP_F_REMOVE_ME) ret = wg_allowedips_remove_v4(&peer->device->peer_allowedips, nla_data(attrs[WGALLOWEDIP_A_IPADDR]), cidr, peer, &peer->device->device_update_lock); else ret = wg_allowedips_insert_v4(&peer->device->peer_allowedips, nla_data(attrs[WGALLOWEDIP_A_IPADDR]), cidr, peer, &peer->device->device_update_lock); } else if (family == AF_INET6 && cidr <= 128 && nla_len(attrs[WGALLOWEDIP_A_IPADDR]) == sizeof(struct in6_addr)) { if (flags & WGALLOWEDIP_F_REMOVE_ME) ret = wg_allowedips_remove_v6(&peer->device->peer_allowedips, nla_data(attrs[WGALLOWEDIP_A_IPADDR]), cidr, peer, &peer->device->device_update_lock); else ret = wg_allowedips_insert_v6(&peer->device->peer_allowedips, nla_data(attrs[WGALLOWEDIP_A_IPADDR]), cidr, peer, &peer->device->device_update_lock); } return ret; } static int set_peer(struct wg_device *wg, struct nlattr **attrs) { u8 *public_key = NULL, *preshared_key = NULL; struct wg_peer *peer = NULL; u32 flags = 0; int ret; ret = -EINVAL; if (attrs[WGPEER_A_PUBLIC_KEY] && nla_len(attrs[WGPEER_A_PUBLIC_KEY]) == NOISE_PUBLIC_KEY_LEN) public_key = nla_data(attrs[WGPEER_A_PUBLIC_KEY]); else goto out; if (attrs[WGPEER_A_PRESHARED_KEY] && nla_len(attrs[WGPEER_A_PRESHARED_KEY]) == NOISE_SYMMETRIC_KEY_LEN) preshared_key = nla_data(attrs[WGPEER_A_PRESHARED_KEY]); if (attrs[WGPEER_A_FLAGS]) flags = nla_get_u32(attrs[WGPEER_A_FLAGS]); ret = -EPFNOSUPPORT; if (attrs[WGPEER_A_PROTOCOL_VERSION]) { if (nla_get_u32(attrs[WGPEER_A_PROTOCOL_VERSION]) != 1) goto out; } peer = wg_pubkey_hashtable_lookup(wg->peer_hashtable, nla_data(attrs[WGPEER_A_PUBLIC_KEY])); ret = 0; if (!peer) { /* Peer doesn't exist yet. Add a new one. */ if (flags & (WGPEER_F_REMOVE_ME | WGPEER_F_UPDATE_ONLY)) goto out; /* The peer is new, so there aren't allowed IPs to remove. */ flags &= ~WGPEER_F_REPLACE_ALLOWEDIPS; down_read(&wg->static_identity.lock); if (wg->static_identity.has_identity && !memcmp(nla_data(attrs[WGPEER_A_PUBLIC_KEY]), wg->static_identity.static_public, NOISE_PUBLIC_KEY_LEN)) { /* We silently ignore peers that have the same public * key as the device. The reason we do it silently is * that we'd like for people to be able to reuse the * same set of API calls across peers. */ up_read(&wg->static_identity.lock); ret = 0; goto out; } up_read(&wg->static_identity.lock); peer = wg_peer_create(wg, public_key, preshared_key); if (IS_ERR(peer)) { ret = PTR_ERR(peer); peer = NULL; goto out; } /* Take additional reference, as though we've just been * looked up. */ wg_peer_get(peer); } if (flags & WGPEER_F_REMOVE_ME) { wg_peer_remove(peer); goto out; } if (preshared_key) { down_write(&peer->handshake.lock); memcpy(&peer->handshake.preshared_key, preshared_key, NOISE_SYMMETRIC_KEY_LEN); up_write(&peer->handshake.lock); } if (attrs[WGPEER_A_ENDPOINT]) { struct sockaddr *addr = nla_data(attrs[WGPEER_A_ENDPOINT]); size_t len = nla_len(attrs[WGPEER_A_ENDPOINT]); struct endpoint endpoint = { { { 0 } } }; if (len == sizeof(struct sockaddr_in) && addr->sa_family == AF_INET) { endpoint.addr4 = *(struct sockaddr_in *)addr; wg_socket_set_peer_endpoint(peer, &endpoint); } else if (len == sizeof(struct sockaddr_in6) && addr->sa_family == AF_INET6) { endpoint.addr6 = *(struct sockaddr_in6 *)addr; wg_socket_set_peer_endpoint(peer, &endpoint); } } if (flags & WGPEER_F_REPLACE_ALLOWEDIPS) wg_allowedips_remove_by_peer(&wg->peer_allowedips, peer, &wg->device_update_lock); if (attrs[WGPEER_A_ALLOWEDIPS]) { struct nlattr *attr, *allowedip[WGALLOWEDIP_A_MAX + 1]; int rem; nla_for_each_nested(attr, attrs[WGPEER_A_ALLOWEDIPS], rem) { ret = nla_parse_nested(allowedip, WGALLOWEDIP_A_MAX, attr, NULL, NULL); if (ret < 0) goto out; ret = set_allowedip(peer, allowedip); if (ret < 0) goto out; } } if (attrs[WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL]) { const u16 persistent_keepalive_interval = nla_get_u16( attrs[WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL]); const bool send_keepalive = !peer->persistent_keepalive_interval && persistent_keepalive_interval && netif_running(wg->dev); peer->persistent_keepalive_interval = persistent_keepalive_interval; if (send_keepalive) wg_packet_send_keepalive(peer); } if (netif_running(wg->dev)) wg_packet_send_staged_packets(peer); out: wg_peer_put(peer); if (attrs[WGPEER_A_PRESHARED_KEY]) memzero_explicit(nla_data(attrs[WGPEER_A_PRESHARED_KEY]), nla_len(attrs[WGPEER_A_PRESHARED_KEY])); return ret; } int wg_set_device_doit(struct sk_buff *skb, struct genl_info *info) { struct wg_device *wg = lookup_interface(info->attrs, skb); u32 flags = 0; int ret; if (IS_ERR(wg)) { ret = PTR_ERR(wg); goto out_nodev; } rtnl_lock(); mutex_lock(&wg->device_update_lock); if (info->attrs[WGDEVICE_A_FLAGS]) flags = nla_get_u32(info->attrs[WGDEVICE_A_FLAGS]); if (info->attrs[WGDEVICE_A_LISTEN_PORT] || info->attrs[WGDEVICE_A_FWMARK]) { struct net *net; rcu_read_lock(); net = rcu_dereference(wg->creating_net); ret = !net || !ns_capable(net->user_ns, CAP_NET_ADMIN) ? -EPERM : 0; rcu_read_unlock(); if (ret) goto out; } ++wg->device_update_gen; if (info->attrs[WGDEVICE_A_FWMARK]) { struct wg_peer *peer; wg->fwmark = nla_get_u32(info->attrs[WGDEVICE_A_FWMARK]); list_for_each_entry(peer, &wg->peer_list, peer_list) wg_socket_clear_peer_endpoint_src(peer); } if (info->attrs[WGDEVICE_A_LISTEN_PORT]) { ret = set_port(wg, nla_get_u16(info->attrs[WGDEVICE_A_LISTEN_PORT])); if (ret) goto out; } if (flags & WGDEVICE_F_REPLACE_PEERS) wg_peer_remove_all(wg); if (info->attrs[WGDEVICE_A_PRIVATE_KEY] && nla_len(info->attrs[WGDEVICE_A_PRIVATE_KEY]) == NOISE_PUBLIC_KEY_LEN) { u8 *private_key = nla_data(info->attrs[WGDEVICE_A_PRIVATE_KEY]); u8 public_key[NOISE_PUBLIC_KEY_LEN]; struct wg_peer *peer, *temp; bool send_staged_packets; if (!crypto_memneq(wg->static_identity.static_private, private_key, NOISE_PUBLIC_KEY_LEN)) goto skip_set_private_key; /* We remove before setting, to prevent race, which means doing * two 25519-genpub ops. */ if (curve25519_generate_public(public_key, private_key)) { peer = wg_pubkey_hashtable_lookup(wg->peer_hashtable, public_key); if (peer) { wg_peer_put(peer); wg_peer_remove(peer); } } down_write(&wg->static_identity.lock); send_staged_packets = !wg->static_identity.has_identity && netif_running(wg->dev); wg_noise_set_static_identity_private_key(&wg->static_identity, private_key); send_staged_packets = send_staged_packets && wg->static_identity.has_identity; wg_cookie_checker_precompute_device_keys(&wg->cookie_checker); list_for_each_entry_safe(peer, temp, &wg->peer_list, peer_list) { wg_noise_precompute_static_static(peer); wg_noise_expire_current_peer_keypairs(peer); if (send_staged_packets) wg_packet_send_staged_packets(peer); } up_write(&wg->static_identity.lock); } skip_set_private_key: if (info->attrs[WGDEVICE_A_PEERS]) { struct nlattr *attr, *peer[WGPEER_A_MAX + 1]; int rem; nla_for_each_nested(attr, info->attrs[WGDEVICE_A_PEERS], rem) { ret = nla_parse_nested(peer, WGPEER_A_MAX, attr, NULL, NULL); if (ret < 0) goto out; ret = set_peer(wg, peer); if (ret < 0) goto out; } } ret = 0; out: mutex_unlock(&wg->device_update_lock); rtnl_unlock(); dev_put(wg->dev); out_nodev: if (info->attrs[WGDEVICE_A_PRIVATE_KEY]) memzero_explicit(nla_data(info->attrs[WGDEVICE_A_PRIVATE_KEY]), nla_len(info->attrs[WGDEVICE_A_PRIVATE_KEY])); return ret; } static struct genl_family genl_family __ro_after_init = { .split_ops = wireguard_nl_ops, .n_split_ops = ARRAY_SIZE(wireguard_nl_ops), .name = WG_GENL_NAME, .version = WG_GENL_VERSION, .module = THIS_MODULE, .netnsok = true }; int __init wg_genetlink_init(void) { BUILD_BUG_ON(WG_KEY_LEN != NOISE_PUBLIC_KEY_LEN); BUILD_BUG_ON(WG_KEY_LEN != NOISE_SYMMETRIC_KEY_LEN); return genl_register_family(&genl_family); } void __exit wg_genetlink_uninit(void) { genl_unregister_family(&genl_family); }
18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_proto_udp.c: UDP load balancing support for IPVS * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Julian Anastasov <ja@ssi.bg> * * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> * Network name space (netns) aware. */ #define pr_fmt(fmt) "IPVS: " fmt #include <linux/in.h> #include <linux/ip.h> #include <linux/kernel.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/udp.h> #include <linux/indirect_call_wrapper.h> #include <net/ip_vs.h> #include <net/ip.h> #include <net/ip6_checksum.h> static int udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, unsigned int udphoff); static int udp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { struct ip_vs_service *svc; struct udphdr _udph, *uh; __be16 _ports[2], *ports = NULL; if (likely(!ip_vs_iph_icmp(iph))) { /* IPv6 fragments, only first fragment will hit this */ uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); if (uh) ports = &uh->source; } else { ports = skb_header_pointer( skb, iph->len, sizeof(_ports), &_ports); } if (!ports) { *verdict = NF_DROP; return 0; } if (likely(!ip_vs_iph_inverse(iph))) svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->daddr, ports[1]); else svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->saddr, ports[0]); if (svc) { int ignored; if (ip_vs_todrop(ipvs)) { /* * It seems that we are very loaded. * We have to drop this packet :( */ *verdict = NF_DROP; return 0; } /* * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); if (!*cpp && ignored <= 0) { if (!ignored) *verdict = ip_vs_leave(svc, skb, pd, iph); else *verdict = NF_DROP; return 0; } } /* NF_ACCEPT */ return 1; } static inline void udp_fast_csum_update(int af, struct udphdr *uhdr, const union nf_inet_addr *oldip, const union nf_inet_addr *newip, __be16 oldport, __be16 newport) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) uhdr->check = csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldport, newport, ~csum_unfold(uhdr->check)))); else #endif uhdr->check = csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldport, newport, ~csum_unfold(uhdr->check)))); if (!uhdr->check) uhdr->check = CSUM_MANGLED_0; } static inline void udp_partial_csum_update(int af, struct udphdr *uhdr, const union nf_inet_addr *oldip, const union nf_inet_addr *newip, __be16 oldlen, __be16 newlen) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) uhdr->check = ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldlen, newlen, csum_unfold(uhdr->check)))); else #endif uhdr->check = ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldlen, newlen, csum_unfold(uhdr->check)))); } INDIRECT_CALLABLE_SCOPE int udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct udphdr *udph; unsigned int udphoff = iph->len; bool payload_csum = false; int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, udphoff + sizeof(*udph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!udp_csum_check(cp->af, skb, pp, udphoff)) return 0; /* * Call application helper if needed */ if (!(ret = ip_vs_app_pkt_out(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) oldlen = skb->len - udphoff; else payload_csum = true; } udph = (void *)skb_network_header(skb) + udphoff; udph->source = cp->vport; /* * Adjust UDP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, htons(oldlen), htons(skb->len - udphoff)); } else if (!payload_csum && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = cp->app ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ udph->check = 0; skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) udph->check = csum_ipv6_magic(&cp->vaddr.in6, &cp->caddr.in6, skb->len - udphoff, cp->protocol, skb->csum); else #endif udph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip, skb->len - udphoff, cp->protocol, skb->csum); if (udph->check == 0) udph->check = CSUM_MANGLED_0; skb->ip_summed = CHECKSUM_UNNECESSARY; IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", pp->name, udph->check, (char*)&(udph->check) - (char*)udph); } return 1; } static int udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct udphdr *udph; unsigned int udphoff = iph->len; bool payload_csum = false; int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, udphoff + sizeof(*udph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!udp_csum_check(cp->af, skb, pp, udphoff)) return 0; /* * Attempt ip_vs_app call. * It will fix ip_vs_conn */ if (!(ret = ip_vs_app_pkt_in(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) oldlen = skb->len - udphoff; else payload_csum = true; } udph = (void *)skb_network_header(skb) + udphoff; udph->dest = cp->dport; /* * Adjust UDP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - udphoff)); } else if (!payload_csum && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = cp->app ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ udph->check = 0; skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) udph->check = csum_ipv6_magic(&cp->caddr.in6, &cp->daddr.in6, skb->len - udphoff, cp->protocol, skb->csum); else #endif udph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip, skb->len - udphoff, cp->protocol, skb->csum); if (udph->check == 0) udph->check = CSUM_MANGLED_0; skb->ip_summed = CHECKSUM_UNNECESSARY; } return 1; } static int udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, unsigned int udphoff) { struct udphdr _udph, *uh; uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); if (uh == NULL) return 0; if (uh->check != 0) { switch (skb->ip_summed) { case CHECKSUM_NONE: skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); fallthrough; case CHECKSUM_COMPLETE: #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len - udphoff, IPPROTO_UDP, skb->csum)) { IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } } else #endif if (csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len - udphoff, ip_hdr(skb)->protocol, skb->csum)) { IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } break; default: /* No need to checksum. */ break; } } return 1; } static inline __u16 udp_app_hashkey(__be16 port) { return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) & UDP_APP_TAB_MASK; } static int udp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); hash = udp_app_hashkey(port); list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]); atomic_inc(&pd->appcnt); out: return ret; } static void udp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); } static int udp_app_conn_bind(struct ip_vs_conn *cp) { struct netns_ipvs *ipvs = cp->ipvs; int hash; struct ip_vs_app *inc; int result = 0; /* Default binding: bind app only for NAT */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) return 0; /* Lookup application incarnations and bind the right one */ hash = udp_app_hashkey(cp->vport); list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", __func__, IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), inc->name, ntohs(inc->port)); cp->app = inc; if (inc->init_conn) result = inc->init_conn(inc, cp); break; } } return result; } static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = { [IP_VS_UDP_S_NORMAL] = 5*60*HZ, [IP_VS_UDP_S_LAST] = 2*HZ, }; static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = { [IP_VS_UDP_S_NORMAL] = "UDP", [IP_VS_UDP_S_LAST] = "BUG!", }; static const char * udp_state_name(int state) { if (state >= IP_VS_UDP_S_LAST) return "ERR!"; return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; } static void udp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { if (unlikely(!pd)) { pr_err("UDP no ns data\n"); return; } cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; if (direction == IP_VS_DIR_OUTPUT) ip_vs_control_assure_ct(cp); } static int __udp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, sizeof(udp_timeouts)); if (!pd->timeout_table) return -ENOMEM; return 0; } static void __udp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } struct ip_vs_protocol ip_vs_protocol_udp = { .name = "UDP", .protocol = IPPROTO_UDP, .num_states = IP_VS_UDP_S_LAST, .dont_defrag = 0, .init = NULL, .exit = NULL, .init_netns = __udp_init, .exit_netns = __udp_exit, .conn_schedule = udp_conn_schedule, .conn_in_get = ip_vs_conn_in_get_proto, .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = udp_snat_handler, .dnat_handler = udp_dnat_handler, .state_transition = udp_state_transition, .state_name = udp_state_name, .register_app = udp_register_app, .unregister_app = udp_unregister_app, .app_conn_bind = udp_app_conn_bind, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, };
1141 1141 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 // SPDX-License-Identifier: GPL-2.0-or-later /* * X.25 Packet Layer release 002 * * This is ALPHA test software. This code may break your machine, * randomly fail to work with new releases, misbehave and/or generally * screw up. It might even work. * * This code REQUIRES 2.1.15 or higher * * History * X.25 001 Jonathan Naylor Started coding. * X.25 002 Jonathan Naylor Centralised disconnect handling. * New timer architecture. * 2000-03-11 Henner Eisen MSG_EOR handling more POSIX compliant. * 2000-03-22 Daniela Squassoni Allowed disabling/enabling of * facilities negotiation and increased * the throughput upper limit. * 2000-08-27 Arnaldo C. Melo s/suser/capable/ + micro cleanups * 2000-09-04 Henner Eisen Set sock->state in x25_accept(). * Fixed x25_output() related skb leakage. * 2000-10-02 Henner Eisen Made x25_kick() single threaded per socket. * 2000-10-27 Henner Eisen MSG_DONTWAIT for fragment allocation. * 2000-11-14 Henner Eisen Closing datalink from NETDEV_GOING_DOWN * 2002-10-06 Arnaldo C. Melo Get rid of cli/sti, move proc stuff to * x25_proc.c, using seq_file * 2005-04-02 Shaun Pereira Selective sub address matching * with call user data * 2005-04-15 Shaun Pereira Fast select with no restriction on * response */ #define pr_fmt(fmt) "X25: " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <net/tcp_states.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/notifier.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/ctype.h> #include <net/x25.h> #include <net/compat.h> int sysctl_x25_restart_request_timeout = X25_DEFAULT_T20; int sysctl_x25_call_request_timeout = X25_DEFAULT_T21; int sysctl_x25_reset_request_timeout = X25_DEFAULT_T22; int sysctl_x25_clear_request_timeout = X25_DEFAULT_T23; int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2; int sysctl_x25_forward = 0; HLIST_HEAD(x25_list); DEFINE_RWLOCK(x25_list_lock); static const struct proto_ops x25_proto_ops; static const struct x25_address null_x25_address = {" "}; #ifdef CONFIG_COMPAT struct compat_x25_subscrip_struct { char device[200-sizeof(compat_ulong_t)]; compat_ulong_t global_facil_mask; compat_uint_t extended; }; #endif int x25_parse_address_block(struct sk_buff *skb, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned char len; int needed; int rc; if (!pskb_may_pull(skb, 1)) { /* packet has no address block */ rc = 0; goto empty; } len = *skb->data; needed = 1 + ((len >> 4) + (len & 0x0f) + 1) / 2; if (!pskb_may_pull(skb, needed)) { /* packet is too short to hold the addresses it claims to hold */ rc = -1; goto empty; } return x25_addr_ntoa(skb->data, called_addr, calling_addr); empty: *called_addr->x25_addr = 0; *calling_addr->x25_addr = 0; return rc; } int x25_addr_ntoa(unsigned char *p, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned int called_len, calling_len; char *called, *calling; unsigned int i; called_len = (*p >> 0) & 0x0F; calling_len = (*p >> 4) & 0x0F; called = called_addr->x25_addr; calling = calling_addr->x25_addr; p++; for (i = 0; i < (called_len + calling_len); i++) { if (i < called_len) { if (i % 2 != 0) { *called++ = ((*p >> 0) & 0x0F) + '0'; p++; } else { *called++ = ((*p >> 4) & 0x0F) + '0'; } } else { if (i % 2 != 0) { *calling++ = ((*p >> 0) & 0x0F) + '0'; p++; } else { *calling++ = ((*p >> 4) & 0x0F) + '0'; } } } *called = *calling = '\0'; return 1 + (called_len + calling_len + 1) / 2; } int x25_addr_aton(unsigned char *p, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned int called_len, calling_len; char *called, *calling; int i; called = called_addr->x25_addr; calling = calling_addr->x25_addr; called_len = strlen(called); calling_len = strlen(calling); *p++ = (calling_len << 4) | (called_len << 0); for (i = 0; i < (called_len + calling_len); i++) { if (i < called_len) { if (i % 2 != 0) { *p |= (*called++ - '0') << 0; p++; } else { *p = 0x00; *p |= (*called++ - '0') << 4; } } else { if (i % 2 != 0) { *p |= (*calling++ - '0') << 0; p++; } else { *p = 0x00; *p |= (*calling++ - '0') << 4; } } } return 1 + (called_len + calling_len + 1) / 2; } /* * Socket removal during an interrupt is now safe. */ static void x25_remove_socket(struct sock *sk) { write_lock_bh(&x25_list_lock); sk_del_node_init(sk); write_unlock_bh(&x25_list_lock); } /* * Handle device status changes. */ static int x25_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct x25_neigh *nb; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (dev->type == ARPHRD_X25) { switch (event) { case NETDEV_REGISTER: case NETDEV_POST_TYPE_CHANGE: x25_link_device_up(dev); break; case NETDEV_DOWN: nb = x25_get_neigh(dev); if (nb) { x25_link_terminated(nb); x25_neigh_put(nb); } x25_route_device_down(dev); break; case NETDEV_PRE_TYPE_CHANGE: case NETDEV_UNREGISTER: x25_link_device_down(dev); break; case NETDEV_CHANGE: if (!netif_carrier_ok(dev)) { nb = x25_get_neigh(dev); if (nb) { x25_link_terminated(nb); x25_neigh_put(nb); } } break; } } return NOTIFY_DONE; } /* * Add a socket to the bound sockets list. */ static void x25_insert_socket(struct sock *sk) { write_lock_bh(&x25_list_lock); sk_add_node(sk, &x25_list); write_unlock_bh(&x25_list_lock); } /* * Find a socket that wants to accept the Call Request we just * received. Check the full list for an address/cud match. * If no cuds match return the next_best thing, an address match. * Note: if a listening socket has cud set it must only get calls * with matching cud. */ static struct sock *x25_find_listener(struct x25_address *addr, struct sk_buff *skb) { struct sock *s; struct sock *next_best; read_lock_bh(&x25_list_lock); next_best = NULL; sk_for_each(s, &x25_list) if ((!strcmp(addr->x25_addr, x25_sk(s)->source_addr.x25_addr) || !strcmp(x25_sk(s)->source_addr.x25_addr, null_x25_address.x25_addr)) && s->sk_state == TCP_LISTEN) { /* * Found a listening socket, now check the incoming * call user data vs this sockets call user data */ if (x25_sk(s)->cudmatchlength > 0 && skb->len >= x25_sk(s)->cudmatchlength) { if((memcmp(x25_sk(s)->calluserdata.cuddata, skb->data, x25_sk(s)->cudmatchlength)) == 0) { sock_hold(s); goto found; } } else next_best = s; } if (next_best) { s = next_best; sock_hold(s); goto found; } s = NULL; found: read_unlock_bh(&x25_list_lock); return s; } /* * Find a connected X.25 socket given my LCI and neighbour. */ static struct sock *__x25_find_socket(unsigned int lci, struct x25_neigh *nb) { struct sock *s; sk_for_each(s, &x25_list) if (x25_sk(s)->lci == lci && x25_sk(s)->neighbour == nb) { sock_hold(s); goto found; } s = NULL; found: return s; } struct sock *x25_find_socket(unsigned int lci, struct x25_neigh *nb) { struct sock *s; read_lock_bh(&x25_list_lock); s = __x25_find_socket(lci, nb); read_unlock_bh(&x25_list_lock); return s; } /* * Find a unique LCI for a given device. */ static unsigned int x25_new_lci(struct x25_neigh *nb) { unsigned int lci = 1; struct sock *sk; while ((sk = x25_find_socket(lci, nb)) != NULL) { sock_put(sk); if (++lci == 4096) { lci = 0; break; } cond_resched(); } return lci; } /* * Deferred destroy. */ static void __x25_destroy_socket(struct sock *); /* * handler for deferred kills. */ static void x25_destroy_timer(struct timer_list *t) { struct sock *sk = timer_container_of(sk, t, sk_timer); x25_destroy_socket_from_timer(sk); } /* * This is called from user mode and the timers. Thus it protects itself * against interrupting users but doesn't worry about being called during * work. Once it is removed from the queue no interrupt or bottom half * will touch it and we are (fairly 8-) ) safe. * Not static as it's used by the timer */ static void __x25_destroy_socket(struct sock *sk) { struct sk_buff *skb; x25_stop_heartbeat(sk); x25_stop_timer(sk); x25_remove_socket(sk); x25_clear_queues(sk); /* Flush the queues */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (skb->sk != sk) { /* A pending connection */ /* * Queue the unaccepted socket for death */ skb->sk->sk_state = TCP_LISTEN; sock_set_flag(skb->sk, SOCK_DEAD); x25_start_heartbeat(skb->sk); x25_sk(skb->sk)->state = X25_STATE_0; } kfree_skb(skb); } if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ sk->sk_timer.expires = jiffies + 10 * HZ; sk->sk_timer.function = x25_destroy_timer; add_timer(&sk->sk_timer); } else { /* drop last reference so sock_put will free */ __sock_put(sk); } } void x25_destroy_socket_from_timer(struct sock *sk) { sock_hold(sk); bh_lock_sock(sk); __x25_destroy_socket(sk); bh_unlock_sock(sk); sock_put(sk); } /* * Handling for system calls applied via the various interfaces to a * X.25 socket object. */ static int x25_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { int opt; struct sock *sk = sock->sk; int rc = -ENOPROTOOPT; if (level != SOL_X25 || optname != X25_QBITINCL) goto out; rc = -EINVAL; if (optlen < sizeof(int)) goto out; rc = -EFAULT; if (copy_from_sockptr(&opt, optval, sizeof(int))) goto out; if (opt) set_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); else clear_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); rc = 0; out: return rc; } static int x25_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int val, len, rc = -ENOPROTOOPT; if (level != SOL_X25 || optname != X25_QBITINCL) goto out; rc = -EFAULT; if (get_user(len, optlen)) goto out; rc = -EINVAL; if (len < 0) goto out; len = min_t(unsigned int, len, sizeof(int)); rc = -EFAULT; if (put_user(len, optlen)) goto out; val = test_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); rc = copy_to_user(optval, &val, len) ? -EFAULT : 0; out: return rc; } static int x25_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int rc = -EOPNOTSUPP; lock_sock(sk); if (sock->state != SS_UNCONNECTED) { rc = -EINVAL; release_sock(sk); return rc; } if (sk->sk_state != TCP_LISTEN) { memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; rc = 0; } release_sock(sk); return rc; } static struct proto x25_proto = { .name = "X25", .owner = THIS_MODULE, .obj_size = sizeof(struct x25_sock), }; static struct sock *x25_alloc_socket(struct net *net, int kern) { struct x25_sock *x25; struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto, kern); if (!sk) goto out; sock_init_data(NULL, sk); x25 = x25_sk(sk); skb_queue_head_init(&x25->ack_queue); skb_queue_head_init(&x25->fragment_queue); skb_queue_head_init(&x25->interrupt_in_queue); skb_queue_head_init(&x25->interrupt_out_queue); out: return sk; } static int x25_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct x25_sock *x25; int rc = -EAFNOSUPPORT; if (!net_eq(net, &init_net)) goto out; rc = -ESOCKTNOSUPPORT; if (sock->type != SOCK_SEQPACKET) goto out; rc = -EINVAL; if (protocol) goto out; rc = -ENOMEM; if ((sk = x25_alloc_socket(net, kern)) == NULL) goto out; x25 = x25_sk(sk); sock_init_data(sock, sk); x25_init_timers(sk); sock->ops = &x25_proto_ops; sk->sk_protocol = protocol; sk->sk_backlog_rcv = x25_backlog_rcv; x25->t21 = sysctl_x25_call_request_timeout; x25->t22 = sysctl_x25_reset_request_timeout; x25->t23 = sysctl_x25_clear_request_timeout; x25->t2 = sysctl_x25_ack_holdback_timeout; x25->state = X25_STATE_0; x25->cudmatchlength = 0; set_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); /* normally no cud */ /* on call accept */ x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE; x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE; x25->facilities.pacsize_in = X25_DEFAULT_PACKET_SIZE; x25->facilities.pacsize_out = X25_DEFAULT_PACKET_SIZE; x25->facilities.throughput = 0; /* by default don't negotiate throughput */ x25->facilities.reverse = X25_DEFAULT_REVERSE; x25->dte_facilities.calling_len = 0; x25->dte_facilities.called_len = 0; memset(x25->dte_facilities.called_ae, '\0', sizeof(x25->dte_facilities.called_ae)); memset(x25->dte_facilities.calling_ae, '\0', sizeof(x25->dte_facilities.calling_ae)); rc = 0; out: return rc; } static struct sock *x25_make_new(struct sock *osk) { struct sock *sk = NULL; struct x25_sock *x25, *ox25; if (osk->sk_type != SOCK_SEQPACKET) goto out; if ((sk = x25_alloc_socket(sock_net(osk), 0)) == NULL) goto out; x25 = x25_sk(sk); sk->sk_type = osk->sk_type; sk->sk_priority = READ_ONCE(osk->sk_priority); sk->sk_protocol = osk->sk_protocol; sk->sk_rcvbuf = osk->sk_rcvbuf; sk->sk_sndbuf = osk->sk_sndbuf; sk->sk_state = TCP_ESTABLISHED; sk->sk_backlog_rcv = osk->sk_backlog_rcv; sock_copy_flags(sk, osk); ox25 = x25_sk(osk); x25->t21 = ox25->t21; x25->t22 = ox25->t22; x25->t23 = ox25->t23; x25->t2 = ox25->t2; x25->flags = ox25->flags; x25->facilities = ox25->facilities; x25->dte_facilities = ox25->dte_facilities; x25->cudmatchlength = ox25->cudmatchlength; clear_bit(X25_INTERRUPT_FLAG, &x25->flags); x25_init_timers(sk); out: return sk; } static int x25_release(struct socket *sock) { struct sock *sk = sock->sk; struct x25_sock *x25; if (!sk) return 0; x25 = x25_sk(sk); sock_hold(sk); lock_sock(sk); switch (x25->state) { case X25_STATE_0: case X25_STATE_2: x25_disconnect(sk, 0, 0, 0); __x25_destroy_socket(sk); goto out; case X25_STATE_1: case X25_STATE_3: case X25_STATE_4: x25_clear_queues(sk); x25_write_internal(sk, X25_CLEAR_REQUEST); x25_start_t23timer(sk); x25->state = X25_STATE_2; sk->sk_state = TCP_CLOSE; sk->sk_shutdown |= SEND_SHUTDOWN; sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); sock_set_flag(sk, SOCK_DESTROY); break; case X25_STATE_5: x25_write_internal(sk, X25_CLEAR_REQUEST); x25_disconnect(sk, 0, 0, 0); __x25_destroy_socket(sk); goto out; } sock_orphan(sk); out: release_sock(sk); sock_put(sk); return 0; } static int x25_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; int len, i, rc = 0; if (addr_len != sizeof(struct sockaddr_x25) || addr->sx25_family != AF_X25 || strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) { rc = -EINVAL; goto out; } /* check for the null_x25_address */ if (strcmp(addr->sx25_addr.x25_addr, null_x25_address.x25_addr)) { len = strlen(addr->sx25_addr.x25_addr); for (i = 0; i < len; i++) { if (!isdigit(addr->sx25_addr.x25_addr[i])) { rc = -EINVAL; goto out; } } } lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) { x25_sk(sk)->source_addr = addr->sx25_addr; x25_insert_socket(sk); sock_reset_flag(sk, SOCK_ZAPPED); } else { rc = -EINVAL; } release_sock(sk); net_dbg_ratelimited("x25_bind: socket is bound\n"); out: return rc; } static int x25_wait_for_connection_establishment(struct sock *sk) { DECLARE_WAITQUEUE(wait, current); int rc; add_wait_queue_exclusive(sk_sleep(sk), &wait); for (;;) { __set_current_state(TASK_INTERRUPTIBLE); rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = sock_error(sk); if (rc) { sk->sk_socket->state = SS_UNCONNECTED; break; } rc = -ENOTCONN; if (sk->sk_state == TCP_CLOSE) { sk->sk_socket->state = SS_UNCONNECTED; break; } rc = 0; if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); schedule(); lock_sock(sk); } else break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int x25_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; struct x25_route *rt; int rc = 0; lock_sock(sk); if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { sock->state = SS_CONNECTED; goto out; /* Connect completed during a ERESTARTSYS event */ } rc = -ECONNREFUSED; if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { sock->state = SS_UNCONNECTED; goto out; } rc = -EISCONN; /* No reconnect on a seqpacket socket */ if (sk->sk_state == TCP_ESTABLISHED) goto out; rc = -EALREADY; /* Do nothing if call is already in progress */ if (sk->sk_state == TCP_SYN_SENT) goto out; sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; rc = -EINVAL; if (addr_len != sizeof(struct sockaddr_x25) || addr->sx25_family != AF_X25 || strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) goto out; rc = -ENETUNREACH; rt = x25_get_route(&addr->sx25_addr); if (!rt) goto out; x25->neighbour = x25_get_neigh(rt->dev); if (!x25->neighbour) goto out_put_route; x25_limit_facilities(&x25->facilities, x25->neighbour); x25->lci = x25_new_lci(x25->neighbour); if (!x25->lci) goto out_put_neigh; rc = -EINVAL; if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */ goto out_put_neigh; if (!strcmp(x25->source_addr.x25_addr, null_x25_address.x25_addr)) memset(&x25->source_addr, '\0', X25_ADDR_LEN); x25->dest_addr = addr->sx25_addr; /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; sk->sk_state = TCP_SYN_SENT; x25->state = X25_STATE_1; x25_write_internal(sk, X25_CALL_REQUEST); x25_start_heartbeat(sk); x25_start_t21timer(sk); /* Now the loop */ rc = -EINPROGRESS; if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) goto out; rc = x25_wait_for_connection_establishment(sk); if (rc) goto out_put_neigh; sock->state = SS_CONNECTED; rc = 0; out_put_neigh: if (rc && x25->neighbour) { read_lock_bh(&x25_list_lock); x25_neigh_put(x25->neighbour); x25->neighbour = NULL; read_unlock_bh(&x25_list_lock); x25->state = X25_STATE_0; } out_put_route: x25_route_put(rt); out: release_sock(sk); return rc; } static int x25_wait_for_data(struct sock *sk, long timeout) { DECLARE_WAITQUEUE(wait, current); int rc = 0; add_wait_queue_exclusive(sk_sleep(sk), &wait); for (;;) { __set_current_state(TASK_INTERRUPTIBLE); if (sk->sk_shutdown & RCV_SHUTDOWN) break; rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = -EAGAIN; if (!timeout) break; rc = 0; if (skb_queue_empty(&sk->sk_receive_queue)) { release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); } else break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int x25_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct sock *newsk; struct sk_buff *skb; int rc = -EINVAL; if (!sk) goto out; rc = -EOPNOTSUPP; if (sk->sk_type != SOCK_SEQPACKET) goto out; lock_sock(sk); rc = -EINVAL; if (sk->sk_state != TCP_LISTEN) goto out2; rc = x25_wait_for_data(sk, READ_ONCE(sk->sk_rcvtimeo)); if (rc) goto out2; skb = skb_dequeue(&sk->sk_receive_queue); rc = -EINVAL; if (!skb->sk) goto out2; newsk = skb->sk; sock_graft(newsk, newsock); /* Now attach up the new socket */ skb->sk = NULL; kfree_skb(skb); sk_acceptq_removed(sk); newsock->state = SS_CONNECTED; rc = 0; out2: release_sock(sk); out: return rc; } static int x25_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)uaddr; struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); int rc = 0; if (peer) { if (sk->sk_state != TCP_ESTABLISHED) { rc = -ENOTCONN; goto out; } sx25->sx25_addr = x25->dest_addr; } else sx25->sx25_addr = x25->source_addr; sx25->sx25_family = AF_X25; rc = sizeof(*sx25); out: return rc; } int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, unsigned int lci) { struct sock *sk; struct sock *make; struct x25_sock *makex25; struct x25_address source_addr, dest_addr; struct x25_facilities facilities; struct x25_dte_facilities dte_facilities; int len, addr_len, rc; /* * Remove the LCI and frame type. */ skb_pull(skb, X25_STD_MIN_LEN); /* * Extract the X.25 addresses and convert them to ASCII strings, * and remove them. * * Address block is mandatory in call request packets */ addr_len = x25_parse_address_block(skb, &source_addr, &dest_addr); if (addr_len <= 0) goto out_clear_request; skb_pull(skb, addr_len); /* * Get the length of the facilities, skip past them for the moment * get the call user data because this is needed to determine * the correct listener * * Facilities length is mandatory in call request packets */ if (!pskb_may_pull(skb, 1)) goto out_clear_request; len = skb->data[0] + 1; if (!pskb_may_pull(skb, len)) goto out_clear_request; skb_pull(skb,len); /* * Ensure that the amount of call user data is valid. */ if (skb->len > X25_MAX_CUD_LEN) goto out_clear_request; /* * Get all the call user data so it can be used in * x25_find_listener and skb_copy_from_linear_data up ahead. */ if (!pskb_may_pull(skb, skb->len)) goto out_clear_request; /* * Find a listener for the particular address/cud pair. */ sk = x25_find_listener(&source_addr,skb); skb_push(skb,len); if (sk != NULL && sk_acceptq_is_full(sk)) { goto out_sock_put; } /* * We dont have any listeners for this incoming call. * Try forwarding it. */ if (sk == NULL) { skb_push(skb, addr_len + X25_STD_MIN_LEN); if (sysctl_x25_forward && x25_forward_call(&dest_addr, nb, skb, lci) > 0) { /* Call was forwarded, dont process it any more */ kfree_skb(skb); rc = 1; goto out; } else { /* No listeners, can't forward, clear the call */ goto out_clear_request; } } /* * Try to reach a compromise on the requested facilities. */ len = x25_negotiate_facilities(skb, sk, &facilities, &dte_facilities); if (len == -1) goto out_sock_put; /* * current neighbour/link might impose additional limits * on certain facilities */ x25_limit_facilities(&facilities, nb); /* * Try to create a new socket. */ make = x25_make_new(sk); if (!make) goto out_sock_put; /* * Remove the facilities */ skb_pull(skb, len); skb->sk = make; make->sk_state = TCP_ESTABLISHED; makex25 = x25_sk(make); makex25->lci = lci; makex25->dest_addr = dest_addr; makex25->source_addr = source_addr; x25_neigh_hold(nb); makex25->neighbour = nb; makex25->facilities = facilities; makex25->dte_facilities= dte_facilities; makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask; /* ensure no reverse facil on accept */ makex25->vc_facil_mask &= ~X25_MASK_REVERSE; /* ensure no calling address extension on accept */ makex25->vc_facil_mask &= ~X25_MASK_CALLING_AE; makex25->cudmatchlength = x25_sk(sk)->cudmatchlength; /* Normally all calls are accepted immediately */ if (test_bit(X25_ACCPT_APPRV_FLAG, &makex25->flags)) { x25_write_internal(make, X25_CALL_ACCEPTED); makex25->state = X25_STATE_3; } else { makex25->state = X25_STATE_5; } /* * Incoming Call User Data. */ skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len); makex25->calluserdata.cudlength = skb->len; sk_acceptq_added(sk); x25_insert_socket(make); skb_queue_head(&sk->sk_receive_queue, skb); x25_start_heartbeat(make); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); rc = 1; sock_put(sk); out: return rc; out_sock_put: sock_put(sk); out_clear_request: rc = 0; x25_transmit_clear_request(nb, lci, 0x01); goto out; } static int x25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); DECLARE_SOCKADDR(struct sockaddr_x25 *, usx25, msg->msg_name); struct sockaddr_x25 sx25; struct sk_buff *skb; unsigned char *asmptr; int noblock = msg->msg_flags & MSG_DONTWAIT; size_t size; int qbit = 0, rc = -EINVAL; lock_sock(sk); if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_OOB|MSG_EOR|MSG_CMSG_COMPAT)) goto out; /* we currently don't support segmented records at the user interface */ if (!(msg->msg_flags & (MSG_EOR|MSG_OOB))) goto out; rc = -EADDRNOTAVAIL; if (sock_flag(sk, SOCK_ZAPPED)) goto out; rc = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) { send_sig(SIGPIPE, current, 0); goto out; } rc = -ENETUNREACH; if (!x25->neighbour) goto out; if (usx25) { rc = -EINVAL; if (msg->msg_namelen < sizeof(sx25)) goto out; memcpy(&sx25, usx25, sizeof(sx25)); rc = -EISCONN; if (strcmp(x25->dest_addr.x25_addr, sx25.sx25_addr.x25_addr)) goto out; rc = -EINVAL; if (sx25.sx25_family != AF_X25) goto out; } else { /* * FIXME 1003.1g - if the socket is like this because * it has become closed (not started closed) we ought * to SIGPIPE, EPIPE; */ rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) goto out; sx25.sx25_family = AF_X25; sx25.sx25_addr = x25->dest_addr; } /* Sanity check the packet size */ if (len > 65535) { rc = -EMSGSIZE; goto out; } net_dbg_ratelimited("x25_sendmsg: sendto: Addresses built.\n"); /* Build a packet */ net_dbg_ratelimited("x25_sendmsg: sendto: building packet.\n"); if ((msg->msg_flags & MSG_OOB) && len > 32) len = 32; size = len + X25_MAX_L2_LEN + X25_EXT_MIN_LEN; release_sock(sk); skb = sock_alloc_send_skb(sk, size, noblock, &rc); lock_sock(sk); if (!skb) goto out; X25_SKB_CB(skb)->flags = msg->msg_flags; skb_reserve(skb, X25_MAX_L2_LEN + X25_EXT_MIN_LEN); /* * Put the data on the end */ net_dbg_ratelimited("x25_sendmsg: Copying user data\n"); skb_reset_transport_header(skb); skb_put(skb, len); rc = memcpy_from_msg(skb_transport_header(skb), msg, len); if (rc) goto out_kfree_skb; /* * If the Q BIT Include socket option is in force, the first * byte of the user data is the logical value of the Q Bit. */ if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { if (!pskb_may_pull(skb, 1)) goto out_kfree_skb; qbit = skb->data[0]; skb_pull(skb, 1); } /* * Push down the X.25 header */ net_dbg_ratelimited("x25_sendmsg: Building X.25 Header.\n"); if (msg->msg_flags & MSG_OOB) { if (x25->neighbour->extended) { asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_INTERRUPT; } else { asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_INTERRUPT; } } else { if (x25->neighbour->extended) { /* Build an Extended X.25 header */ asmptr = skb_push(skb, X25_EXT_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_DATA; *asmptr++ = X25_DATA; } else { /* Build an Standard X.25 header */ asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_DATA; } if (qbit) skb->data[0] |= X25_Q_BIT; } net_dbg_ratelimited("x25_sendmsg: Built header.\n"); net_dbg_ratelimited("x25_sendmsg: Transmitting buffer\n"); rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) goto out_kfree_skb; if (msg->msg_flags & MSG_OOB) skb_queue_tail(&x25->interrupt_out_queue, skb); else { rc = x25_output(sk, skb); len = rc; if (rc < 0) kfree_skb(skb); else if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) len++; } x25_kick(sk); rc = len; out: release_sock(sk); return rc; out_kfree_skb: kfree_skb(skb); goto out; } static int x25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); DECLARE_SOCKADDR(struct sockaddr_x25 *, sx25, msg->msg_name); size_t copied; int qbit, header_len; struct sk_buff *skb; unsigned char *asmptr; int rc = -ENOTCONN; lock_sock(sk); if (x25->neighbour == NULL) goto out; header_len = x25->neighbour->extended ? X25_EXT_MIN_LEN : X25_STD_MIN_LEN; /* * This works for seqpacket too. The receiver has ordered the queue for * us! We do one quick check first though */ if (sk->sk_state != TCP_ESTABLISHED) goto out; if (flags & MSG_OOB) { rc = -EINVAL; if (sock_flag(sk, SOCK_URGINLINE) || !skb_peek(&x25->interrupt_in_queue)) goto out; skb = skb_dequeue(&x25->interrupt_in_queue); if (!pskb_may_pull(skb, X25_STD_MIN_LEN)) goto out_free_dgram; skb_pull(skb, X25_STD_MIN_LEN); /* * No Q bit information on Interrupt data. */ if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { asmptr = skb_push(skb, 1); *asmptr = 0x00; } msg->msg_flags |= MSG_OOB; } else { /* Now we can treat all alike */ release_sock(sk); skb = skb_recv_datagram(sk, flags, &rc); lock_sock(sk); if (!skb) goto out; if (!pskb_may_pull(skb, header_len)) goto out_free_dgram; qbit = (skb->data[0] & X25_Q_BIT) == X25_Q_BIT; skb_pull(skb, header_len); if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { asmptr = skb_push(skb, 1); *asmptr = qbit; } } skb_reset_transport_header(skb); copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } /* Currently, each datagram always contains a complete record */ msg->msg_flags |= MSG_EOR; rc = skb_copy_datagram_msg(skb, 0, msg, copied); if (rc) goto out_free_dgram; if (sx25) { sx25->sx25_family = AF_X25; sx25->sx25_addr = x25->dest_addr; msg->msg_namelen = sizeof(*sx25); } x25_check_rbuf(sk); rc = copied; out_free_dgram: skb_free_datagram(sk, skb); out: release_sock(sk); return rc; } static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); void __user *argp = (void __user *)arg; int rc; switch (cmd) { case TIOCOUTQ: { int amount; amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (amount < 0) amount = 0; rc = put_user(amount, (unsigned int __user *)argp); break; } case TIOCINQ: { struct sk_buff *skb; int amount = 0; /* * These two are safe on a single CPU system as * only user tasks fiddle here */ lock_sock(sk); if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) amount = skb->len; release_sock(sk); rc = put_user(amount, (unsigned int __user *)argp); break; } case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: rc = -EINVAL; break; case SIOCADDRT: case SIOCDELRT: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_route_ioctl(cmd, argp); break; case SIOCX25GSUBSCRIP: rc = x25_subscr_ioctl(cmd, argp); break; case SIOCX25SSUBSCRIP: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_subscr_ioctl(cmd, argp); break; case SIOCX25GFACILITIES: { lock_sock(sk); rc = copy_to_user(argp, &x25->facilities, sizeof(x25->facilities)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SFACILITIES: { struct x25_facilities facilities; rc = -EFAULT; if (copy_from_user(&facilities, argp, sizeof(facilities))) break; rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) goto out_fac_release; if (facilities.pacsize_in < X25_PS16 || facilities.pacsize_in > X25_PS4096) goto out_fac_release; if (facilities.pacsize_out < X25_PS16 || facilities.pacsize_out > X25_PS4096) goto out_fac_release; if (facilities.winsize_in < 1 || facilities.winsize_in > 127) goto out_fac_release; if (facilities.throughput) { int out = facilities.throughput & 0xf0; int in = facilities.throughput & 0x0f; if (!out) facilities.throughput |= X25_DEFAULT_THROUGHPUT << 4; else if (out < 0x30 || out > 0xD0) goto out_fac_release; if (!in) facilities.throughput |= X25_DEFAULT_THROUGHPUT; else if (in < 0x03 || in > 0x0D) goto out_fac_release; } if (facilities.reverse && (facilities.reverse & 0x81) != 0x81) goto out_fac_release; x25->facilities = facilities; rc = 0; out_fac_release: release_sock(sk); break; } case SIOCX25GDTEFACILITIES: { lock_sock(sk); rc = copy_to_user(argp, &x25->dte_facilities, sizeof(x25->dte_facilities)); release_sock(sk); if (rc) rc = -EFAULT; break; } case SIOCX25SDTEFACILITIES: { struct x25_dte_facilities dtefacs; rc = -EFAULT; if (copy_from_user(&dtefacs, argp, sizeof(dtefacs))) break; rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) goto out_dtefac_release; if (dtefacs.calling_len > X25_MAX_AE_LEN) goto out_dtefac_release; if (dtefacs.called_len > X25_MAX_AE_LEN) goto out_dtefac_release; x25->dte_facilities = dtefacs; rc = 0; out_dtefac_release: release_sock(sk); break; } case SIOCX25GCALLUSERDATA: { lock_sock(sk); rc = copy_to_user(argp, &x25->calluserdata, sizeof(x25->calluserdata)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SCALLUSERDATA: { struct x25_calluserdata calluserdata; rc = -EFAULT; if (copy_from_user(&calluserdata, argp, sizeof(calluserdata))) break; rc = -EINVAL; if (calluserdata.cudlength > X25_MAX_CUD_LEN) break; lock_sock(sk); x25->calluserdata = calluserdata; release_sock(sk); rc = 0; break; } case SIOCX25GCAUSEDIAG: { lock_sock(sk); rc = copy_to_user(argp, &x25->causediag, sizeof(x25->causediag)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SCAUSEDIAG: { struct x25_causediag causediag; rc = -EFAULT; if (copy_from_user(&causediag, argp, sizeof(causediag))) break; lock_sock(sk); x25->causediag = causediag; release_sock(sk); rc = 0; break; } case SIOCX25SCUDMATCHLEN: { struct x25_subaddr sub_addr; rc = -EINVAL; lock_sock(sk); if(sk->sk_state != TCP_CLOSE) goto out_cud_release; rc = -EFAULT; if (copy_from_user(&sub_addr, argp, sizeof(sub_addr))) goto out_cud_release; rc = -EINVAL; if (sub_addr.cudmatchlength > X25_MAX_CUD_LEN) goto out_cud_release; x25->cudmatchlength = sub_addr.cudmatchlength; rc = 0; out_cud_release: release_sock(sk); break; } case SIOCX25CALLACCPTAPPRV: { rc = -EINVAL; lock_sock(sk); if (sk->sk_state == TCP_CLOSE) { clear_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); rc = 0; } release_sock(sk); break; } case SIOCX25SENDCALLACCPT: { rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_ESTABLISHED) goto out_sendcallaccpt_release; /* must call accptapprv above */ if (test_bit(X25_ACCPT_APPRV_FLAG, &x25->flags)) goto out_sendcallaccpt_release; x25_write_internal(sk, X25_CALL_ACCEPTED); x25->state = X25_STATE_3; rc = 0; out_sendcallaccpt_release: release_sock(sk); break; } default: rc = -ENOIOCTLCMD; break; } return rc; } static const struct net_proto_family x25_family_ops = { .family = AF_X25, .create = x25_create, .owner = THIS_MODULE, }; #ifdef CONFIG_COMPAT static int compat_x25_subscr_ioctl(unsigned int cmd, struct compat_x25_subscrip_struct __user *x25_subscr32) { struct compat_x25_subscrip_struct x25_subscr; struct x25_neigh *nb; struct net_device *dev; int rc = -EINVAL; rc = -EFAULT; if (copy_from_user(&x25_subscr, x25_subscr32, sizeof(*x25_subscr32))) goto out; rc = -EINVAL; dev = x25_dev_get(x25_subscr.device); if (dev == NULL) goto out; nb = x25_get_neigh(dev); if (nb == NULL) goto out_dev_put; dev_put(dev); if (cmd == SIOCX25GSUBSCRIP) { read_lock_bh(&x25_neigh_list_lock); x25_subscr.extended = nb->extended; x25_subscr.global_facil_mask = nb->global_facil_mask; read_unlock_bh(&x25_neigh_list_lock); rc = copy_to_user(x25_subscr32, &x25_subscr, sizeof(*x25_subscr32)) ? -EFAULT : 0; } else { rc = -EINVAL; if (x25_subscr.extended == 0 || x25_subscr.extended == 1) { rc = 0; write_lock_bh(&x25_neigh_list_lock); nb->extended = x25_subscr.extended; nb->global_facil_mask = x25_subscr.global_facil_mask; write_unlock_bh(&x25_neigh_list_lock); } } x25_neigh_put(nb); out: return rc; out_dev_put: dev_put(dev); goto out; } static int compat_x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); int rc = -ENOIOCTLCMD; switch(cmd) { case TIOCOUTQ: case TIOCINQ: rc = x25_ioctl(sock, cmd, (unsigned long)argp); break; case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: rc = -EINVAL; break; case SIOCADDRT: case SIOCDELRT: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_route_ioctl(cmd, argp); break; case SIOCX25GSUBSCRIP: rc = compat_x25_subscr_ioctl(cmd, argp); break; case SIOCX25SSUBSCRIP: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = compat_x25_subscr_ioctl(cmd, argp); break; case SIOCX25GFACILITIES: case SIOCX25SFACILITIES: case SIOCX25GDTEFACILITIES: case SIOCX25SDTEFACILITIES: case SIOCX25GCALLUSERDATA: case SIOCX25SCALLUSERDATA: case SIOCX25GCAUSEDIAG: case SIOCX25SCAUSEDIAG: case SIOCX25SCUDMATCHLEN: case SIOCX25CALLACCPTAPPRV: case SIOCX25SENDCALLACCPT: rc = x25_ioctl(sock, cmd, (unsigned long)argp); break; default: rc = -ENOIOCTLCMD; break; } return rc; } #endif static const struct proto_ops x25_proto_ops = { .family = AF_X25, .owner = THIS_MODULE, .release = x25_release, .bind = x25_bind, .connect = x25_connect, .socketpair = sock_no_socketpair, .accept = x25_accept, .getname = x25_getname, .poll = datagram_poll, .ioctl = x25_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_x25_ioctl, #endif .gettstamp = sock_gettstamp, .listen = x25_listen, .shutdown = sock_no_shutdown, .setsockopt = x25_setsockopt, .getsockopt = x25_getsockopt, .sendmsg = x25_sendmsg, .recvmsg = x25_recvmsg, .mmap = sock_no_mmap, }; static struct packet_type x25_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_X25), .func = x25_lapb_receive_frame, }; static struct notifier_block x25_dev_notifier = { .notifier_call = x25_device_event, }; void x25_kill_by_neigh(struct x25_neigh *nb) { struct sock *s; write_lock_bh(&x25_list_lock); sk_for_each(s, &x25_list) { if (x25_sk(s)->neighbour == nb) { write_unlock_bh(&x25_list_lock); lock_sock(s); x25_disconnect(s, ENETUNREACH, 0, 0); release_sock(s); write_lock_bh(&x25_list_lock); } } write_unlock_bh(&x25_list_lock); /* Remove any related forwards */ x25_clear_forward_by_dev(nb->dev); } static int __init x25_init(void) { int rc; rc = proto_register(&x25_proto, 0); if (rc) goto out; rc = sock_register(&x25_family_ops); if (rc) goto out_proto; dev_add_pack(&x25_packet_type); rc = register_netdevice_notifier(&x25_dev_notifier); if (rc) goto out_sock; rc = x25_register_sysctl(); if (rc) goto out_dev; rc = x25_proc_init(); if (rc) goto out_sysctl; pr_info("Linux Version 0.2\n"); out: return rc; out_sysctl: x25_unregister_sysctl(); out_dev: unregister_netdevice_notifier(&x25_dev_notifier); out_sock: dev_remove_pack(&x25_packet_type); sock_unregister(AF_X25); out_proto: proto_unregister(&x25_proto); goto out; } module_init(x25_init); static void __exit x25_exit(void) { x25_proc_exit(); x25_link_free(); x25_route_free(); x25_unregister_sysctl(); unregister_netdevice_notifier(&x25_dev_notifier); dev_remove_pack(&x25_packet_type); sock_unregister(AF_X25); proto_unregister(&x25_proto); } module_exit(x25_exit); MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The X.25 Packet Layer network layer protocol"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_X25);
705 706 2 705 701 703 2 2 172 19 159 265 59 58 58 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 // SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/symlink.c - sysfs symlink implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/fs.h> #include <linux/module.h> #include <linux/kobject.h> #include <linux/mutex.h> #include <linux/security.h> #include "sysfs.h" static int sysfs_do_create_link_sd(struct kernfs_node *parent, struct kobject *target_kobj, const char *name, int warn) { struct kernfs_node *kn, *target = NULL; if (WARN_ON(!name || !parent)) return -EINVAL; /* * We don't own @target_kobj and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); if (target_kobj->sd) { target = target_kobj->sd; kernfs_get(target); } spin_unlock(&sysfs_symlink_target_lock); if (!target) return -ENOENT; kn = kernfs_create_link(parent, name, target); kernfs_put(target); if (!IS_ERR(kn)) return 0; if (warn && PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, name); return PTR_ERR(kn); } /** * sysfs_create_link_sd - create symlink to a given object. * @kn: directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. */ int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target, const char *name) { return sysfs_do_create_link_sd(kn, target, name, 1); } static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, const char *name, int warn) { struct kernfs_node *parent = NULL; if (!kobj) parent = sysfs_root_kn; else parent = kobj->sd; if (!parent) return -EFAULT; return sysfs_do_create_link_sd(parent, target, name, warn); } /** * sysfs_create_link - create symlink between two objects. * @kobj: object whose directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. */ int sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name) { return sysfs_do_create_link(kobj, target, name, 1); } EXPORT_SYMBOL_GPL(sysfs_create_link); /** * sysfs_create_link_nowarn - create symlink between two objects. * @kobj: object whose directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. * * This function does the same as sysfs_create_link(), but it * doesn't warn if the link already exists. */ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name) { return sysfs_do_create_link(kobj, target, name, 0); } EXPORT_SYMBOL_GPL(sysfs_create_link_nowarn); /** * sysfs_delete_link - remove symlink in object's directory. * @kobj: object we're acting for. * @targ: object we're pointing to. * @name: name of the symlink to remove. * * Unlike sysfs_remove_link sysfs_delete_link has enough information * to successfully delete symlinks in tagged directories. */ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, const char *name) { const void *ns = NULL; /* * We don't own @target and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); if (targ->sd && kernfs_ns_enabled(kobj->sd)) ns = targ->sd->ns; spin_unlock(&sysfs_symlink_target_lock); kernfs_remove_by_name_ns(kobj->sd, name, ns); } /** * sysfs_remove_link - remove symlink in object's directory. * @kobj: object we're acting for. * @name: name of the symlink to remove. */ void sysfs_remove_link(struct kobject *kobj, const char *name) { struct kernfs_node *parent = NULL; if (!kobj) parent = sysfs_root_kn; else parent = kobj->sd; kernfs_remove_by_name(parent, name); } EXPORT_SYMBOL_GPL(sysfs_remove_link); /** * sysfs_rename_link_ns - rename symlink in object's directory. * @kobj: object we're acting for. * @targ: object we're pointing to. * @old: previous name of the symlink. * @new: new name of the symlink. * @new_ns: new namespace of the symlink. * * A helper function for the common rename symlink idiom. */ int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, const char *old, const char *new, const void *new_ns) { struct kernfs_node *parent, *kn = NULL; const void *old_ns = NULL; int result; if (!kobj) parent = sysfs_root_kn; else parent = kobj->sd; if (targ->sd) old_ns = targ->sd->ns; result = -ENOENT; kn = kernfs_find_and_get_ns(parent, old, old_ns); if (!kn) goto out; result = -EINVAL; if (kernfs_type(kn) != KERNFS_LINK) goto out; if (kn->symlink.target_kn->priv != targ) goto out; result = kernfs_rename_ns(kn, parent, new, new_ns); out: kernfs_put(kn); return result; } EXPORT_SYMBOL_GPL(sysfs_rename_link_ns);
5 5 50 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BLK_MQ_SCHED_H #define BLK_MQ_SCHED_H #include "elevator.h" #include "blk-mq.h" #define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ) bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, struct list_head *free); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, struct elevator_resources *res); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_sched_free_rqs(struct request_queue *q); struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, unsigned int nr_hw_queues, unsigned int nr_requests); int blk_mq_alloc_sched_res(struct request_queue *q, struct elevator_type *type, struct elevator_resources *res, unsigned int nr_hw_queues); int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set, unsigned int nr_hw_queues); int blk_mq_alloc_sched_ctx_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set); void blk_mq_free_sched_ctx_batch(struct xarray *elv_tbl); void blk_mq_free_sched_tags(struct elevator_tags *et, struct blk_mq_tag_set *set); void blk_mq_free_sched_res(struct elevator_resources *res, struct elevator_type *type, struct blk_mq_tag_set *set); void blk_mq_free_sched_res_batch(struct xarray *et_table, struct blk_mq_tag_set *set); /* * blk_mq_alloc_sched_data() - Allocates scheduler specific data * Returns: * - Pointer to allocated data on success * - NULL if no allocation needed * - ERR_PTR(-ENOMEM) in case of failure */ static inline void *blk_mq_alloc_sched_data(struct request_queue *q, struct elevator_type *e) { void *sched_data; if (!e || !e->ops.alloc_sched_data) return NULL; sched_data = e->ops.alloc_sched_data(q); return (sched_data) ?: ERR_PTR(-ENOMEM); } static inline void blk_mq_free_sched_data(struct elevator_type *e, void *data) { if (e && e->ops.free_sched_data) e->ops.free_sched_data(data); } static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) __blk_mq_sched_restart(hctx); } static inline bool bio_mergeable(struct bio *bio) { return !(bio->bi_opf & REQ_NOMERGE_FLAGS); } static inline bool blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = q->elevator; if (e->type->ops.allow_merge) return e->type->ops.allow_merge(q, rq, bio); } return true; } static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) { if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = rq->q->elevator; if (e->type->ops.completed_request) e->type->ops.completed_request(rq, now); } } static inline void blk_mq_sched_requeue_request(struct request *rq) { if (rq->rq_flags & RQF_USE_SCHED) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; if (e->type->ops.requeue_request) e->type->ops.requeue_request(rq); } } static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) { struct elevator_queue *e = hctx->queue->elevator; if (e && e->type->ops.has_work) return e->type->ops.has_work(hctx); return false; } static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) { return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } static inline void blk_mq_set_min_shallow_depth(struct request_queue *q, unsigned int depth) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags, depth); } static inline bool blk_mq_is_sync_read(blk_opf_t opf) { return op_is_sync(opf) && !op_is_write(opf); } #endif
20 20 20 20 20 20 20 20 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 // SPDX-License-Identifier: GPL-2.0-or-later /* In-software asymmetric public-key crypto subtype * * See Documentation/crypto/asymmetric-keys.rst * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) "PKEY: "fmt #include <crypto/akcipher.h> #include <crypto/public_key.h> #include <crypto/sig.h> #include <keys/asymmetric-subtype.h> #include <linux/asn1.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/string.h> MODULE_DESCRIPTION("In-software asymmetric public-key subtype"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); /* * Provide a part of a description of the key for /proc/keys. */ static void public_key_describe(const struct key *asymmetric_key, struct seq_file *m) { struct public_key *key = asymmetric_key->payload.data[asym_crypto]; if (key) seq_printf(m, "%s.%s", key->id_type, key->pkey_algo); } /* * Destroy a public key algorithm key. */ void public_key_free(struct public_key *key) { if (key) { kfree_sensitive(key->key); kfree(key->params); kfree(key); } } EXPORT_SYMBOL_GPL(public_key_free); /* * Destroy a public key algorithm key. */ static void public_key_destroy(void *payload0, void *payload3) { public_key_free(payload0); public_key_signature_free(payload3); } /* * Given a public_key, and an encoding and hash_algo to be used for signing * and/or verification with that key, determine the name of the corresponding * akcipher algorithm. Also check that encoding and hash_algo are allowed. */ static int software_key_determine_akcipher(const struct public_key *pkey, const char *encoding, const char *hash_algo, char alg_name[CRYPTO_MAX_ALG_NAME], bool *sig, enum kernel_pkey_operation op) { int n; *sig = true; if (!encoding) return -EINVAL; if (strcmp(pkey->pkey_algo, "rsa") == 0) { /* * RSA signatures usually use EMSA-PKCS1-1_5 [RFC3447 sec 8.2]. */ if (strcmp(encoding, "pkcs1") == 0) { *sig = op == kernel_pkey_sign || op == kernel_pkey_verify; if (!*sig) { /* * For encrypt/decrypt, hash_algo is not used * but allowed to be set for historic reasons. */ n = snprintf(alg_name, CRYPTO_MAX_ALG_NAME, "pkcs1pad(%s)", pkey->pkey_algo); } else { if (!hash_algo) hash_algo = "none"; n = snprintf(alg_name, CRYPTO_MAX_ALG_NAME, "pkcs1(%s,%s)", pkey->pkey_algo, hash_algo); } return n >= CRYPTO_MAX_ALG_NAME ? -EINVAL : 0; } if (strcmp(encoding, "raw") != 0) return -EINVAL; /* * Raw RSA cannot differentiate between different hash * algorithms. */ if (hash_algo) return -EINVAL; *sig = false; } else if (strncmp(pkey->pkey_algo, "ecdsa", 5) == 0) { if (strcmp(encoding, "x962") != 0 && strcmp(encoding, "p1363") != 0) return -EINVAL; /* * ECDSA signatures are taken over a raw hash, so they don't * differentiate between different hash algorithms. That means * that the verifier should hard-code a specific hash algorithm. * Unfortunately, in practice ECDSA is used with multiple SHAs, * so we have to allow all of them and not just one. */ if (!hash_algo) return -EINVAL; if (strcmp(hash_algo, "sha1") != 0 && strcmp(hash_algo, "sha224") != 0 && strcmp(hash_algo, "sha256") != 0 && strcmp(hash_algo, "sha384") != 0 && strcmp(hash_algo, "sha512") != 0 && strcmp(hash_algo, "sha3-256") != 0 && strcmp(hash_algo, "sha3-384") != 0 && strcmp(hash_algo, "sha3-512") != 0) return -EINVAL; n = snprintf(alg_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", encoding, pkey->pkey_algo); return n >= CRYPTO_MAX_ALG_NAME ? -EINVAL : 0; } else if (strcmp(pkey->pkey_algo, "ecrdsa") == 0) { if (strcmp(encoding, "raw") != 0) return -EINVAL; if (!hash_algo) return -EINVAL; if (strcmp(hash_algo, "streebog256") != 0 && strcmp(hash_algo, "streebog512") != 0) return -EINVAL; } else if (strcmp(pkey->pkey_algo, "mldsa44") == 0 || strcmp(pkey->pkey_algo, "mldsa65") == 0 || strcmp(pkey->pkey_algo, "mldsa87") == 0) { if (strcmp(encoding, "raw") != 0) return -EINVAL; if (!hash_algo) return -EINVAL; if (strcmp(hash_algo, "none") != 0 && strcmp(hash_algo, "sha512") != 0) return -EINVAL; } else { /* Unknown public key algorithm */ return -ENOPKG; } if (strscpy(alg_name, pkey->pkey_algo, CRYPTO_MAX_ALG_NAME) < 0) return -EINVAL; return 0; } static u8 *pkey_pack_u32(u8 *dst, u32 val) { memcpy(dst, &val, sizeof(val)); return dst + sizeof(val); } /* * Query information about a key. */ static int software_key_query(const struct kernel_pkey_params *params, struct kernel_pkey_query *info) { struct public_key *pkey = params->key->payload.data[asym_crypto]; char alg_name[CRYPTO_MAX_ALG_NAME]; u8 *key, *ptr; int ret, len; bool issig; ret = software_key_determine_akcipher(pkey, params->encoding, params->hash_algo, alg_name, &issig, kernel_pkey_sign); if (ret < 0) return ret; key = kmalloc(pkey->keylen + sizeof(u32) * 2 + pkey->paramlen, GFP_KERNEL); if (!key) return -ENOMEM; memcpy(key, pkey->key, pkey->keylen); ptr = key + pkey->keylen; ptr = pkey_pack_u32(ptr, pkey->algo); ptr = pkey_pack_u32(ptr, pkey->paramlen); memcpy(ptr, pkey->params, pkey->paramlen); memset(info, 0, sizeof(*info)); if (issig) { struct crypto_sig *sig; sig = crypto_alloc_sig(alg_name, 0, 0); if (IS_ERR(sig)) { ret = PTR_ERR(sig); goto error_free_key; } if (pkey->key_is_private) ret = crypto_sig_set_privkey(sig, key, pkey->keylen); else ret = crypto_sig_set_pubkey(sig, key, pkey->keylen); if (ret < 0) goto error_free_sig; len = crypto_sig_keysize(sig); info->key_size = len; info->max_sig_size = crypto_sig_maxsize(sig); info->max_data_size = crypto_sig_digestsize(sig); info->supported_ops = KEYCTL_SUPPORTS_VERIFY; if (pkey->key_is_private) info->supported_ops |= KEYCTL_SUPPORTS_SIGN; if (strcmp(params->encoding, "pkcs1") == 0) { info->max_enc_size = len / BITS_PER_BYTE; info->max_dec_size = len / BITS_PER_BYTE; info->supported_ops |= KEYCTL_SUPPORTS_ENCRYPT; if (pkey->key_is_private) info->supported_ops |= KEYCTL_SUPPORTS_DECRYPT; } error_free_sig: crypto_free_sig(sig); } else { struct crypto_akcipher *tfm; tfm = crypto_alloc_akcipher(alg_name, 0, 0); if (IS_ERR(tfm)) { ret = PTR_ERR(tfm); goto error_free_key; } if (pkey->key_is_private) ret = crypto_akcipher_set_priv_key(tfm, key, pkey->keylen); else ret = crypto_akcipher_set_pub_key(tfm, key, pkey->keylen); if (ret < 0) goto error_free_akcipher; len = crypto_akcipher_maxsize(tfm); info->key_size = len * BITS_PER_BYTE; info->max_sig_size = len; info->max_data_size = len; info->max_enc_size = len; info->max_dec_size = len; info->supported_ops = KEYCTL_SUPPORTS_ENCRYPT; if (pkey->key_is_private) info->supported_ops |= KEYCTL_SUPPORTS_DECRYPT; error_free_akcipher: crypto_free_akcipher(tfm); } error_free_key: kfree_sensitive(key); pr_devel("<==%s() = %d\n", __func__, ret); return ret; } /* * Do encryption, decryption and signing ops. */ static int software_key_eds_op(struct kernel_pkey_params *params, const void *in, void *out) { const struct public_key *pkey = params->key->payload.data[asym_crypto]; char alg_name[CRYPTO_MAX_ALG_NAME]; struct crypto_akcipher *tfm; struct crypto_sig *sig; char *key, *ptr; bool issig; int ret; pr_devel("==>%s()\n", __func__); ret = software_key_determine_akcipher(pkey, params->encoding, params->hash_algo, alg_name, &issig, params->op); if (ret < 0) return ret; key = kmalloc(pkey->keylen + sizeof(u32) * 2 + pkey->paramlen, GFP_KERNEL); if (!key) return -ENOMEM; memcpy(key, pkey->key, pkey->keylen); ptr = key + pkey->keylen; ptr = pkey_pack_u32(ptr, pkey->algo); ptr = pkey_pack_u32(ptr, pkey->paramlen); memcpy(ptr, pkey->params, pkey->paramlen); if (issig) { sig = crypto_alloc_sig(alg_name, 0, 0); if (IS_ERR(sig)) { ret = PTR_ERR(sig); goto error_free_key; } if (pkey->key_is_private) ret = crypto_sig_set_privkey(sig, key, pkey->keylen); else ret = crypto_sig_set_pubkey(sig, key, pkey->keylen); if (ret) goto error_free_tfm; } else { tfm = crypto_alloc_akcipher(alg_name, 0, 0); if (IS_ERR(tfm)) { ret = PTR_ERR(tfm); goto error_free_key; } if (pkey->key_is_private) ret = crypto_akcipher_set_priv_key(tfm, key, pkey->keylen); else ret = crypto_akcipher_set_pub_key(tfm, key, pkey->keylen); if (ret) goto error_free_tfm; } ret = -EINVAL; /* Perform the encryption calculation. */ switch (params->op) { case kernel_pkey_encrypt: if (issig) break; ret = crypto_akcipher_sync_encrypt(tfm, in, params->in_len, out, params->out_len); break; case kernel_pkey_decrypt: if (issig) break; ret = crypto_akcipher_sync_decrypt(tfm, in, params->in_len, out, params->out_len); break; case kernel_pkey_sign: if (!issig) break; ret = crypto_sig_sign(sig, in, params->in_len, out, params->out_len); break; default: BUG(); } if (!issig && ret == 0) ret = crypto_akcipher_maxsize(tfm); error_free_tfm: if (issig) crypto_free_sig(sig); else crypto_free_akcipher(tfm); error_free_key: kfree_sensitive(key); pr_devel("<==%s() = %d\n", __func__, ret); return ret; } /* * Verify a signature using a public key. */ int public_key_verify_signature(const struct public_key *pkey, const struct public_key_signature *sig) { char alg_name[CRYPTO_MAX_ALG_NAME]; struct crypto_sig *tfm; char *key, *ptr; bool issig; int ret; pr_devel("==>%s()\n", __func__); BUG_ON(!pkey); BUG_ON(!sig); BUG_ON(!sig->s); /* * If the signature specifies a public key algorithm, it *must* match * the key's actual public key algorithm. * * Small exception: ECDSA signatures don't specify the curve, but ECDSA * keys do. So the strings can mismatch slightly in that case: * "ecdsa-nist-*" for the key, but "ecdsa" for the signature. */ if (sig->pkey_algo) { if (strcmp(pkey->pkey_algo, sig->pkey_algo) != 0 && (strncmp(pkey->pkey_algo, "ecdsa-", 6) != 0 || strcmp(sig->pkey_algo, "ecdsa") != 0)) return -EKEYREJECTED; } ret = software_key_determine_akcipher(pkey, sig->encoding, sig->hash_algo, alg_name, &issig, kernel_pkey_verify); if (ret < 0) return ret; tfm = crypto_alloc_sig(alg_name, 0, 0); if (IS_ERR(tfm)) return PTR_ERR(tfm); key = kmalloc(pkey->keylen + sizeof(u32) * 2 + pkey->paramlen, GFP_KERNEL); if (!key) { ret = -ENOMEM; goto error_free_tfm; } memcpy(key, pkey->key, pkey->keylen); ptr = key + pkey->keylen; ptr = pkey_pack_u32(ptr, pkey->algo); ptr = pkey_pack_u32(ptr, pkey->paramlen); memcpy(ptr, pkey->params, pkey->paramlen); if (pkey->key_is_private) ret = crypto_sig_set_privkey(tfm, key, pkey->keylen); else ret = crypto_sig_set_pubkey(tfm, key, pkey->keylen); if (ret) goto error_free_key; ret = crypto_sig_verify(tfm, sig->s, sig->s_size, sig->m, sig->m_size); error_free_key: kfree_sensitive(key); error_free_tfm: crypto_free_sig(tfm); pr_devel("<==%s() = %d\n", __func__, ret); if (WARN_ON_ONCE(ret > 0)) ret = -EINVAL; return ret; } EXPORT_SYMBOL_GPL(public_key_verify_signature); static int public_key_verify_signature_2(const struct key *key, const struct public_key_signature *sig) { const struct public_key *pk = key->payload.data[asym_crypto]; return public_key_verify_signature(pk, sig); } /* * Public key algorithm asymmetric key subtype */ struct asymmetric_key_subtype public_key_subtype = { .owner = THIS_MODULE, .name = "public_key", .name_len = sizeof("public_key") - 1, .describe = public_key_describe, .destroy = public_key_destroy, .query = software_key_query, .eds_op = software_key_eds_op, .verify_signature = public_key_verify_signature_2, }; EXPORT_SYMBOL_GPL(public_key_subtype);
12 8 4 2 10 10 1 3 9 3 9 12 12 12 12 12 12 4 8 449 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 // SPDX-License-Identifier: GPL-2.0-or-later /* * tsacct.c - System accounting over taskstats interface * * Copyright (C) Jay Lan, <jlan@sgi.com> */ #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/sched/cputime.h> #include <linux/tsacct_kern.h> #include <linux/acct.h> #include <linux/jiffies.h> #include <linux/mm.h> /* * fill in basic accounting fields */ void bacct_add_tsk(struct user_namespace *user_ns, struct pid_namespace *pid_ns, struct taskstats *stats, struct task_struct *tsk) { const struct cred *tcred; u64 utime, stime, utimescaled, stimescaled; u64 now_ns, delta; time64_t btime; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); /* calculate task elapsed time in nsec */ now_ns = ktime_get_ns(); /* store whole group time first */ delta = now_ns - tsk->group_leader->start_time; /* Convert to micro seconds */ do_div(delta, NSEC_PER_USEC); stats->ac_tgetime = delta; delta = now_ns - tsk->start_time; do_div(delta, NSEC_PER_USEC); stats->ac_etime = delta; /* Convert to seconds for btime (note y2106 limit) */ btime = ktime_get_real_seconds() - div_u64(delta, USEC_PER_SEC); stats->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX); stats->ac_btime64 = btime; if (tsk->flags & PF_EXITING) stats->ac_exitcode = tsk->exit_code; if (thread_group_leader(tsk) && (tsk->flags & PF_FORKNOEXEC)) stats->ac_flag |= AFORK; if (tsk->flags & PF_SUPERPRIV) stats->ac_flag |= ASU; if (tsk->flags & PF_DUMPCORE) stats->ac_flag |= ACORE; if (tsk->flags & PF_SIGNALED) stats->ac_flag |= AXSIG; stats->ac_nice = task_nice(tsk); stats->ac_sched = tsk->policy; stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns); stats->ac_ppid = task_ppid_nr_ns(tsk, pid_ns); rcu_read_lock(); tcred = __task_cred(tsk); stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); stats->ac_gid = from_kgid_munged(user_ns, tcred->gid); rcu_read_unlock(); task_cputime(tsk, &utime, &stime); stats->ac_utime = div_u64(utime, NSEC_PER_USEC); stats->ac_stime = div_u64(stime, NSEC_PER_USEC); task_cputime_scaled(tsk, &utimescaled, &stimescaled); stats->ac_utimescaled = div_u64(utimescaled, NSEC_PER_USEC); stats->ac_stimescaled = div_u64(stimescaled, NSEC_PER_USEC); stats->ac_minflt = tsk->min_flt; stats->ac_majflt = tsk->maj_flt; strscpy_pad(stats->ac_comm, tsk->comm); } #ifdef CONFIG_TASK_XACCT #define KB 1024 #define MB (1024*KB) #define KB_MASK (~(KB-1)) /* * fill in extended accounting fields */ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) { struct mm_struct *mm; /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */ stats->coremem = p->acct_rss_mem1 * PAGE_SIZE; do_div(stats->coremem, 1000 * KB); stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE; do_div(stats->virtmem, 1000 * KB); mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB; stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; mmput(mm); } stats->read_char = p->ioac.rchar & KB_MASK; stats->write_char = p->ioac.wchar & KB_MASK; stats->read_syscalls = p->ioac.syscr & KB_MASK; stats->write_syscalls = p->ioac.syscw & KB_MASK; #ifdef CONFIG_TASK_IO_ACCOUNTING stats->read_bytes = p->ioac.read_bytes & KB_MASK; stats->write_bytes = p->ioac.write_bytes & KB_MASK; stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes & KB_MASK; #else stats->read_bytes = 0; stats->write_bytes = 0; stats->cancelled_write_bytes = 0; #endif } #undef KB #undef MB static void __acct_update_integrals(struct task_struct *tsk, u64 utime, u64 stime) { u64 time, delta; if (unlikely(!tsk->mm || (tsk->flags & PF_KTHREAD))) return; time = stime + utime; delta = time - tsk->acct_timexpd; if (delta < TICK_NSEC) return; tsk->acct_timexpd = time; /* * Divide by 1024 to avoid overflow, and to avoid division. * The final unit reported to userspace is Mbyte-usecs, * the rest of the math is done in xacct_add_tsk. */ tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; tsk->acct_vm_mem1 += delta * READ_ONCE(tsk->mm->total_vm) >> 10; } /** * acct_update_integrals - update mm integral fields in task_struct * @tsk: task_struct for accounting */ void acct_update_integrals(struct task_struct *tsk) { u64 utime, stime; unsigned long flags; local_irq_save(flags); task_cputime(tsk, &utime, &stime); __acct_update_integrals(tsk, utime, stime); local_irq_restore(flags); } /** * acct_account_cputime - update mm integral after cputime update * @tsk: task_struct for accounting */ void acct_account_cputime(struct task_struct *tsk) { __acct_update_integrals(tsk, tsk->utime, tsk->stime); } /** * acct_clear_integrals - clear the mm integral fields in task_struct * @tsk: task_struct whose accounting fields are cleared */ void acct_clear_integrals(struct task_struct *tsk) { tsk->acct_timexpd = 0; tsk->acct_rss_mem1 = 0; tsk->acct_vm_mem1 = 0; } #endif
63 21 2 256 1490 10885 1494 11130 262 1491 11385 11238 11111 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_H #define _LINUX_SCHED_H /* * Define 'struct task_struct' and provide the main scheduler * APIs (schedule(), wakeup variants, etc.) */ #include <uapi/linux/sched.h> #include <asm/current.h> #include <asm/processor.h> #include <linux/thread_info.h> #include <linux/preempt.h> #include <linux/cpumask_types.h> #include <linux/cache.h> #include <linux/irqflags_types.h> #include <linux/smp_types.h> #include <linux/pid_types.h> #include <linux/sem_types.h> #include <linux/shm.h> #include <linux/kmsan_types.h> #include <linux/mutex_types.h> #include <linux/plist_types.h> #include <linux/hrtimer_types.h> #include <linux/timer_types.h> #include <linux/seccomp_types.h> #include <linux/nodemask_types.h> #include <linux/refcount_types.h> #include <linux/resource.h> #include <linux/latencytop.h> #include <linux/sched/prio.h> #include <linux/sched/types.h> #include <linux/signal_types.h> #include <linux/spinlock.h> #include <linux/syscall_user_dispatch_types.h> #include <linux/mm_types_task.h> #include <linux/netdevice_xmit.h> #include <linux/task_io_accounting.h> #include <linux/posix-timers_types.h> #include <linux/restart_block.h> #include <linux/rseq_types.h> #include <linux/seqlock_types.h> #include <linux/kcsan.h> #include <linux/rv.h> #include <linux/uidgid_types.h> #include <linux/tracepoint-defs.h> #include <linux/unwind_deferred_types.h> #include <asm/kmap_size.h> #include <linux/time64.h> #ifndef COMPILE_OFFSETS #include <generated/rq-offsets.h> #endif /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; struct bio_list; struct blk_plug; struct bpf_local_storage; struct bpf_run_ctx; struct bpf_net_context; struct capture_control; struct cfs_rq; struct fs_struct; struct futex_pi_state; struct io_context; struct io_uring_task; struct mempolicy; struct nameidata; struct nsproxy; struct perf_event_context; struct perf_ctx_data; struct pid_namespace; struct pipe_inode_info; struct rcu_node; struct reclaim_state; struct robust_list_head; struct root_domain; struct rq; struct sched_attr; struct sched_dl_entity; struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; struct task_struct; struct timespec64; struct user_event_mm; #include <linux/sched/ext.h> /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). * * We have two separate sets of flags: task->__state * is about runnability, while task->exit_state are * about the task exiting. Confusing, but this way * modifying one set can't modify the other one by * mistake. */ /* Used in tsk->__state: */ #define TASK_RUNNING 0x00000000 #define TASK_INTERRUPTIBLE 0x00000001 #define TASK_UNINTERRUPTIBLE 0x00000002 #define __TASK_STOPPED 0x00000004 #define __TASK_TRACED 0x00000008 /* Used in tsk->exit_state: */ #define EXIT_DEAD 0x00000010 #define EXIT_ZOMBIE 0x00000020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->__state again: */ #define TASK_PARKED 0x00000040 #define TASK_DEAD 0x00000080 #define TASK_WAKEKILL 0x00000100 #define TASK_WAKING 0x00000200 #define TASK_NOLOAD 0x00000400 #define TASK_NEW 0x00000800 #define TASK_RTLOCK_WAIT 0x00001000 #define TASK_FREEZABLE 0x00002000 #define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP)) #define TASK_FROZEN 0x00008000 #define TASK_STATE_MAX 0x00010000 #define TASK_ANY (TASK_STATE_MAX-1) /* * DO NOT ADD ANY NEW USERS ! */ #define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE) /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED __TASK_TRACED #define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) /* Convenience macros for the sake of wake_up(): */ #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ TASK_PARKED) #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) #define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) #define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0) #define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0) /* * Special states are those that do not use the normal wait-loop pattern. See * the comment with set_special_state(). */ #define is_special_task_state(state) \ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ TASK_DEAD | TASK_FROZEN)) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP # define debug_normal_state_change(state_value) \ do { \ WARN_ON_ONCE(is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_special_state_change(state_value) \ do { \ WARN_ON_ONCE(!is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_set_state() \ do { \ current->saved_state_change = current->task_state_change;\ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_restore_state() \ do { \ current->task_state_change = current->saved_state_change;\ } while (0) #else # define debug_normal_state_change(cond) do { } while (0) # define debug_special_state_change(cond) do { } while (0) # define debug_rtlock_wait_set_state() do { } while (0) # define debug_rtlock_wait_restore_state() do { } while (0) #endif #define trace_set_current_state(state_value) \ do { \ if (tracepoint_enabled(sched_set_state_tp)) \ __trace_set_current_state(state_value); \ } while (0) /* * set_current_state() includes a barrier so that the write of current->__state * is correctly serialised wrt the caller's subsequent test of whether to * actually sleep: * * for (;;) { * set_current_state(TASK_UNINTERRUPTIBLE); * if (CONDITION) * break; * * schedule(); * } * __set_current_state(TASK_RUNNING); * * If the caller does not need such serialisation (because, for instance, the * CONDITION test and condition change and wakeup are under the same lock) then * use __set_current_state(). * * The above is typically ordered against the wakeup, which does: * * CONDITION = 1; * wake_up_state(p, TASK_UNINTERRUPTIBLE); * * where wake_up_state()/try_to_wake_up() executes a full memory barrier before * accessing p->__state. * * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is, * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). * * However, with slightly different timing the wakeup TASK_RUNNING store can * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not * a problem either because that will result in one extra go around the loop * and our @cond test will save the day. * * Also see the comments of try_to_wake_up(). */ #define __set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ } while (0) #define set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ trace_set_current_state(state_value); \ smp_store_mb(current->__state, (state_value)); \ } while (0) /* * set_special_state() should be used for those states when the blocking task * can not use the regular condition based wait-loop. In that case we must * serialize against wakeups such that any possible in-flight TASK_RUNNING * stores will not collide with our state change. */ #define set_special_state(state_value) \ do { \ unsigned long flags; /* may shadow */ \ \ raw_spin_lock_irqsave(&current->pi_lock, flags); \ debug_special_state_change((state_value)); \ trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ raw_spin_unlock_irqrestore(&current->pi_lock, flags); \ } while (0) /* * PREEMPT_RT specific variants for "sleeping" spin/rwlocks * * RT's spin/rwlock substitutions are state preserving. The state of the * task when blocking on the lock is saved in task_struct::saved_state and * restored after the lock has been acquired. These operations are * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT * lock related wakeups while the task is blocked on the lock are * redirected to operate on task_struct::saved_state to ensure that these * are not dropped. On restore task_struct::saved_state is set to * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail. * * The lock operation looks like this: * * current_save_and_set_rtlock_wait_state(); * for (;;) { * if (try_lock()) * break; * raw_spin_unlock_irq(&lock->wait_lock); * schedule_rtlock(); * raw_spin_lock_irq(&lock->wait_lock); * set_current_state(TASK_RTLOCK_WAIT); * } * current_restore_rtlock_saved_state(); */ #define current_save_and_set_rtlock_wait_state() \ do { \ lockdep_assert_irqs_disabled(); \ raw_spin_lock(&current->pi_lock); \ current->saved_state = current->__state; \ debug_rtlock_wait_set_state(); \ trace_set_current_state(TASK_RTLOCK_WAIT); \ WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \ raw_spin_unlock(&current->pi_lock); \ } while (0); #define current_restore_rtlock_saved_state() \ do { \ lockdep_assert_irqs_disabled(); \ raw_spin_lock(&current->pi_lock); \ debug_rtlock_wait_restore_state(); \ trace_set_current_state(current->saved_state); \ WRITE_ONCE(current->__state, current->saved_state); \ current->saved_state = TASK_RUNNING; \ raw_spin_unlock(&current->pi_lock); \ } while (0); #define get_current_state() READ_ONCE(current->__state) /* * Define the task command name length as enum, then it can be visible to * BPF programs. */ enum { TASK_COMM_LEN = 16, }; extern void sched_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern long schedule_timeout(long timeout); extern long schedule_timeout_interruptible(long timeout); extern long schedule_timeout_killable(long timeout); extern long schedule_timeout_uninterruptible(long timeout); extern long schedule_timeout_idle(long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); asmlinkage void preempt_schedule_irq(void); #ifdef CONFIG_PREEMPT_RT extern void schedule_rtlock(void); #endif extern int __must_check io_schedule_prepare(void); extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); extern void io_schedule(void); /* wrapper functions to trace from this header file */ DECLARE_TRACEPOINT(sched_set_state_tp); extern void __trace_set_current_state(int state_value); DECLARE_TRACEPOINT(sched_set_need_resched_tp); extern void __trace_set_need_resched(struct task_struct *curr, int tif); /** * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode * @stime: time spent in system mode * @lock: protects the above two fields * * Stores previous user/system time values such that we can guarantee * monotonicity. */ struct prev_cputime { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE u64 utime; u64 stime; raw_spinlock_t lock; #endif }; enum vtime_state { /* Task is sleeping or running in a CPU with VTIME inactive: */ VTIME_INACTIVE = 0, /* Task is idle */ VTIME_IDLE, /* Task runs in kernelspace in a CPU with VTIME active: */ VTIME_SYS, /* Task runs in userspace in a CPU with VTIME active: */ VTIME_USER, /* Task runs as guests in a CPU with VTIME active: */ VTIME_GUEST, }; struct vtime { seqcount_t seqcount; unsigned long long starttime; enum vtime_state state; unsigned int cpu; u64 utime; u64 stime; u64 gtime; }; /* * Utilization clamp constraints. * @UCLAMP_MIN: Minimum utilization * @UCLAMP_MAX: Maximum utilization * @UCLAMP_CNT: Utilization clamp constraints count */ enum uclamp_id { UCLAMP_MIN = 0, UCLAMP_MAX, UCLAMP_CNT }; extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; extern void sched_domains_mutex_lock(void); extern void sched_domains_mutex_unlock(void); struct sched_param { int sched_priority; }; struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ /* # of times we have run on this CPU: */ unsigned long pcount; /* Time spent waiting on a runqueue: */ unsigned long long run_delay; /* Max time spent waiting on a runqueue: */ unsigned long long max_run_delay; /* Min time spent waiting on a runqueue: */ unsigned long long min_run_delay; /* Timestamps: */ /* When did we last run on a CPU? */ unsigned long long last_arrival; /* When were we last queued to run? */ unsigned long long last_queued; /* Timestamp of max time spent waiting on a runqueue: */ struct timespec64 max_run_delay_ts; #endif /* CONFIG_SCHED_INFO */ }; /* * Integer metrics need fixed point arithmetic, e.g., sched/fair * has a few: load, load_avg, util_avg, freq, and capacity. * * We define a basic fixed point arithmetic range, and then formalize * all these metrics based on that basic range. */ # define SCHED_FIXEDPOINT_SHIFT 10 # define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) /* Increase resolution of cpu_capacity calculations */ # define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT # define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) struct load_weight { unsigned long weight; u32 inv_weight; }; /* * The load/runnable/util_avg accumulates an infinite geometric series * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). * * [load_avg definition] * * load_avg = runnable% * scale_load_down(load) * * [runnable_avg definition] * * runnable_avg = runnable% * SCHED_CAPACITY_SCALE * * [util_avg definition] * * util_avg = running% * SCHED_CAPACITY_SCALE * * where runnable% is the time ratio that a sched_entity is runnable and * running% the time ratio that a sched_entity is running. * * For cfs_rq, they are the aggregated values of all runnable and blocked * sched_entities. * * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU * capacity scaling. The scaling is done through the rq_clock_pelt that is used * for computing those signals (see update_rq_clock_pelt()) * * N.B., the above ratios (runnable% and running%) themselves are in the * range of [0, 1]. To do fixed point arithmetics, we therefore scale them * to as large a range as necessary. This is for example reflected by * util_avg's SCHED_CAPACITY_SCALE. * * [Overflow issue] * * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities * with the highest load (=88761), always runnable on a single cfs_rq, * and should not overflow as the number already hits PID_MAX_LIMIT. * * For all other cases (including 32-bit kernels), struct load_weight's * weight will overflow first before we do, because: * * Max(load_avg) <= Max(load.weight) * * Then it is the load_weight's responsibility to consider overflow * issues. */ struct sched_avg { u64 last_update_time; u64 load_sum; u64 runnable_sum; u32 util_sum; u32 period_contrib; unsigned long load_avg; unsigned long runnable_avg; unsigned long util_avg; unsigned int util_est; } ____cacheline_aligned; /* * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg * updates. When a task is dequeued, its util_est should not be updated if its * util_avg has not been updated in the meantime. * This information is mapped into the MSB bit of util_est at dequeue time. * Since max value of util_est for a task is 1024 (PELT util_avg for a task) * it is safe to use MSB. */ #define UTIL_EST_WEIGHT_SHIFT 2 #define UTIL_AVG_UNCHANGED 0x80000000 struct sched_statistics { #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; u64 wait_count; u64 wait_sum; u64 iowait_count; u64 iowait_sum; u64 sleep_start; u64 sleep_max; s64 sum_sleep_runtime; u64 block_start; u64 block_max; s64 sum_block_runtime; s64 exec_max; u64 slice_max; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; u64 nr_failed_migrations_hot; u64 nr_forced_migrations; u64 nr_wakeups; u64 nr_wakeups_sync; u64 nr_wakeups_migrate; u64 nr_wakeups_local; u64 nr_wakeups_remote; u64 nr_wakeups_affine; u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; #ifdef CONFIG_SCHED_CORE u64 core_forceidle_sum; #endif #endif /* CONFIG_SCHEDSTATS */ } ____cacheline_aligned; struct sched_entity { /* For load-balancing: */ struct load_weight load; struct rb_node run_node; u64 deadline; u64 min_vruntime; u64 min_slice; u64 max_slice; struct list_head group_node; unsigned char on_rq; unsigned char sched_delayed; unsigned char rel_deadline; unsigned char custom_slice; /* hole */ u64 exec_start; u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; /* Approximated virtual lag: */ s64 vlag; /* 'Protected' deadline, to give out minimum quantums: */ u64 vprot; u64 slice; u64 nr_migrations; #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; /* rq on which this entity is (to be) queued: */ struct cfs_rq *cfs_rq; /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; /* cached value of my_q->h_nr_running */ unsigned long runnable_weight; #endif /* * Per entity load average tracking. * * Put into separate cache line so it does not * collide with read-mostly values above. */ struct sched_avg avg; }; struct sched_rt_entity { struct list_head run_list; unsigned long timeout; unsigned long watchdog_stamp; unsigned int time_slice; unsigned short on_rq; unsigned short on_list; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity *parent; /* rq on which this entity is (to be) queued: */ struct rt_rq *rt_rq; /* rq "owned" by this entity/group: */ struct rt_rq *my_q; #endif } __randomize_layout; struct rq_flags; typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf); struct sched_dl_entity { struct rb_node rb_node; /* * Original scheduling parameters. Copied here from sched_attr * during sched_setattr(), they will remain the same until * the next sched_setattr(). */ u64 dl_runtime; /* Maximum runtime for each instance */ u64 dl_deadline; /* Relative deadline of each instance */ u64 dl_period; /* Separation of two instances (period) */ u64 dl_bw; /* dl_runtime / dl_period */ u64 dl_density; /* dl_runtime / dl_deadline */ /* * Actual scheduling parameters. Initialized with the values above, * they are continuously updated during task execution. Note that * the remaining runtime could be < 0 in case we are in overrun. */ s64 runtime; /* Remaining runtime for this instance */ u64 deadline; /* Absolute deadline for this instance */ unsigned int flags; /* Specifying the scheduler behaviour */ /* * Some bool flags: * * @dl_throttled tells if we exhausted the runtime. If so, the * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. * * @dl_non_contending tells if the task is inactive while still * contributing to the active utilization. In other words, it * indicates if the inactive timer has been armed and its handler * has not been executed yet. This flag is useful to avoid race * conditions between the inactive timer handler and the wakeup * code. * * @dl_overrun tells if the task asked to be informed about runtime * overruns. * * @dl_server tells if this is a server entity. * * @dl_server_active tells if the dlserver is active(started). * dlserver is started on first cfs enqueue on an idle runqueue * and is stopped when a dequeue results in 0 cfs tasks on the * runqueue. In other words, dlserver is active only when cpu's * runqueue has atleast one cfs task. * * @dl_defer tells if this is a deferred or regular server. For * now only defer server exists. * * @dl_defer_armed tells if the deferrable server is waiting * for the replenishment timer to activate it. * * @dl_defer_running tells if the deferrable server is actually * running, skipping the defer phase. * * @dl_defer_idle tracks idle state */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; unsigned int dl_server : 1; unsigned int dl_server_active : 1; unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; unsigned int dl_defer_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its * own bandwidth to be enforced, thus we need one timer per task. */ struct hrtimer dl_timer; /* * Inactive timer, responsible for decreasing the active utilization * at the "0-lag time". When a -deadline task blocks, it contributes * to GRUB's active utilization until the "0-lag time", hence a * timer is needed to decrease the active utilization at the correct * time. */ struct hrtimer inactive_timer; /* * Bits for DL-server functionality. Also see the comment near * dl_server_update(). * * @rq the runqueue this server is for */ struct rq *rq; dl_server_pick_f server_pick_task; #ifdef CONFIG_RT_MUTEXES /* * Priority Inheritance. When a DEADLINE scheduling entity is boosted * pi_se points to the donor, otherwise points to the dl_se it belongs * to (the original one/itself). */ struct sched_dl_entity *pi_se; #endif }; #ifdef CONFIG_UCLAMP_TASK /* Number of utilization clamp buckets (shorter alias) */ #define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT /* * Utilization clamp for a scheduling entity * @value: clamp value "assigned" to a se * @bucket_id: bucket index corresponding to the "assigned" value * @active: the se is currently refcounted in a rq's bucket * @user_defined: the requested clamp value comes from user-space * * The bucket_id is the index of the clamp bucket matching the clamp value * which is pre-computed and stored to avoid expensive integer divisions from * the fast path. * * The active bit is set whenever a task has got an "effective" value assigned, * which can be different from the clamp value "requested" from user-space. * This allows to know a task is refcounted in the rq's bucket corresponding * to the "effective" bucket_id. * * The user_defined bit is set whenever a task has got a task-specific clamp * value requested from userspace, i.e. the system defaults apply to this task * just as a restriction. This allows to relax default clamps when a less * restrictive task-specific value has been requested, thus allowing to * implement a "nice" semantic. For example, a task running with a 20% * default boost can still drop its own boosting to 0%. */ struct uclamp_se { unsigned int value : bits_per(SCHED_CAPACITY_SCALE); unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); unsigned int active : 1; unsigned int user_defined : 1; }; #endif /* CONFIG_UCLAMP_TASK */ union rcu_special { struct { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. */ u8 need_mb; /* Readers need smp_mb(). */ } b; /* Bits. */ u32 s; /* Set of bits. */ }; enum perf_event_task_context { perf_invalid_context = -1, perf_hw_context = 0, perf_sw_context, perf_nr_task_contexts, }; /* * Number of contexts where an event can trigger: * task, softirq, hardirq, nmi. */ #define PERF_NR_CONTEXTS 4 struct wake_q_node { struct wake_q_node *next; }; struct kmap_ctrl { #ifdef CONFIG_KMAP_LOCAL int idx; pte_t pteval[KM_MAX_IDX]; #endif }; struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* * For reasons of header soup (see current_thread_info()), this * must be the first element of task_struct. */ struct thread_info thread_info; #endif unsigned int __state; /* saved state for "spinlock sleepers" */ unsigned int saved_state; /* * This begins the randomizable portion of task_struct. Only * scheduling-critical items should be added above here. */ randomized_struct_fields_start void *stack; refcount_t usage; /* Per task flags (PF_*), defined further below: */ unsigned int flags; unsigned int ptrace; #ifdef CONFIG_MEM_ALLOC_PROFILING struct alloc_tag *alloc_tag; #endif int on_cpu; struct __call_single_node wake_entry; unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; /* * recent_used_cpu is initially set as the last CPU used by a task * that wakes affine another task. Waker/wakee relationships can * push tasks around a CPU where each wakeup moves to the next one. * Tracking a recently used CPU allows a quick search for a recently * used CPU that may be idle. */ int recent_used_cpu; int wake_cpu; int on_rq; int prio; int static_prio; int normal_prio; unsigned int rt_priority; struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; struct sched_dl_entity *dl_server; #ifdef CONFIG_SCHED_CLASS_EXT struct sched_ext_entity scx; #endif const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE struct rb_node core_node; unsigned long core_cookie; unsigned int core_occupation; #endif #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #ifdef CONFIG_CFS_BANDWIDTH struct callback_head sched_throttle_work; struct list_head throttle_node; bool throttled; #endif #endif #ifdef CONFIG_UCLAMP_TASK /* * Clamp values requested for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp_req[UCLAMP_CNT]; /* * Effective clamp values used for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif struct sched_statistics stats; #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ struct hlist_head preempt_notifiers; #endif #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif unsigned int policy; unsigned long max_allowed_capacity; int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t *user_cpus_ptr; cpumask_t cpus_mask; void *migration_pending; unsigned short migration_disabled; unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU unsigned long rcu_tasks_nvcsw; u8 rcu_tasks_holdout; u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; int rcu_tasks_exit_cpu; struct list_head rcu_tasks_exit_list; #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU int trc_reader_nesting; struct srcu_ctr __percpu *trc_reader_scp; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ struct sched_info sched_info; struct list_head tasks; struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; struct mm_struct *mm; struct mm_struct *active_mm; int exit_state; int exit_code; int exit_signal; /* The signal sent when the parent dies: */ int pdeath_signal; /* JOBCTL_*, siglock protected: */ unsigned long jobctl; /* Used for emulating ABI behavior of previous Linux versions: */ unsigned int personality; /* Scheduler bits, serialized by scheduler locks: */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; unsigned sched_task_hot:1; /* Force alignment to the next boundary: */ unsigned :0; /* Unserialized, strictly 'current' */ /* * This field must not be in the scheduler word above due to wakelist * queueing no longer being serialized by p->on_cpu. However: * * p->XXX = X; ttwu() * schedule() if (p->on_rq && ..) // false * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true * deactivate_task() ttwu_queue_wakelist()) * p->on_rq = 0; p->sched_remote_wakeup = Y; * * guarantees all stores of 'current' are visible before * ->sched_remote_wakeup gets used, so it can be in this word. */ unsigned sched_remote_wakeup:1; #ifdef CONFIG_RT_MUTEXES unsigned sched_rt_mutex:1; #endif /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif #ifdef CONFIG_MEMCG_V1 unsigned in_user_fault:1; #endif #ifdef CONFIG_LRU_GEN /* whether the LRU algorithm may apply to this access */ unsigned in_lru_fault:1; #endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif #ifdef CONFIG_CGROUPS /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; /* task is frozen/stopped (used by the cgroup freezer) */ unsigned frozen:1; #endif #ifdef CONFIG_BLK_CGROUP unsigned use_memdelay:1; #endif #ifdef CONFIG_PSI /* Stalled due to lack of memory */ unsigned in_memstall:1; #endif #ifdef CONFIG_PAGE_OWNER /* Used by page_owner=on to detect recursion in page tracking. */ unsigned in_page_owner:1; #endif #ifdef CONFIG_EVENTFD /* Recursion prevention for eventfd_signal() */ unsigned in_eventfd:1; #endif #ifdef CONFIG_ARCH_HAS_CPU_PASID unsigned pasid_activated:1; #endif #ifdef CONFIG_X86_BUS_LOCK_DETECT unsigned reported_split_lock:1; #endif #ifdef CONFIG_TASK_DELAY_ACCT /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif unsigned in_nf_duplicate:1; #ifdef CONFIG_PREEMPT_RT struct netdev_xmit net_xmit; #endif unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; pid_t pid; pid_t tgid; #ifdef CONFIG_STACKPROTECTOR /* Canary value for the -fstack-protector GCC feature: */ unsigned long stack_canary; #endif /* * Pointers to the (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ /* Real parent process: */ struct task_struct __rcu *real_parent; /* Recipient of SIGCHLD, wait4() reports: */ struct task_struct __rcu *parent; /* * Children/sibling form the list of natural children: */ struct list_head children; struct list_head sibling; struct task_struct *group_leader; /* * 'ptraced' is the list of tasks this task is using ptrace() on. * * This includes both natural children and PTRACE_ATTACH targets. * 'ptrace_entry' is this task's link on the p->parent->ptraced list. */ struct list_head ptraced; struct list_head ptrace_entry; /* PID/PID hash table linkage. */ struct pid *thread_pid; struct hlist_node pid_links[PIDTYPE_MAX]; struct list_head thread_node; struct completion *vfork_done; /* CLONE_CHILD_SETTID: */ int __user *set_child_tid; /* CLONE_CHILD_CLEARTID: */ int __user *clear_child_tid; /* PF_KTHREAD | PF_IO_WORKER */ void *worker_private; u64 utime; u64 stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME u64 utimescaled; u64 stimescaled; #endif u64 gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN struct vtime vtime; #endif #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif /* Context switch counts: */ unsigned long nvcsw; unsigned long nivcsw; /* Monotonic time in nsecs: */ u64 start_time; /* Boot based time in nsecs: */ u64 start_boottime; /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ unsigned long min_flt; unsigned long maj_flt; /* Empty if CONFIG_POSIX_CPUTIMERS=n */ struct posix_cputimers posix_cputimers; #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK struct posix_cputimers_work posix_cputimers_work; #endif /* Process credentials: */ /* Tracer's credentials at attach: */ const struct cred __rcu *ptracer_cred; /* Objective and real subjective task credentials (COW): */ const struct cred __rcu *real_cred; /* Effective (overridable) subjective task credentials (COW): */ const struct cred __rcu *cred; #ifdef CONFIG_KEYS /* Cached requested key. */ struct key *cached_requested_key; #endif /* * executable name, excluding path. * * - normally initialized begin_new_exec() * - set it with set_task_comm() * - strscpy_pad() to ensure it is always NUL-terminated and * zero-padded * - task_lock() to ensure the operation is atomic and the name is * fully updated. */ char comm[TASK_COMM_LEN]; struct nameidata *nameidata; #ifdef CONFIG_SYSVIPC struct sysv_sem sysvsem; struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; unsigned long last_switch_time; #endif /* Filesystem information: */ struct fs_struct *fs; /* Open file information: */ struct files_struct *files; #ifdef CONFIG_IO_URING struct io_uring_task *io_uring; struct io_restriction *io_uring_restrict; #endif /* Namespaces: */ struct nsproxy *nsproxy; /* Signal handlers: */ struct signal_struct *signal; struct sighand_struct __rcu *sighand; sigset_t blocked; sigset_t real_blocked; /* Restored if set_restore_sigmask() was used: */ sigset_t saved_sigmask; struct sigpending pending; unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; struct callback_head *task_works; #ifdef CONFIG_AUDIT #ifdef CONFIG_AUDITSYSCALL struct audit_context *audit_context; #endif kuid_t loginuid; unsigned int sessionid; #endif struct seccomp seccomp; struct syscall_user_dispatch syscall_dispatch; /* Thread group tracking: */ u64 parent_exec_id; u64 self_exec_id; /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */ spinlock_t alloc_lock; /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; struct wake_q_node wake_q; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ struct rb_root_cached pi_waiters; /* Updated under owner's pi_lock and rq lock */ struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ struct rt_mutex_waiter *pi_blocked_on; #endif struct mutex *blocked_on; /* lock we're blocked on */ #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER /* * Encoded lock address causing task block (lower 2 bits = type from * <linux/hung_task.h>). Accessed via hung_task_*() helpers. */ unsigned long blocker; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP int non_block_count; #endif #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events irqtrace; unsigned int hardirq_threaded; u64 hardirq_chain_key; int softirqs_enabled; int softirq_context; int irq_config; #endif #ifdef CONFIG_PREEMPT_RT int softirq_disable_cnt; #endif #ifdef CONFIG_LOCKDEP # define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif #if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) unsigned int in_ubsan; #endif /* Journalling filesystem info: */ void *journal_info; /* Stacked block device info: */ struct bio_list *bio_list; /* Stack plugging: */ struct blk_plug *plug; /* VM state: */ struct reclaim_state *reclaim_state; struct io_context *io_context; #ifdef CONFIG_COMPACTION struct capture_control *capture_control; #endif /* Ptrace state: */ unsigned long ptrace_message; kernel_siginfo_t *last_siginfo; struct task_io_accounting ioac; #ifdef CONFIG_PSI /* Pressure stall state */ unsigned int psi_flags; #endif #ifdef CONFIG_TASK_XACCT /* Accumulated RSS usage: */ u64 acct_rss_mem1; /* Accumulated virtual memory usage: */ u64 acct_vm_mem1; /* stime + utime since last update: */ u64 acct_timexpd; #endif #ifdef CONFIG_CPUSETS /* Protected by ->alloc_lock: */ nodemask_t mems_allowed; /* Sequence number to catch updates: */ seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock: */ struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; #ifdef CONFIG_PREEMPT_RT struct llist_node cg_dead_lnode; #endif /* CONFIG_PREEMPT_RT */ #endif /* CONFIG_CGROUPS */ #ifdef CONFIG_X86_CPU_RESCTRL u32 closid; u32 rmid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; #ifdef CONFIG_COMPAT struct compat_robust_list_head __user *compat_robust_list; #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; struct mutex futex_exit_mutex; unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS u8 perf_recursion[PERF_NR_CONTEXTS]; struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; struct perf_ctx_data __rcu *perf_ctx_data; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; #endif #ifdef CONFIG_NUMA /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; u8 il_weight; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING int numa_scan_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; int numa_preferred_nid; unsigned long numa_migrate_retry; /* Migration stamp: */ u64 node_stamp; u64 last_task_numa_placement; u64 last_sum_exec_runtime; struct callback_head numa_work; /* * This pointer is only modified for current in syscall and * pagefault context (and for tasks being destroyed), so it can be read * from any of the following contexts: * - RCU read-side critical section * - current->numa_group from everywhere * - task's runqueue locked, task not running */ struct numa_group __rcu *numa_group; /* * numa_faults is an array split into four regions: * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer * in this precise order. * * faults_memory: Exponential decaying average of faults on a per-node * basis. Scheduling placement decisions are made based on these * counts. The values remain static for the duration of a PTE scan. * faults_cpu: Track the nodes the process was running on when a NUMA * hinting fault was incurred. * faults_memory_buffer and faults_cpu_buffer: Record faults per node * during the current scan window. When the scan completes, the counts * in faults_memory and faults_cpu decay and these values are copied. */ unsigned long *numa_faults; unsigned long total_numa_faults; /* * numa_faults_locality tracks if faults recorded during the last * scan window were remote/local or failed to migrate. The task scan * period is adapted based on the locality of the faults with different * weights depending on whether they were shared or private faults */ unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ struct rseq_data rseq; struct sched_mm_cid mm_cid; struct tlbflush_unmap_batch tlb_ubc; /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; struct page_frag task_frag; #ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE struct lazy_mmu_state lazy_mmu_state; #endif #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif #ifdef CONFIG_FAULT_INJECTION int make_it_fail; unsigned int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call * balance_dirty_pages() for a dirty throttling pause: */ int nr_dirtied; int nr_dirtied_pause; /* Start of a write-and-pause period: */ unsigned long dirty_paused_when; #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; #endif /* * Time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ u64 timer_slack_ns; u64 default_timer_slack_ns; #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) unsigned int kasan_depth; #endif #ifdef CONFIG_KCSAN struct kcsan_ctx kcsan_ctx; #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events kcsan_save_irqtrace; #endif #ifdef CONFIG_KCSAN_WEAK_MEMORY int kcsan_stack_depth; #endif #endif #ifdef CONFIG_KMSAN struct kmsan_ctx kmsan_ctx; #endif #if IS_ENABLED(CONFIG_KUNIT) struct kunit *kunit_test; #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; int curr_ret_depth; /* Stack of return addresses for return function tracing: */ unsigned long *ret_stack; /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; unsigned long long ftrace_sleeptime; /* * Number of functions that haven't been traced * because of depth overrun: */ atomic_t trace_overrun; /* Pause tracing: */ atomic_t tracing_graph_pause; #endif #ifdef CONFIG_TRACING /* Bitmask and counter of trace recursion: */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_KCOV /* See kernel/kcov.c for more details. */ /* Coverage collection mode enabled for this task (0 if disabled): */ unsigned int kcov_mode; /* Size of the kcov_area: */ unsigned int kcov_size; /* Buffer for coverage collection: */ void *kcov_area; /* KCOV descriptor wired with this task or NULL: */ struct kcov *kcov; /* KCOV common handle for remote coverage collection: */ u64 kcov_handle; /* KCOV sequence number: */ int kcov_sequence; /* Collect coverage from softirq context: */ unsigned int kcov_softirq; #endif #ifdef CONFIG_MEMCG_V1 struct mem_cgroup *memcg_in_oom; #endif #ifdef CONFIG_MEMCG /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; /* Cache for current->cgroups->memcg->objcg lookups: */ struct obj_cgroup *objcg; #endif #ifdef CONFIG_BLK_CGROUP struct gendisk *throttle_disk; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) unsigned int sequential_io; unsigned int sequential_io_avg; #endif struct kmap_ctrl kmap_ctrl; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; # ifdef CONFIG_PREEMPT_RT unsigned long saved_state_change; # endif #endif struct rcu_head rcu; refcount_t rcu_users; int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; struct timer_list oom_reaper_timer; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ refcount_t stack_refcount; #endif #ifdef CONFIG_LIVEPATCH int patch_state; #endif #ifdef CONFIG_SECURITY /* Used by LSM modules for access restriction: */ void *security; #endif #ifdef CONFIG_BPF_SYSCALL /* Used by BPF task local storage */ struct bpf_local_storage __rcu *bpf_storage; /* Used for BPF run context */ struct bpf_run_ctx *bpf_ctx; #endif /* Used by BPF for per-TASK xdp storage */ struct bpf_net_context *bpf_net_context; #ifdef CONFIG_KSTACK_ERASE unsigned long lowest_stack; #endif #ifdef CONFIG_KSTACK_ERASE_METRICS unsigned long prev_lowest_stack; #endif #ifdef CONFIG_X86_MCE void __user *mce_vaddr; __u64 mce_kflags; u64 mce_addr; __u64 mce_ripv : 1, mce_whole_page : 1, __mce_reserved : 62; struct callback_head mce_kill_me; int mce_count; #endif #ifdef CONFIG_KRETPROBES struct llist_head kretprobe_instances; #endif #ifdef CONFIG_RETHOOK struct llist_head rethooks; #endif #ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH /* * If L1D flush is supported on mm context switch * then we use this callback head to queue kill work * to kill tasks that are not running on SMT disabled * cores */ struct callback_head l1d_flush_kill; #endif #ifdef CONFIG_RV /* * Per-task RV monitor, fixed in CONFIG_RV_PER_TASK_MONITORS. * If memory becomes a concern, we can think about a dynamic method. */ union rv_task_monitor rv[CONFIG_RV_PER_TASK_MONITORS]; #endif #ifdef CONFIG_USER_EVENTS struct user_event_mm *user_event_mm; #endif #ifdef CONFIG_UNWIND_USER struct unwind_task_info unwind_info; #endif /* CPU-specific state of this task: */ struct thread_struct thread; /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. */ randomized_struct_fields_end } __attribute__ ((aligned (64))); #ifdef CONFIG_SCHED_PROXY_EXEC DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec); static inline bool sched_proxy_exec(void) { return static_branch_likely(&__sched_proxy_exec); } #else static inline bool sched_proxy_exec(void) { return false; } #endif #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) static inline unsigned int __task_state_index(unsigned int tsk_state, unsigned int tsk_exit_state) { unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT; BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); if ((tsk_state & TASK_IDLE) == TASK_IDLE) state = TASK_REPORT_IDLE; /* * We're lying here, but rather than expose a completely new task state * to userspace, we can make this appear as if the task has gone through * a regular rt_mutex_lock() call. * Report frozen tasks as uninterruptible. */ if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN)) state = TASK_UNINTERRUPTIBLE; return fls(state); } static inline unsigned int task_state_index(struct task_struct *tsk) { return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state); } static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; BUILD_BUG_ON(TASK_REPORT_MAX * 2 != 1 << (sizeof(state_char) - 1)); return state_char[state]; } static inline char task_state_to_char(struct task_struct *tsk) { return task_index_to_char(task_state_index(tsk)); } #ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE /** * __task_lazy_mmu_mode_active() - Test the lazy MMU mode state for a task. * @tsk: The task to check. * * Test whether @tsk has its lazy MMU mode state set to active (i.e. enabled * and not paused). * * This function only considers the state saved in task_struct; to test whether * current actually is in lazy MMU mode, is_lazy_mmu_mode_active() should be * used instead. * * This function is intended for architectures that implement the lazy MMU * mode; it must not be called from generic code. */ static inline bool __task_lazy_mmu_mode_active(struct task_struct *tsk) { struct lazy_mmu_state *state = &tsk->lazy_mmu_state; return state->enable_count > 0 && state->pause_count == 0; } /** * is_lazy_mmu_mode_active() - Test whether we are currently in lazy MMU mode. * * Test whether the current context is in lazy MMU mode. This is true if both: * 1. We are not in interrupt context * 2. Lazy MMU mode is active for the current task * * This function is intended for architectures that implement the lazy MMU * mode; it must not be called from generic code. */ static inline bool is_lazy_mmu_mode_active(void) { if (in_interrupt()) return false; return __task_lazy_mmu_mode_active(current); } #endif extern struct pid *cad_pid; /* * Per process flags */ #define PF_VCPU 0x00000001 /* I'm a virtual CPU */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* Getting shut down */ #define PF_POSTCOREDUMP 0x00000008 /* Coredumps should ignore this task */ #define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory to free memory. See memalloc_noreclaim_save() */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */ #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF__HOLE__00800000 0x00800000 #define PF__HOLE__01000000 0x01000000 #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocations constrained to zones which allow long term pinning. * See memalloc_pin_save() */ #define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example * with tsk_used_math (like during threaded core dumping). * There is however an exception to this rule during ptrace * or during fork: the ptracer task is allowed to write to the * child->flags of its traced child (same goes for fork, the parent * can write to the child->flags), because we're guaranteed the * child is not running and in turn not changing child->flags * at the same time the parent does it. */ #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) #define clear_used_math() clear_stopped_child_used_math(current) #define set_used_math() set_stopped_child_used_math(current) #define conditional_stopped_child_used_math(condition, child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) #define conditional_used_math(condition) conditional_stopped_child_used_math(condition, current) #define copy_to_stopped_child_used_math(child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) static __always_inline bool is_percpu_thread(void) { return (current->flags & PF_NO_SETAFFINITY) && (current->nr_cpus_allowed == 1); } static __always_inline bool is_user_task(struct task_struct *task) { return task->mm && !(task->flags & (PF_KTHREAD | PF_USER_WORKER)); } /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ #define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */ #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ #define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ #define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ #define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */ #define TASK_PFA_TEST(name, func) \ static inline bool task_##func(struct task_struct *p) \ { return test_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_SET(name, func) \ static inline void task_set_##func(struct task_struct *p) \ { set_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_CLEAR(name, func) \ static inline void task_clear_##func(struct task_struct *p) \ { clear_bit(PFA_##name, &p->atomic_flags); } TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs) TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs) TASK_PFA_TEST(SPREAD_PAGE, spread_page) TASK_PFA_SET(SPREAD_PAGE, spread_page) TASK_PFA_CLEAR(SPREAD_PAGE, spread_page) TASK_PFA_TEST(SPREAD_SLAB, spread_slab) TASK_PFA_SET(SPREAD_SLAB, spread_slab) TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) static inline void current_restore_flags(unsigned long orig_flags, unsigned long flags) { current->flags &= ~flags; current->flags |= orig_flags & flags; } extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); /* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */ extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask); /** * set_cpus_allowed_ptr - set CPU affinity mask of a task * @p: the task * @new_mask: CPU affinity mask * * Return: zero if successful, or a negative error code */ extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); extern void release_user_cpus_ptr(struct task_struct *p); extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); /** * task_nice - return the nice value of a given task. * @p: the task in question. * * Return: The nice value [ -20 ... 0 ... 19 ]. */ static inline int task_nice(const struct task_struct *p) { return PRIO_TO_NICE((p)->static_prio); } extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern void sched_set_fifo(struct task_struct *p); extern void sched_set_fifo_low(struct task_struct *p); extern void sched_set_fifo_secondary(struct task_struct *p); extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? * @p: the task in question. * * Return: 1 if @p is an idle task. 0 otherwise. */ static __always_inline bool is_idle_task(const struct task_struct *p) { return !!(p->flags & PF_IDLE); } extern struct task_struct *curr_task(int cpu); extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { struct task_struct task; #ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; #endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; #ifndef CONFIG_THREAD_INFO_IN_TASK extern struct thread_info init_thread_info; #endif extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; #ifdef CONFIG_THREAD_INFO_IN_TASK # define task_thread_info(task) (&(task)->thread_info) #else # define task_thread_info(task) ((struct thread_info *)(task)->stack) #endif /* * find a task by one of its numerical ids * * find_task_by_pid_ns(): * finds a task by its pid in the specified namespace * find_task_by_vpid(): * finds a task by its virtual pid * * see also find_vpid() etc in include/linux/pid.h */ extern struct task_struct *find_task_by_vpid(pid_t nr); extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); /* * find a task by its virtual pid and get the task struct */ extern struct task_struct *find_get_task_by_vpid(pid_t nr); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); extern void kick_process(struct task_struct *tsk); extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); #define set_task_comm(tsk, from) ({ \ BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN); \ __set_task_comm(tsk, from, false); \ }) /* * - Why not use task_lock()? * User space can randomly change their names anyway, so locking for readers * doesn't make sense. For writers, locking is probably necessary, as a race * condition could lead to long-term mixed results. * The strscpy_pad() in __set_task_comm() can ensure that the task comm is * always NUL-terminated and zero-padded. Therefore the race condition between * reader and writer is not an issue. * * - BUILD_BUG_ON() can help prevent the buf from being truncated. * Since the callers don't perform any return value checks, this safeguard is * necessary. */ #define get_task_comm(buf, tsk) ({ \ BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN); \ strscpy_pad(buf, (tsk)->comm); \ buf; \ }) static __always_inline void scheduler_ipi(void) { /* * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting * TIF_NEED_RESCHED remotely (for the first time) will also send * this IPI. */ preempt_fold_need_resched(); } extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); /* * Set thread flags in other task's structures. * See asm/thread_info.h for TIF_xxxx flags available: */ static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) { set_ti_thread_flag(task_thread_info(tsk), flag); } static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) { clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, bool value) { update_ti_thread_flag(task_thread_info(tsk), flag, value); } static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_ti_thread_flag(task_thread_info(tsk), flag); } static inline void set_tsk_need_resched(struct task_struct *tsk) { if (tracepoint_enabled(sched_set_need_resched_tp) && !test_tsk_thread_flag(tsk, TIF_NEED_RESCHED)) __trace_set_need_resched(tsk, TIF_NEED_RESCHED); set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } static inline void clear_tsk_need_resched(struct task_struct *tsk) { atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY, (atomic_long_t *)&task_thread_info(tsk)->flags); } static inline int test_tsk_need_resched(struct task_struct *tsk) { return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } static inline void set_need_resched_current(void) { lockdep_assert_irqs_disabled(); set_tsk_need_resched(current); set_preempt_need_resched(); } /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return * value indicates whether a reschedule was done in fact. * cond_resched_lock() will drop the spinlock before scheduling, */ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) extern int __cond_resched(void); #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) DECLARE_STATIC_CALL(cond_resched, __cond_resched); static __always_inline int _cond_resched(void) { return static_call_mod(cond_resched)(); } #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) extern int dynamic_cond_resched(void); static __always_inline int _cond_resched(void) { return dynamic_cond_resched(); } #else /* !CONFIG_PREEMPTION */ static inline int _cond_resched(void) { return __cond_resched(); } #endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ #else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */ static inline int _cond_resched(void) { return 0; } #endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */ #define cond_resched() ({ \ __might_resched(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) extern int __cond_resched_lock(spinlock_t *lock) __must_hold(lock); extern int __cond_resched_rwlock_read(rwlock_t *lock) __must_hold_shared(lock); extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock); #define MIGHT_RESCHED_RCU_SHIFT 8 #define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) #ifndef CONFIG_PREEMPT_RT /* * Non RT kernels have an elevated preempt count due to the held lock, * but are not allowed to be inside a RCU read side critical section */ # define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET #else /* * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in * cond_resched*lock() has to take that into account because it checks for * preempt_count() and rcu_preempt_depth(). */ # define PREEMPT_LOCK_RESCHED_OFFSETS \ (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) #endif #define cond_resched_lock(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_lock(lock); \ }) #define cond_resched_rwlock_read(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_read(lock); \ }) #define cond_resched_rwlock_write(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_write(lock); \ }) #ifndef CONFIG_PREEMPT_RT static inline struct mutex *__get_task_blocked_on(struct task_struct *p) { struct mutex *m = p->blocked_on; if (m) lockdep_assert_held_once(&m->wait_lock); return m; } static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) { struct mutex *blocked_on = READ_ONCE(p->blocked_on); WARN_ON_ONCE(!m); /* The task should only be setting itself as blocked */ WARN_ON_ONCE(p != current); /* Currently we serialize blocked_on under the mutex::wait_lock */ lockdep_assert_held_once(&m->wait_lock); /* * Check ensure we don't overwrite existing mutex value * with a different mutex. Note, setting it to the same * lock repeatedly is ok. */ WARN_ON_ONCE(blocked_on && blocked_on != m); WRITE_ONCE(p->blocked_on, m); } static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) { guard(raw_spinlock_irqsave)(&m->wait_lock); __set_task_blocked_on(p, m); } static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) { if (m) { struct mutex *blocked_on = READ_ONCE(p->blocked_on); /* Currently we serialize blocked_on under the mutex::wait_lock */ lockdep_assert_held_once(&m->wait_lock); /* * There may be cases where we re-clear already cleared * blocked_on relationships, but make sure we are not * clearing the relationship with a different lock. */ WARN_ON_ONCE(blocked_on && blocked_on != m); } WRITE_ONCE(p->blocked_on, NULL); } static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) { guard(raw_spinlock_irqsave)(&m->wait_lock); __clear_task_blocked_on(p, m); } #else static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } #endif /* !CONFIG_PREEMPT_RT */ static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); } /* * Wrappers for p->thread_info->cpu access. No-op on UP. */ #ifdef CONFIG_SMP static inline unsigned int task_cpu(const struct task_struct *p) { return READ_ONCE(task_thread_info(p)->cpu); } extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #else static inline unsigned int task_cpu(const struct task_struct *p) { return 0; } static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) { } #endif /* CONFIG_SMP */ static inline bool task_is_runnable(struct task_struct *p) { return p->on_rq && !p->se.sched_delayed; } extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); /* * In order to reduce various lock holder preemption latencies provide an * interface to see if a vCPU is currently running or not. * * This allows us to terminate optimistic spin loops and block, analogous to * the native optimistic spin heuristic of testing if the lock owner task is * running or not. */ #ifndef vcpu_is_preempted static inline bool vcpu_is_preempted(int cpu) { return false; } #endif extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #ifndef TASK_SIZE_OF #define TASK_SIZE_OF(tsk) TASK_SIZE #endif static inline bool owner_on_cpu(struct task_struct *owner) { /* * As lock holder preemption issue, we both skip spinning if * task is not on cpu or its cpu is preempted */ return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner)); } /* Returns effective CPU energy utilization, as seen by the scheduler */ unsigned long sched_cpu_util(int cpu); #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, unsigned long uaddr); extern int sched_core_idle_cpu(int cpu); #else static inline void sched_core_free(struct task_struct *tsk) { } static inline void sched_core_fork(struct task_struct *p) { } static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } #endif extern void sched_set_stop_task(int cpu, struct task_struct *stop); #ifdef CONFIG_MEM_ALLOC_PROFILING static __always_inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { swap(current->alloc_tag, tag); return tag; } static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) { #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); #endif current->alloc_tag = old; } #else #define alloc_tag_save(_tag) NULL #define alloc_tag_restore(_tag, _old) do {} while (0) #endif /* Avoids recursive inclusion hell */ #ifdef CONFIG_SCHED_MM_CID void sched_mm_cid_before_execve(struct task_struct *t); void sched_mm_cid_after_execve(struct task_struct *t); void sched_mm_cid_fork(struct task_struct *t); void sched_mm_cid_exit(struct task_struct *t); static __always_inline int task_mm_cid(struct task_struct *t) { return t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT); } #else static inline void sched_mm_cid_before_execve(struct task_struct *t) { } static inline void sched_mm_cid_after_execve(struct task_struct *t) { } static inline void sched_mm_cid_fork(struct task_struct *t) { } static inline void sched_mm_cid_exit(struct task_struct *t) { } static __always_inline int task_mm_cid(struct task_struct *t) { /* * Use the processor id as a fall-back when the mm cid feature is * disabled. This provides functional per-cpu data structure accesses * in user-space, althrough it won't provide the memory usage benefits. */ return task_cpu(t); } #endif #ifndef MODULE #ifndef COMPILE_OFFSETS extern void ___migrate_enable(void); struct rq; DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); /* * The "struct rq" is not available here, so we can't access the * "runqueues" with this_cpu_ptr(), as the compilation will fail in * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr(): * typeof((ptr) + 0) * * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here. */ #ifdef CONFIG_SMP #define this_rq_raw() arch_raw_cpu_ptr(&runqueues) #else #define this_rq_raw() PERCPU_PTR(&runqueues) #endif #define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned)) static inline void __migrate_enable(void) { struct task_struct *p = current; #ifdef CONFIG_DEBUG_PREEMPT /* * Check both overflow from migrate_disable() and superfluous * migrate_enable(). */ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) return; #endif if (p->migration_disabled > 1) { p->migration_disabled--; return; } /* * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). */ guard(preempt)(); if (unlikely(p->cpus_ptr != &p->cpus_mask)) ___migrate_enable(); /* * Mustn't clear migration_disabled() until cpus_ptr points back at the * regular cpus_mask, otherwise things that race (eg. * select_fallback_rq) get confused. */ barrier(); p->migration_disabled = 0; this_rq_pinned()--; } static inline void __migrate_disable(void) { struct task_struct *p = current; if (p->migration_disabled) { #ifdef CONFIG_DEBUG_PREEMPT /* *Warn about overflow half-way through the range. */ WARN_ON_ONCE((s16)p->migration_disabled < 0); #endif p->migration_disabled++; return; } guard(preempt)(); this_rq_pinned()++; p->migration_disabled = 1; } #else /* !COMPILE_OFFSETS */ static inline void __migrate_disable(void) { } static inline void __migrate_enable(void) { } #endif /* !COMPILE_OFFSETS */ /* * So that it is possible to not export the runqueues variable, define and * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will * be defined in kernel/sched/core.c. */ #ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE static __always_inline void migrate_disable(void) { __migrate_disable(); } static __always_inline void migrate_enable(void) { __migrate_enable(); } #else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ extern void migrate_disable(void); extern void migrate_enable(void); #endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ #else /* MODULE */ extern void migrate_disable(void); extern void migrate_enable(void); #endif /* MODULE */ DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) #endif
55