Total coverage: 142729 (8%)of 1820961
1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 /* SPDX-License-Identifier: GPL-2.0 */ /* * Vxlan private header file * */ #ifndef _VXLAN_PRIVATE_H #define _VXLAN_PRIVATE_H #include <linux/rhashtable.h> extern unsigned int vxlan_net_id; extern const u8 all_zeros_mac[ETH_ALEN + 2]; extern const struct rhashtable_params vxlan_vni_rht_params; #define PORT_HASH_BITS 8 #define PORT_HASH_SIZE (1 << PORT_HASH_BITS) /* per-network namespace private data for this module */ struct vxlan_net { struct list_head vxlan_list; struct hlist_head sock_list[PORT_HASH_SIZE]; spinlock_t sock_lock; struct notifier_block nexthop_notifier_block; }; /* Forwarding table entry */ struct vxlan_fdb { struct hlist_node hlist; /* linked list of entries */ struct rcu_head rcu; unsigned long updated; /* jiffies */ unsigned long used; struct list_head remotes; u8 eth_addr[ETH_ALEN]; u16 state; /* see ndm_state */ __be32 vni; u16 flags; /* see ndm_flags and below */ struct list_head nh_list; struct nexthop __rcu *nh; struct vxlan_dev __rcu *vdev; }; #define NTF_VXLAN_ADDED_BY_USER 0x100 /* Virtual Network hash table head */ static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni) { return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)]; } /* Socket hash table head */ static inline struct hlist_head *vs_head(struct net *net, __be16 port) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; } /* First remote destination for a forwarding entry. * Guaranteed to be non-NULL because remotes are never deleted. */ static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb) { if (rcu_access_pointer(fdb->nh)) return NULL; return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list); } static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb) { if (rcu_access_pointer(fdb->nh)) return NULL; return list_first_entry(&fdb->remotes, struct vxlan_rdst, list); } #if IS_ENABLED(CONFIG_IPV6) static inline bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) { if (a->sa.sa_family != b->sa.sa_family) return false; if (a->sa.sa_family == AF_INET6) return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr); else return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; } static inline int vxlan_nla_get_addr(union vxlan_addr *ip, const struct nlattr *nla) { if (nla_len(nla) >= sizeof(struct in6_addr)) { ip->sin6.sin6_addr = nla_get_in6_addr(nla); ip->sa.sa_family = AF_INET6; return 0; } else if (nla_len(nla) >= sizeof(__be32)) { ip->sin.sin_addr.s_addr = nla_get_in_addr(nla); ip->sa.sa_family = AF_INET; return 0; } else { return -EAFNOSUPPORT; } } static inline int vxlan_nla_put_addr(struct sk_buff *skb, int attr, const union vxlan_addr *ip) { if (ip->sa.sa_family == AF_INET6) return nla_put_in6_addr(skb, attr, &ip->sin6.sin6_addr); else return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr); } static inline bool vxlan_addr_is_multicast(const union vxlan_addr *ip) { if (ip->sa.sa_family == AF_INET6) return ipv6_addr_is_multicast(&ip->sin6.sin6_addr); else return ipv4_is_multicast(ip->sin.sin_addr.s_addr); } #else /* !CONFIG_IPV6 */ static inline bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) { return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; } static inline int vxlan_nla_get_addr(union vxlan_addr *ip, const struct nlattr *nla) { if (nla_len(nla) >= sizeof(struct in6_addr)) { return -EAFNOSUPPORT; } else if (nla_len(nla) >= sizeof(__be32)) { ip->sin.sin_addr.s_addr = nla_get_in_addr(nla); ip->sa.sa_family = AF_INET; return 0; } else { return -EAFNOSUPPORT; } } static inline int vxlan_nla_put_addr(struct sk_buff *skb, int attr, const union vxlan_addr *ip) { return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr); } static inline bool vxlan_addr_is_multicast(const union vxlan_addr *ip) { return ipv4_is_multicast(ip->sin.sin_addr.s_addr); } #endif static inline size_t vxlan_addr_size(const union vxlan_addr *ip) { if (ip->sa.sa_family == AF_INET6) return sizeof(struct in6_addr); else return sizeof(__be32); } static inline struct vxlan_vni_node * vxlan_vnifilter_lookup(struct vxlan_dev *vxlan, __be32 vni) { struct vxlan_vni_group *vg; vg = rcu_dereference_rtnl(vxlan->vnigrp); if (!vg) return NULL; return rhashtable_lookup_fast(&vg->vni_hash, &vni, vxlan_vni_rht_params); } /* vxlan_core.c */ int vxlan_fdb_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, u32 nhid, struct vxlan_fdb **fdb, struct netlink_ext_ack *extack); int __vxlan_fdb_delete(struct vxlan_dev *vxlan, const unsigned char *addr, union vxlan_addr ip, __be16 port, __be32 src_vni, __be32 vni, u32 ifindex, bool swdev_notify); u32 eth_vni_hash(const unsigned char *addr, __be32 vni); u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni); int vxlan_fdb_update(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack); void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, __be32 default_vni, struct vxlan_rdst *rdst, bool did_rsc); int vxlan_vni_in_use(struct net *src_net, struct vxlan_dev *vxlan, struct vxlan_config *conf, __be32 vni); /* vxlan_vnifilter.c */ int vxlan_vnigroup_init(struct vxlan_dev *vxlan); void vxlan_vnigroup_uninit(struct vxlan_dev *vxlan); int vxlan_vnifilter_init(void); void vxlan_vnifilter_uninit(void); void vxlan_vnifilter_count(struct vxlan_dev *vxlan, __be32 vni, struct vxlan_vni_node *vninode, int type, unsigned int len); void vxlan_vs_add_vnigrp(struct vxlan_dev *vxlan, struct vxlan_sock *vs, bool ipv6); void vxlan_vs_del_vnigrp(struct vxlan_dev *vxlan); int vxlan_vnilist_update_group(struct vxlan_dev *vxlan, union vxlan_addr *old_remote_ip, union vxlan_addr *new_remote_ip, struct netlink_ext_ack *extack); /* vxlan_multicast.c */ int vxlan_multicast_join(struct vxlan_dev *vxlan); int vxlan_multicast_leave(struct vxlan_dev *vxlan); bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev, __be32 vni, union vxlan_addr *rip, int rifindex); int vxlan_igmp_join(struct vxlan_dev *vxlan, union vxlan_addr *rip, int rifindex); int vxlan_igmp_leave(struct vxlan_dev *vxlan, union vxlan_addr *rip, int rifindex); /* vxlan_mdb.c */ int vxlan_mdb_dump(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb); int vxlan_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack); int vxlan_mdb_del(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); int vxlan_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); int vxlan_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq, struct netlink_ext_ack *extack); struct vxlan_mdb_entry *vxlan_mdb_entry_skb_get(struct vxlan_dev *vxlan, struct sk_buff *skb, __be32 src_vni); netdev_tx_t vxlan_mdb_xmit(struct vxlan_dev *vxlan, const struct vxlan_mdb_entry *mdb_entry, struct sk_buff *skb); int vxlan_mdb_init(struct vxlan_dev *vxlan); void vxlan_mdb_fini(struct vxlan_dev *vxlan); #endif
5 1 3 2 2 4 9 3 8 2 4 3 10 10 10 10 7 3 10 12 3 13 10 10 10 3 3 5 2 3 3 3 5 1 3 1 3 1 1 3 1 2 2 1 2 2 1 2 3 3 6 1 4 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 // SPDX-License-Identifier: GPL-2.0-only /* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF) * * Copyright (C) 2013 Terry Lam <vtlam@google.com> * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com> */ #include <linux/jiffies.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/siphash.h> #include <net/pkt_sched.h> #include <net/sock.h> /* Heavy-Hitter Filter (HHF) * * Principles : * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified * as heavy-hitter, it is immediately switched to the heavy-hitter bucket. * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler, * in which the heavy-hitter bucket is served with less weight. * In other words, non-heavy-hitters (e.g., short bursts of critical traffic) * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have * higher share of bandwidth. * * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the * following paper: * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and * Accounting", in ACM SIGCOMM, 2002. * * Conceptually, a multi-stage filter comprises k independent hash functions * and k counter arrays. Packets are indexed into k counter arrays by k hash * functions, respectively. The counters are then increased by the packet sizes. * Therefore, * - For a heavy-hitter flow: *all* of its k array counters must be large. * - For a non-heavy-hitter flow: some of its k array counters can be large * due to hash collision with other small flows; however, with high * probability, not *all* k counters are large. * * By the design of the multi-stage filter algorithm, the false negative rate * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is * susceptible to false positives (non-heavy-hitters mistakenly classified as * heavy-hitters). * Therefore, we also implement the following optimizations to reduce false * positives by avoiding unnecessary increment of the counter values: * - Optimization O1: once a heavy-hitter is identified, its bytes are not * accounted in the array counters. This technique is called "shielding" * in Section 3.3.1 of [EV02]. * - Optimization O2: conservative update of counters * (Section 3.3.2 of [EV02]), * New counter value = max {old counter value, * smallest counter value + packet bytes} * * Finally, we refresh the counters periodically since otherwise the counter * values will keep accumulating. * * Once a flow is classified as heavy-hitter, we also save its per-flow state * in an exact-matching flow table so that its subsequent packets can be * dispatched to the heavy-hitter bucket accordingly. * * * At a high level, this qdisc works as follows: * Given a packet p: * - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching * heavy-hitter flow table, denoted table T, then send p to the heavy-hitter * bucket. * - Otherwise, forward p to the multi-stage filter, denoted filter F * + If F decides that p belongs to a non-heavy-hitter flow, then send p * to the non-heavy-hitter bucket. * + Otherwise, if F decides that p belongs to a new heavy-hitter flow, * then set up a new flow entry for the flow-id of p in the table T and * send p to the heavy-hitter bucket. * * In this implementation: * - T is a fixed-size hash-table with 1024 entries. Hash collision is * resolved by linked-list chaining. * - F has four counter arrays, each array containing 1024 32-bit counters. * That means 4 * 1024 * 32 bits = 16KB of memory. * - Since each array in F contains 1024 counters, 10 bits are sufficient to * index into each array. * Hence, instead of having four hash functions, we chop the 32-bit * skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is * computed as XOR sum of those three chunks. * - We need to clear the counter arrays periodically; however, directly * memsetting 16KB of memory can lead to cache eviction and unwanted delay. * So by representing each counter by a valid bit, we only need to reset * 4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory. * - The Deficit Round Robin engine is taken from fq_codel implementation * (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to * fq_codel_flow in fq_codel implementation. * */ /* Non-configurable parameters */ #define HH_FLOWS_CNT 1024 /* number of entries in exact-matching table T */ #define HHF_ARRAYS_CNT 4 /* number of arrays in multi-stage filter F */ #define HHF_ARRAYS_LEN 1024 /* number of counters in each array of F */ #define HHF_BIT_MASK_LEN 10 /* masking 10 bits */ #define HHF_BIT_MASK 0x3FF /* bitmask of 10 bits */ #define WDRR_BUCKET_CNT 2 /* two buckets for Weighted DRR */ enum wdrr_bucket_idx { WDRR_BUCKET_FOR_HH = 0, /* bucket id for heavy-hitters */ WDRR_BUCKET_FOR_NON_HH = 1 /* bucket id for non-heavy-hitters */ }; #define hhf_time_before(a, b) \ (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0)) /* Heavy-hitter per-flow state */ struct hh_flow_state { u32 hash_id; /* hash of flow-id (e.g. TCP 5-tuple) */ u32 hit_timestamp; /* last time heavy-hitter was seen */ struct list_head flowchain; /* chaining under hash collision */ }; /* Weighted Deficit Round Robin (WDRR) scheduler */ struct wdrr_bucket { struct sk_buff *head; struct sk_buff *tail; struct list_head bucketchain; int deficit; }; struct hhf_sched_data { struct wdrr_bucket buckets[WDRR_BUCKET_CNT]; siphash_key_t perturbation; /* hash perturbation */ u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ u32 drop_overlimit; /* number of times max qdisc packet * limit was hit */ struct list_head *hh_flows; /* table T (currently active HHs) */ u32 hh_flows_limit; /* max active HH allocs */ u32 hh_flows_overlimit; /* num of disallowed HH allocs */ u32 hh_flows_total_cnt; /* total admitted HHs */ u32 hh_flows_current_cnt; /* total current HHs */ u32 *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */ u32 hhf_arrays_reset_timestamp; /* last time hhf_arrays * was reset */ unsigned long *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits * of hhf_arrays */ /* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */ struct list_head new_buckets; /* list of new buckets */ struct list_head old_buckets; /* list of old buckets */ /* Configurable HHF parameters */ u32 hhf_reset_timeout; /* interval to reset counter * arrays in filter F * (default 40ms) */ u32 hhf_admit_bytes; /* counter thresh to classify as * HH (default 128KB). * With these default values, * 128KB / 40ms = 25 Mbps * i.e., we expect to capture HHs * sending > 25 Mbps. */ u32 hhf_evict_timeout; /* aging threshold to evict idle * HHs out of table T. This should * be large enough to avoid * reordering during HH eviction. * (default 1s) */ u32 hhf_non_hh_weight; /* WDRR weight for non-HHs * (default 2, * i.e., non-HH : HH = 2 : 1) */ }; static u32 hhf_time_stamp(void) { return jiffies; } /* Looks up a heavy-hitter flow in a chaining list of table T. */ static struct hh_flow_state *seek_list(const u32 hash, struct list_head *head, struct hhf_sched_data *q) { struct hh_flow_state *flow, *next; u32 now = hhf_time_stamp(); if (list_empty(head)) return NULL; list_for_each_entry_safe(flow, next, head, flowchain) { u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; if (hhf_time_before(prev, now)) { /* Delete expired heavy-hitters, but preserve one entry * to avoid kzalloc() when next time this slot is hit. */ if (list_is_last(&flow->flowchain, head)) return NULL; list_del(&flow->flowchain); kfree(flow); q->hh_flows_current_cnt--; } else if (flow->hash_id == hash) { return flow; } } return NULL; } /* Returns a flow state entry for a new heavy-hitter. Either reuses an expired * entry or dynamically alloc a new entry. */ static struct hh_flow_state *alloc_new_hh(struct list_head *head, struct hhf_sched_data *q) { struct hh_flow_state *flow; u32 now = hhf_time_stamp(); if (!list_empty(head)) { /* Find an expired heavy-hitter flow entry. */ list_for_each_entry(flow, head, flowchain) { u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; if (hhf_time_before(prev, now)) return flow; } } if (q->hh_flows_current_cnt >= q->hh_flows_limit) { q->hh_flows_overlimit++; return NULL; } /* Create new entry. */ flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC); if (!flow) return NULL; q->hh_flows_current_cnt++; INIT_LIST_HEAD(&flow->flowchain); list_add_tail(&flow->flowchain, head); return flow; } /* Assigns packets to WDRR buckets. Implements a multi-stage filter to * classify heavy-hitters. */ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) { struct hhf_sched_data *q = qdisc_priv(sch); u32 tmp_hash, hash; u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos; struct hh_flow_state *flow; u32 pkt_len, min_hhf_val; int i; u32 prev; u32 now = hhf_time_stamp(); /* Reset the HHF counter arrays if this is the right time. */ prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout; if (hhf_time_before(prev, now)) { for (i = 0; i < HHF_ARRAYS_CNT; i++) bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN); q->hhf_arrays_reset_timestamp = now; } /* Get hashed flow-id of the skb. */ hash = skb_get_hash_perturb(skb, &q->perturbation); /* Check if this packet belongs to an already established HH flow. */ flow_pos = hash & HHF_BIT_MASK; flow = seek_list(hash, &q->hh_flows[flow_pos], q); if (flow) { /* found its HH flow */ flow->hit_timestamp = now; return WDRR_BUCKET_FOR_HH; } /* Now pass the packet through the multi-stage filter. */ tmp_hash = hash; xorsum = 0; for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) { /* Split the skb_hash into three 10-bit chunks. */ filter_pos[i] = tmp_hash & HHF_BIT_MASK; xorsum ^= filter_pos[i]; tmp_hash >>= HHF_BIT_MASK_LEN; } /* The last chunk is computed as XOR sum of other chunks. */ filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash; pkt_len = qdisc_pkt_len(skb); min_hhf_val = ~0U; for (i = 0; i < HHF_ARRAYS_CNT; i++) { u32 val; if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) { q->hhf_arrays[i][filter_pos[i]] = 0; __set_bit(filter_pos[i], q->hhf_valid_bits[i]); } val = q->hhf_arrays[i][filter_pos[i]] + pkt_len; if (min_hhf_val > val) min_hhf_val = val; } /* Found a new HH iff all counter values > HH admit threshold. */ if (min_hhf_val > q->hhf_admit_bytes) { /* Just captured a new heavy-hitter. */ flow = alloc_new_hh(&q->hh_flows[flow_pos], q); if (!flow) /* memory alloc problem */ return WDRR_BUCKET_FOR_NON_HH; flow->hash_id = hash; flow->hit_timestamp = now; q->hh_flows_total_cnt++; /* By returning without updating counters in q->hhf_arrays, * we implicitly implement "shielding" (see Optimization O1). */ return WDRR_BUCKET_FOR_HH; } /* Conservative update of HHF arrays (see Optimization O2). */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val) q->hhf_arrays[i][filter_pos[i]] = min_hhf_val; } return WDRR_BUCKET_FOR_NON_HH; } /* Removes one skb from head of bucket. */ static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket) { struct sk_buff *skb = bucket->head; bucket->head = skb->next; skb_mark_not_on_list(skb); return skb; } /* Tail-adds skb to bucket. */ static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb) { if (bucket->head == NULL) bucket->head = skb; else bucket->tail->next = skb; bucket->tail = skb; skb->next = NULL; } static unsigned int hhf_drop(struct Qdisc *sch, struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); struct wdrr_bucket *bucket; /* Always try to drop from heavy-hitters first. */ bucket = &q->buckets[WDRR_BUCKET_FOR_HH]; if (!bucket->head) bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH]; if (bucket->head) { struct sk_buff *skb = dequeue_head(bucket); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch, to_free); } /* Return id of the bucket from which the packet was dropped. */ return bucket - q->buckets; } static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); enum wdrr_bucket_idx idx; struct wdrr_bucket *bucket; unsigned int prev_backlog; idx = hhf_classify(skb, sch); bucket = &q->buckets[idx]; bucket_add(bucket, skb); qdisc_qstats_backlog_inc(sch, skb); if (list_empty(&bucket->bucketchain)) { unsigned int weight; /* The logic of new_buckets vs. old_buckets is the same as * new_flows vs. old_flows in the implementation of fq_codel, * i.e., short bursts of non-HHs should have strict priority. */ if (idx == WDRR_BUCKET_FOR_HH) { /* Always move heavy-hitters to old bucket. */ weight = 1; list_add_tail(&bucket->bucketchain, &q->old_buckets); } else { weight = q->hhf_non_hh_weight; list_add_tail(&bucket->bucketchain, &q->new_buckets); } bucket->deficit = weight * q->quantum; } if (++sch->q.qlen <= sch->limit) return NET_XMIT_SUCCESS; prev_backlog = sch->qstats.backlog; q->drop_overlimit++; /* Return Congestion Notification only if we dropped a packet from this * bucket. */ if (hhf_drop(sch, to_free) == idx) return NET_XMIT_CN; /* As we dropped a packet, better let upper stack know this. */ qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog); return NET_XMIT_SUCCESS; } static struct sk_buff *hhf_dequeue(struct Qdisc *sch) { struct hhf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = NULL; struct wdrr_bucket *bucket; struct list_head *head; begin: head = &q->new_buckets; if (list_empty(head)) { head = &q->old_buckets; if (list_empty(head)) return NULL; } bucket = list_first_entry(head, struct wdrr_bucket, bucketchain); if (bucket->deficit <= 0) { int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ? 1 : q->hhf_non_hh_weight; bucket->deficit += weight * q->quantum; list_move_tail(&bucket->bucketchain, &q->old_buckets); goto begin; } if (bucket->head) { skb = dequeue_head(bucket); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); } if (!skb) { /* Force a pass through old_buckets to prevent starvation. */ if ((head == &q->new_buckets) && !list_empty(&q->old_buckets)) list_move_tail(&bucket->bucketchain, &q->old_buckets); else list_del_init(&bucket->bucketchain); goto begin; } qdisc_bstats_update(sch, skb); bucket->deficit -= qdisc_pkt_len(skb); return skb; } static void hhf_reset(struct Qdisc *sch) { struct sk_buff *skb; while ((skb = hhf_dequeue(sch)) != NULL) rtnl_kfree_skbs(skb, skb); } static void hhf_destroy(struct Qdisc *sch) { int i; struct hhf_sched_data *q = qdisc_priv(sch); for (i = 0; i < HHF_ARRAYS_CNT; i++) { kvfree(q->hhf_arrays[i]); kvfree(q->hhf_valid_bits[i]); } if (!q->hh_flows) return; for (i = 0; i < HH_FLOWS_CNT; i++) { struct hh_flow_state *flow, *next; struct list_head *head = &q->hh_flows[i]; if (list_empty(head)) continue; list_for_each_entry_safe(flow, next, head, flowchain) { list_del(&flow->flowchain); kfree(flow); } } kvfree(q->hh_flows); } static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { [TCA_HHF_BACKLOG_LIMIT] = { .type = NLA_U32 }, [TCA_HHF_QUANTUM] = { .type = NLA_U32 }, [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 }, [TCA_HHF_RESET_TIMEOUT] = { .type = NLA_U32 }, [TCA_HHF_ADMIT_BYTES] = { .type = NLA_U32 }, [TCA_HHF_EVICT_TIMEOUT] = { .type = NLA_U32 }, [TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 }, }; static int hhf_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_HHF_MAX + 1]; unsigned int qlen, prev_backlog; int err; u64 non_hh_quantum; u32 new_quantum = q->quantum; u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight; err = nla_parse_nested_deprecated(tb, TCA_HHF_MAX, opt, hhf_policy, NULL); if (err < 0) return err; if (tb[TCA_HHF_QUANTUM]) new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]); if (tb[TCA_HHF_NON_HH_WEIGHT]) new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]); non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight; if (non_hh_quantum == 0 || non_hh_quantum > INT_MAX) return -EINVAL; sch_tree_lock(sch); if (tb[TCA_HHF_BACKLOG_LIMIT]) WRITE_ONCE(sch->limit, nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT])); WRITE_ONCE(q->quantum, new_quantum); WRITE_ONCE(q->hhf_non_hh_weight, new_hhf_non_hh_weight); if (tb[TCA_HHF_HH_FLOWS_LIMIT]) WRITE_ONCE(q->hh_flows_limit, nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT])); if (tb[TCA_HHF_RESET_TIMEOUT]) { u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]); WRITE_ONCE(q->hhf_reset_timeout, usecs_to_jiffies(us)); } if (tb[TCA_HHF_ADMIT_BYTES]) WRITE_ONCE(q->hhf_admit_bytes, nla_get_u32(tb[TCA_HHF_ADMIT_BYTES])); if (tb[TCA_HHF_EVICT_TIMEOUT]) { u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]); WRITE_ONCE(q->hhf_evict_timeout, usecs_to_jiffies(us)); } qlen = sch->q.qlen; prev_backlog = sch->qstats.backlog; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = hhf_dequeue(sch); rtnl_kfree_skbs(skb, skb); } qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, prev_backlog - sch->qstats.backlog); sch_tree_unlock(sch); return 0; } static int hhf_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct hhf_sched_data *q = qdisc_priv(sch); int i; sch->limit = 1000; q->quantum = psched_mtu(qdisc_dev(sch)); get_random_bytes(&q->perturbation, sizeof(q->perturbation)); INIT_LIST_HEAD(&q->new_buckets); INIT_LIST_HEAD(&q->old_buckets); /* Configurable HHF parameters */ q->hhf_reset_timeout = HZ / 25; /* 40 ms */ q->hhf_admit_bytes = 131072; /* 128 KB */ q->hhf_evict_timeout = HZ; /* 1 sec */ q->hhf_non_hh_weight = 2; if (opt) { int err = hhf_change(sch, opt, extack); if (err) return err; } if (!q->hh_flows) { /* Initialize heavy-hitter flow table. */ q->hh_flows = kvcalloc(HH_FLOWS_CNT, sizeof(struct list_head), GFP_KERNEL); if (!q->hh_flows) return -ENOMEM; for (i = 0; i < HH_FLOWS_CNT; i++) INIT_LIST_HEAD(&q->hh_flows[i]); /* Cap max active HHs at twice len of hh_flows table. */ q->hh_flows_limit = 2 * HH_FLOWS_CNT; q->hh_flows_overlimit = 0; q->hh_flows_total_cnt = 0; q->hh_flows_current_cnt = 0; /* Initialize heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { q->hhf_arrays[i] = kvcalloc(HHF_ARRAYS_LEN, sizeof(u32), GFP_KERNEL); if (!q->hhf_arrays[i]) { /* Note: hhf_destroy() will be called * by our caller. */ return -ENOMEM; } } q->hhf_arrays_reset_timestamp = hhf_time_stamp(); /* Initialize valid bits of heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { q->hhf_valid_bits[i] = kvzalloc(HHF_ARRAYS_LEN / BITS_PER_BYTE, GFP_KERNEL); if (!q->hhf_valid_bits[i]) { /* Note: hhf_destroy() will be called * by our caller. */ return -ENOMEM; } } /* Initialize Weighted DRR buckets. */ for (i = 0; i < WDRR_BUCKET_CNT; i++) { struct wdrr_bucket *bucket = q->buckets + i; INIT_LIST_HEAD(&bucket->bucketchain); } } return 0; } static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) { struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *opts; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_HHF_QUANTUM, READ_ONCE(q->quantum)) || nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, READ_ONCE(q->hh_flows_limit)) || nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT, jiffies_to_usecs(READ_ONCE(q->hhf_reset_timeout))) || nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, READ_ONCE(q->hhf_admit_bytes)) || nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT, jiffies_to_usecs(READ_ONCE(q->hhf_evict_timeout))) || nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, READ_ONCE(q->hhf_non_hh_weight))) goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: return -1; } static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct hhf_sched_data *q = qdisc_priv(sch); struct tc_hhf_xstats st = { .drop_overlimit = q->drop_overlimit, .hh_overlimit = q->hh_flows_overlimit, .hh_tot_count = q->hh_flows_total_cnt, .hh_cur_count = q->hh_flows_current_cnt, }; return gnet_stats_copy_app(d, &st, sizeof(st)); } static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { .id = "hhf", .priv_size = sizeof(struct hhf_sched_data), .enqueue = hhf_enqueue, .dequeue = hhf_dequeue, .peek = qdisc_peek_dequeued, .init = hhf_init, .reset = hhf_reset, .destroy = hhf_destroy, .change = hhf_change, .dump = hhf_dump, .dump_stats = hhf_dump_stats, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("hhf"); static int __init hhf_module_init(void) { return register_qdisc(&hhf_qdisc_ops); } static void __exit hhf_module_exit(void) { unregister_qdisc(&hhf_qdisc_ops); } module_init(hhf_module_init) module_exit(hhf_module_exit) MODULE_AUTHOR("Terry Lam"); MODULE_AUTHOR("Nandita Dukkipati"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Heavy-Hitter Filter (HHF)");
1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * SM4 Cipher Algorithm, AES-NI/AVX optimized. * as specified in * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html * * Copyright (c) 2021, Alibaba Group. * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> */ #include <linux/module.h> #include <linux/crypto.h> #include <linux/kernel.h> #include <asm/simd.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/sm4.h> #include "sm4-avx.h" #define SM4_CRYPT8_BLOCK_SIZE (SM4_BLOCK_SIZE * 8) asmlinkage void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst, const u8 *src, int nblocks); asmlinkage void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst, const u8 *src, int nblocks); asmlinkage void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst, const u8 *src, u8 *iv); asmlinkage void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst, const u8 *src, u8 *iv); static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int key_len) { struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); return sm4_expandkey(ctx, key, key_len); } static int ecb_do_crypt(struct skcipher_request *req, const u32 *rkey) { struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) > 0) { const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; kernel_fpu_begin(); while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) { sm4_aesni_avx_crypt8(rkey, dst, src, 8); dst += SM4_CRYPT8_BLOCK_SIZE; src += SM4_CRYPT8_BLOCK_SIZE; nbytes -= SM4_CRYPT8_BLOCK_SIZE; } while (nbytes >= SM4_BLOCK_SIZE) { unsigned int nblocks = min(nbytes >> 4, 4u); sm4_aesni_avx_crypt4(rkey, dst, src, nblocks); dst += nblocks * SM4_BLOCK_SIZE; src += nblocks * SM4_BLOCK_SIZE; nbytes -= nblocks * SM4_BLOCK_SIZE; } kernel_fpu_end(); err = skcipher_walk_done(&walk, nbytes); } return err; } int sm4_avx_ecb_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); return ecb_do_crypt(req, ctx->rkey_enc); } EXPORT_SYMBOL_GPL(sm4_avx_ecb_encrypt); int sm4_avx_ecb_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); return ecb_do_crypt(req, ctx->rkey_dec); } EXPORT_SYMBOL_GPL(sm4_avx_ecb_decrypt); int sm4_cbc_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) > 0) { const u8 *iv = walk.iv; const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; while (nbytes >= SM4_BLOCK_SIZE) { crypto_xor_cpy(dst, src, iv, SM4_BLOCK_SIZE); sm4_crypt_block(ctx->rkey_enc, dst, dst); iv = dst; src += SM4_BLOCK_SIZE; dst += SM4_BLOCK_SIZE; nbytes -= SM4_BLOCK_SIZE; } if (iv != walk.iv) memcpy(walk.iv, iv, SM4_BLOCK_SIZE); err = skcipher_walk_done(&walk, nbytes); } return err; } EXPORT_SYMBOL_GPL(sm4_cbc_encrypt); int sm4_avx_cbc_decrypt(struct skcipher_request *req, unsigned int bsize, sm4_crypt_func func) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) > 0) { const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; kernel_fpu_begin(); while (nbytes >= bsize) { func(ctx->rkey_dec, dst, src, walk.iv); dst += bsize; src += bsize; nbytes -= bsize; } while (nbytes >= SM4_BLOCK_SIZE) { u8 keystream[SM4_BLOCK_SIZE * 8]; u8 iv[SM4_BLOCK_SIZE]; unsigned int nblocks = min(nbytes >> 4, 8u); int i; sm4_aesni_avx_crypt8(ctx->rkey_dec, keystream, src, nblocks); src += ((int)nblocks - 2) * SM4_BLOCK_SIZE; dst += (nblocks - 1) * SM4_BLOCK_SIZE; memcpy(iv, src + SM4_BLOCK_SIZE, SM4_BLOCK_SIZE); for (i = nblocks - 1; i > 0; i--) { crypto_xor_cpy(dst, src, &keystream[i * SM4_BLOCK_SIZE], SM4_BLOCK_SIZE); src -= SM4_BLOCK_SIZE; dst -= SM4_BLOCK_SIZE; } crypto_xor_cpy(dst, walk.iv, keystream, SM4_BLOCK_SIZE); memcpy(walk.iv, iv, SM4_BLOCK_SIZE); dst += nblocks * SM4_BLOCK_SIZE; src += (nblocks + 1) * SM4_BLOCK_SIZE; nbytes -= nblocks * SM4_BLOCK_SIZE; } kernel_fpu_end(); err = skcipher_walk_done(&walk, nbytes); } return err; } EXPORT_SYMBOL_GPL(sm4_avx_cbc_decrypt); static int cbc_decrypt(struct skcipher_request *req) { return sm4_avx_cbc_decrypt(req, SM4_CRYPT8_BLOCK_SIZE, sm4_aesni_avx_cbc_dec_blk8); } int sm4_avx_ctr_crypt(struct skcipher_request *req, unsigned int bsize, sm4_crypt_func func) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) > 0) { const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; kernel_fpu_begin(); while (nbytes >= bsize) { func(ctx->rkey_enc, dst, src, walk.iv); dst += bsize; src += bsize; nbytes -= bsize; } while (nbytes >= SM4_BLOCK_SIZE) { u8 keystream[SM4_BLOCK_SIZE * 8]; unsigned int nblocks = min(nbytes >> 4, 8u); int i; for (i = 0; i < nblocks; i++) { memcpy(&keystream[i * SM4_BLOCK_SIZE], walk.iv, SM4_BLOCK_SIZE); crypto_inc(walk.iv, SM4_BLOCK_SIZE); } sm4_aesni_avx_crypt8(ctx->rkey_enc, keystream, keystream, nblocks); crypto_xor_cpy(dst, src, keystream, nblocks * SM4_BLOCK_SIZE); dst += nblocks * SM4_BLOCK_SIZE; src += nblocks * SM4_BLOCK_SIZE; nbytes -= nblocks * SM4_BLOCK_SIZE; } kernel_fpu_end(); /* tail */ if (walk.nbytes == walk.total && nbytes > 0) { u8 keystream[SM4_BLOCK_SIZE]; memcpy(keystream, walk.iv, SM4_BLOCK_SIZE); crypto_inc(walk.iv, SM4_BLOCK_SIZE); sm4_crypt_block(ctx->rkey_enc, keystream, keystream); crypto_xor_cpy(dst, src, keystream, nbytes); dst += nbytes; src += nbytes; nbytes = 0; } err = skcipher_walk_done(&walk, nbytes); } return err; } EXPORT_SYMBOL_GPL(sm4_avx_ctr_crypt); static int ctr_crypt(struct skcipher_request *req) { return sm4_avx_ctr_crypt(req, SM4_CRYPT8_BLOCK_SIZE, sm4_aesni_avx_ctr_enc_blk8); } static struct skcipher_alg sm4_aesni_avx_skciphers[] = { { .base = { .cra_name = "__ecb(sm4)", .cra_driver_name = "__ecb-sm4-aesni-avx", .cra_priority = 400, .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, }, .min_keysize = SM4_KEY_SIZE, .max_keysize = SM4_KEY_SIZE, .walksize = 8 * SM4_BLOCK_SIZE, .setkey = sm4_skcipher_setkey, .encrypt = sm4_avx_ecb_encrypt, .decrypt = sm4_avx_ecb_decrypt, }, { .base = { .cra_name = "__cbc(sm4)", .cra_driver_name = "__cbc-sm4-aesni-avx", .cra_priority = 400, .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, }, .min_keysize = SM4_KEY_SIZE, .max_keysize = SM4_KEY_SIZE, .ivsize = SM4_BLOCK_SIZE, .walksize = 8 * SM4_BLOCK_SIZE, .setkey = sm4_skcipher_setkey, .encrypt = sm4_cbc_encrypt, .decrypt = cbc_decrypt, }, { .base = { .cra_name = "__ctr(sm4)", .cra_driver_name = "__ctr-sm4-aesni-avx", .cra_priority = 400, .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, }, .min_keysize = SM4_KEY_SIZE, .max_keysize = SM4_KEY_SIZE, .ivsize = SM4_BLOCK_SIZE, .chunksize = SM4_BLOCK_SIZE, .walksize = 8 * SM4_BLOCK_SIZE, .setkey = sm4_skcipher_setkey, .encrypt = ctr_crypt, .decrypt = ctr_crypt, } }; static struct simd_skcipher_alg * simd_sm4_aesni_avx_skciphers[ARRAY_SIZE(sm4_aesni_avx_skciphers)]; static int __init sm4_init(void) { const char *feature_name; if (!boot_cpu_has(X86_FEATURE_AVX) || !boot_cpu_has(X86_FEATURE_AES) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX or AES-NI instructions are not detected.\n"); return -ENODEV; } if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } return simd_register_skciphers_compat(sm4_aesni_avx_skciphers, ARRAY_SIZE(sm4_aesni_avx_skciphers), simd_sm4_aesni_avx_skciphers); } static void __exit sm4_exit(void) { simd_unregister_skciphers(sm4_aesni_avx_skciphers, ARRAY_SIZE(sm4_aesni_avx_skciphers), simd_sm4_aesni_avx_skciphers); } module_init(sm4_init); module_exit(sm4_exit); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>"); MODULE_DESCRIPTION("SM4 Cipher Algorithm, AES-NI/AVX optimized"); MODULE_ALIAS_CRYPTO("sm4"); MODULE_ALIAS_CRYPTO("sm4-aesni-avx");
15 1 1 1 6 6 8 1 3 6 1 2 2 1 1 1 7 1 5 1 4 2 4 2 5 1 5 1 6 6 6 5 5 5 5 5 5 5 4 1 1 5 5 5 2 1 6 6 1 1 1 1 1 1 3 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, Intel Corporation. * * Author: Alexander Duyck <alexander.h.duyck@intel.com> */ #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/dsfield.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> #include <linux/tc_act/tc_skbedit.h> #include <net/tc_act/tc_skbedit.h> static struct tc_action_ops act_skbedit_ops; static u16 tcf_skbedit_hash(struct tcf_skbedit_params *params, struct sk_buff *skb) { u16 queue_mapping = params->queue_mapping; if (params->flags & SKBEDIT_F_TXQ_SKBHASH) { u32 hash = skb_get_hash(skb); queue_mapping += hash % params->mapping_mod; } return netdev_cap_txqueue(skb->dev, queue_mapping); } TC_INDIRECT_SCOPE int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_skbedit *d = to_skbedit(a); struct tcf_skbedit_params *params; int action; tcf_lastuse_update(&d->tcf_tm); bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); params = rcu_dereference_bh(d->params); action = READ_ONCE(d->tcf_action); if (params->flags & SKBEDIT_F_PRIORITY) skb->priority = params->priority; if (params->flags & SKBEDIT_F_INHERITDSFIELD) { int wlen = skb_network_offset(skb); switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): wlen += sizeof(struct iphdr); if (!pskb_may_pull(skb, wlen)) goto err; skb->priority = ipv4_get_dsfield(ip_hdr(skb)) >> 2; break; case htons(ETH_P_IPV6): wlen += sizeof(struct ipv6hdr); if (!pskb_may_pull(skb, wlen)) goto err; skb->priority = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; break; } } if (params->flags & SKBEDIT_F_QUEUE_MAPPING && skb->dev->real_num_tx_queues > params->queue_mapping) { #ifdef CONFIG_NET_EGRESS netdev_xmit_skip_txqueue(true); #endif skb_set_queue_mapping(skb, tcf_skbedit_hash(params, skb)); } if (params->flags & SKBEDIT_F_MARK) { skb->mark &= ~params->mask; skb->mark |= params->mark & params->mask; } if (params->flags & SKBEDIT_F_PTYPE) skb->pkt_type = params->ptype; return action; err: qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats)); return TC_ACT_SHOT; } static void tcf_skbedit_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_skbedit *d = to_skbedit(a); struct tcf_t *tm = &d->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { [TCA_SKBEDIT_PARMS] = { .len = sizeof(struct tc_skbedit) }, [TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) }, [TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) }, [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) }, [TCA_SKBEDIT_MASK] = { .len = sizeof(u32) }, [TCA_SKBEDIT_FLAGS] = { .len = sizeof(u64) }, [TCA_SKBEDIT_QUEUE_MAPPING_MAX] = { .len = sizeof(u16) }, }; static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_skbedit_ops.net_id); bool bind = act_flags & TCA_ACT_FLAGS_BIND; struct tcf_skbedit_params *params_new; struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_skbedit *parm; struct tcf_skbedit *d; u32 flags = 0, *priority = NULL, *mark = NULL, *mask = NULL; u16 *queue_mapping = NULL, *ptype = NULL; u16 mapping_mod = 1; bool exists = false; int ret = 0, err; u32 index; if (nla == NULL) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy, NULL); if (err < 0) return err; if (tb[TCA_SKBEDIT_PARMS] == NULL) return -EINVAL; if (tb[TCA_SKBEDIT_PRIORITY] != NULL) { flags |= SKBEDIT_F_PRIORITY; priority = nla_data(tb[TCA_SKBEDIT_PRIORITY]); } if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) { if (is_tcf_skbedit_ingress(act_flags) && !(act_flags & TCA_ACT_FLAGS_SKIP_SW)) { NL_SET_ERR_MSG_MOD(extack, "\"queue_mapping\" option on receive side is hardware only, use skip_sw"); return -EOPNOTSUPP; } flags |= SKBEDIT_F_QUEUE_MAPPING; queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]); } if (tb[TCA_SKBEDIT_PTYPE] != NULL) { ptype = nla_data(tb[TCA_SKBEDIT_PTYPE]); if (!skb_pkt_type_ok(*ptype)) return -EINVAL; flags |= SKBEDIT_F_PTYPE; } if (tb[TCA_SKBEDIT_MARK] != NULL) { flags |= SKBEDIT_F_MARK; mark = nla_data(tb[TCA_SKBEDIT_MARK]); } if (tb[TCA_SKBEDIT_MASK] != NULL) { flags |= SKBEDIT_F_MASK; mask = nla_data(tb[TCA_SKBEDIT_MASK]); } if (tb[TCA_SKBEDIT_FLAGS] != NULL) { u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]); if (*pure_flags & SKBEDIT_F_TXQ_SKBHASH) { u16 *queue_mapping_max; if (!tb[TCA_SKBEDIT_QUEUE_MAPPING] || !tb[TCA_SKBEDIT_QUEUE_MAPPING_MAX]) { NL_SET_ERR_MSG_MOD(extack, "Missing required range of queue_mapping."); return -EINVAL; } queue_mapping_max = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING_MAX]); if (*queue_mapping_max < *queue_mapping) { NL_SET_ERR_MSG_MOD(extack, "The range of queue_mapping is invalid, max < min."); return -EINVAL; } mapping_mod = *queue_mapping_max - *queue_mapping + 1; flags |= SKBEDIT_F_TXQ_SKBHASH; } if (*pure_flags & SKBEDIT_F_INHERITDSFIELD) flags |= SKBEDIT_F_INHERITDSFIELD; } parm = nla_data(tb[TCA_SKBEDIT_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return ACT_P_BOUND; if (!flags) { if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_skbedit_ops, bind, true, act_flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } d = to_skbedit(*a); ret = ACT_P_CREATED; } else { d = to_skbedit(*a); if (!(act_flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { err = -ENOMEM; goto put_chain; } params_new->flags = flags; if (flags & SKBEDIT_F_PRIORITY) params_new->priority = *priority; if (flags & SKBEDIT_F_QUEUE_MAPPING) { params_new->queue_mapping = *queue_mapping; params_new->mapping_mod = mapping_mod; } if (flags & SKBEDIT_F_MARK) params_new->mark = *mark; if (flags & SKBEDIT_F_PTYPE) params_new->ptype = *ptype; /* default behaviour is to use all the bits */ params_new->mask = 0xffffffff; if (flags & SKBEDIT_F_MASK) params_new->mask = *mask; spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params_new = rcu_replace_pointer(d->params, params_new, lockdep_is_held(&d->tcf_lock)); spin_unlock_bh(&d->tcf_lock); if (params_new) kfree_rcu(params_new, rcu); if (goto_ch) tcf_chain_put_by_act(goto_ch); return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_skbedit *d = to_skbedit(a); struct tcf_skbedit_params *params; struct tc_skbedit opt = { .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, }; u64 pure_flags = 0; struct tcf_t t; spin_lock_bh(&d->tcf_lock); params = rcu_dereference_protected(d->params, lockdep_is_held(&d->tcf_lock)); opt.action = d->tcf_action; if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if ((params->flags & SKBEDIT_F_PRIORITY) && nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, params->priority)) goto nla_put_failure; if ((params->flags & SKBEDIT_F_QUEUE_MAPPING) && nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, params->queue_mapping)) goto nla_put_failure; if ((params->flags & SKBEDIT_F_MARK) && nla_put_u32(skb, TCA_SKBEDIT_MARK, params->mark)) goto nla_put_failure; if ((params->flags & SKBEDIT_F_PTYPE) && nla_put_u16(skb, TCA_SKBEDIT_PTYPE, params->ptype)) goto nla_put_failure; if ((params->flags & SKBEDIT_F_MASK) && nla_put_u32(skb, TCA_SKBEDIT_MASK, params->mask)) goto nla_put_failure; if (params->flags & SKBEDIT_F_INHERITDSFIELD) pure_flags |= SKBEDIT_F_INHERITDSFIELD; if (params->flags & SKBEDIT_F_TXQ_SKBHASH) { if (nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING_MAX, params->queue_mapping + params->mapping_mod - 1)) goto nla_put_failure; pure_flags |= SKBEDIT_F_TXQ_SKBHASH; } if (pure_flags != 0 && nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags)) goto nla_put_failure; tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) goto nla_put_failure; spin_unlock_bh(&d->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&d->tcf_lock); nlmsg_trim(skb, b); return -1; } static void tcf_skbedit_cleanup(struct tc_action *a) { struct tcf_skbedit *d = to_skbedit(a); struct tcf_skbedit_params *params; params = rcu_dereference_protected(d->params, 1); if (params) kfree_rcu(params, rcu); } static size_t tcf_skbedit_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_skbedit)) + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_PRIORITY */ + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_QUEUE_MAPPING */ + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_QUEUE_MAPPING_MAX */ + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_MARK */ + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_PTYPE */ + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_MASK */ + nla_total_size_64bit(sizeof(u64)); /* TCA_SKBEDIT_FLAGS */ } static int tcf_skbedit_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; if (is_tcf_skbedit_mark(act)) { entry->id = FLOW_ACTION_MARK; entry->mark = tcf_skbedit_mark(act); } else if (is_tcf_skbedit_ptype(act)) { entry->id = FLOW_ACTION_PTYPE; entry->ptype = tcf_skbedit_ptype(act); } else if (is_tcf_skbedit_priority(act)) { entry->id = FLOW_ACTION_PRIORITY; entry->priority = tcf_skbedit_priority(act); } else if (is_tcf_skbedit_tx_queue_mapping(act)) { NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"queue_mapping\" option is used on transmit side"); return -EOPNOTSUPP; } else if (is_tcf_skbedit_rx_queue_mapping(act)) { entry->id = FLOW_ACTION_RX_QUEUE_MAPPING; entry->rx_queue = tcf_skbedit_rx_queue_mapping(act); } else if (is_tcf_skbedit_inheritdsfield(act)) { NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"inheritdsfield\" option is used"); return -EOPNOTSUPP; } else { NL_SET_ERR_MSG_MOD(extack, "Unsupported skbedit option offload"); return -EOPNOTSUPP; } *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; if (is_tcf_skbedit_mark(act)) fl_action->id = FLOW_ACTION_MARK; else if (is_tcf_skbedit_ptype(act)) fl_action->id = FLOW_ACTION_PTYPE; else if (is_tcf_skbedit_priority(act)) fl_action->id = FLOW_ACTION_PRIORITY; else if (is_tcf_skbedit_rx_queue_mapping(act)) fl_action->id = FLOW_ACTION_RX_QUEUE_MAPPING; else return -EOPNOTSUPP; } return 0; } static struct tc_action_ops act_skbedit_ops = { .kind = "skbedit", .id = TCA_ID_SKBEDIT, .owner = THIS_MODULE, .act = tcf_skbedit_act, .stats_update = tcf_skbedit_stats_update, .dump = tcf_skbedit_dump, .init = tcf_skbedit_init, .cleanup = tcf_skbedit_cleanup, .get_fill_size = tcf_skbedit_get_fill_size, .offload_act_setup = tcf_skbedit_offload_act_setup, .size = sizeof(struct tcf_skbedit), }; MODULE_ALIAS_NET_ACT("skbedit"); static __net_init int skbedit_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_skbedit_ops.net_id); return tc_action_net_init(net, tn, &act_skbedit_ops); } static void __net_exit skbedit_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_skbedit_ops.net_id); } static struct pernet_operations skbedit_net_ops = { .init = skbedit_init_net, .exit_batch = skbedit_exit_net, .id = &act_skbedit_ops.net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>"); MODULE_DESCRIPTION("SKB Editing"); MODULE_LICENSE("GPL"); static int __init skbedit_init_module(void) { return tcf_register_action(&act_skbedit_ops, &skbedit_net_ops); } static void __exit skbedit_cleanup_module(void) { tcf_unregister_action(&act_skbedit_ops, &skbedit_net_ops); } module_init(skbedit_init_module); module_exit(skbedit_cleanup_module);
6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PSI_H #define _LINUX_PSI_H #include <linux/jump_label.h> #include <linux/psi_types.h> #include <linux/sched.h> #include <linux/poll.h> #include <linux/cgroup-defs.h> #include <linux/cgroup.h> struct seq_file; struct css_set; #ifdef CONFIG_PSI extern struct static_key_false psi_disabled; extern struct psi_group psi_system; void psi_init(void); void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, enum psi_res res, struct file *file, struct kernfs_open_file *of); void psi_trigger_destroy(struct psi_trigger *t); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); #ifdef CONFIG_CGROUPS static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) { return cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; } int psi_cgroup_alloc(struct cgroup *cgrp); void psi_cgroup_free(struct cgroup *cgrp); void cgroup_move_task(struct task_struct *p, struct css_set *to); void psi_cgroup_restart(struct psi_group *group); #endif #else /* CONFIG_PSI */ static inline void psi_init(void) {} static inline void psi_memstall_enter(unsigned long *flags) {} static inline void psi_memstall_leave(unsigned long *flags) {} #ifdef CONFIG_CGROUPS static inline int psi_cgroup_alloc(struct cgroup *cgrp) { return 0; } static inline void psi_cgroup_free(struct cgroup *cgrp) { } static inline void cgroup_move_task(struct task_struct *p, struct css_set *to) { rcu_assign_pointer(p->cgroups, to); } static inline void psi_cgroup_restart(struct psi_group *group) {} #endif #endif /* CONFIG_PSI */ #endif /* _LINUX_PSI_H */
3 2 2 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_SCHED_RED_H #define __NET_SCHED_RED_H #include <linux/types.h> #include <linux/bug.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> #include <net/dsfield.h> #include <linux/reciprocal_div.h> /* Random Early Detection (RED) algorithm. ======================================= Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking. This file codes a "divisionless" version of RED algorithm as written down in Fig.17 of the paper. Short description. ------------------ When a new packet arrives we calculate the average queue length: avg = (1-W)*avg + W*current_queue_len, W is the filter time constant (chosen as 2^(-Wlog)), it controls the inertia of the algorithm. To allow larger bursts, W should be decreased. if (avg > th_max) -> packet marked (dropped). if (avg < th_min) -> packet passes. if (th_min < avg < th_max) we calculate probability: Pb = max_P * (avg - th_min)/(th_max-th_min) and mark (drop) packet with this probability. Pb changes from 0 (at avg==th_min) to max_P (avg==th_max). max_P should be small (not 1), usually 0.01..0.02 is good value. max_P is chosen as a number, so that max_P/(th_max-th_min) is a negative power of two in order arithmetic to contain only shifts. Parameters, settable by user: ----------------------------- qth_min - bytes (should be < qth_max/2) qth_max - bytes (should be at least 2*qth_min and less limit) Wlog - bits (<32) log(1/W). Plog - bits (<32) Plog is related to max_P by formula: max_P = (qth_max-qth_min)/2^Plog; F.e. if qth_max=128K and qth_min=32K, then Plog=22 corresponds to max_P=0.02 Scell_log Stab Lookup table for log((1-W)^(t/t_ave). NOTES: Upper bound on W. ----------------- If you want to allow bursts of L packets of size S, you should choose W: L + 1 - th_min/S < (1-(1-W)^L)/W th_min/S = 32 th_min/S = 4 log(W) L -1 33 -2 35 -3 39 -4 46 -5 57 -6 75 -7 101 -8 135 -9 190 etc. */ /* * Adaptative RED : An Algorithm for Increasing the Robustness of RED's AQM * (Sally FLoyd, Ramakrishna Gummadi, and Scott Shenker) August 2001 * * Every 500 ms: * if (avg > target and max_p <= 0.5) * increase max_p : max_p += alpha; * else if (avg < target and max_p >= 0.01) * decrease max_p : max_p *= beta; * * target :[qth_min + 0.4*(qth_min - qth_max), * qth_min + 0.6*(qth_min - qth_max)]. * alpha : min(0.01, max_p / 4) * beta : 0.9 * max_P is a Q0.32 fixed point number (with 32 bits mantissa) * max_P between 0.01 and 0.5 (1% - 50%) [ Its no longer a negative power of two ] */ #define RED_ONE_PERCENT ((u32)DIV_ROUND_CLOSEST(1ULL<<32, 100)) #define MAX_P_MIN (1 * RED_ONE_PERCENT) #define MAX_P_MAX (50 * RED_ONE_PERCENT) #define MAX_P_ALPHA(val) min(MAX_P_MIN, val / 4) #define RED_STAB_SIZE 256 #define RED_STAB_MASK (RED_STAB_SIZE - 1) struct red_stats { u32 prob_drop; /* Early probability drops */ u32 prob_mark; /* Early probability marks */ u32 forced_drop; /* Forced drops, qavg > max_thresh */ u32 forced_mark; /* Forced marks, qavg > max_thresh */ u32 pdrop; /* Drops due to queue limits */ }; struct red_parms { /* Parameters */ u32 qth_min; /* Min avg length threshold: Wlog scaled */ u32 qth_max; /* Max avg length threshold: Wlog scaled */ u32 Scell_max; u32 max_P; /* probability, [0 .. 1.0] 32 scaled */ /* reciprocal_value(max_P / qth_delta) */ struct reciprocal_value max_P_reciprocal; u32 qth_delta; /* max_th - min_th */ u32 target_min; /* min_th + 0.4*(max_th - min_th) */ u32 target_max; /* min_th + 0.6*(max_th - min_th) */ u8 Scell_log; u8 Wlog; /* log(W) */ u8 Plog; /* random number bits */ u8 Stab[RED_STAB_SIZE]; }; struct red_vars { /* Variables */ int qcount; /* Number of packets since last random number generation */ u32 qR; /* Cached random number */ unsigned long qavg; /* Average queue length: Wlog scaled */ ktime_t qidlestart; /* Start of current idle period */ }; static inline u32 red_maxp(u8 Plog) { return Plog < 32 ? (~0U >> Plog) : ~0U; } static inline void red_set_vars(struct red_vars *v) { /* Reset average queue length, the value is strictly bound * to the parameters below, resetting hurts a bit but leaving * it might result in an unreasonable qavg for a while. --TGR */ v->qavg = 0; v->qcount = -1; } static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog, u8 Scell_log, u8 *stab) { if (fls(qth_min) + Wlog >= 32) return false; if (fls(qth_max) + Wlog >= 32) return false; if (Scell_log >= 32) return false; if (qth_max < qth_min) return false; if (stab) { int i; for (i = 0; i < RED_STAB_SIZE; i++) if (stab[i] >= 32) return false; } return true; } static inline int red_get_flags(unsigned char qopt_flags, unsigned char historic_mask, struct nlattr *flags_attr, unsigned char supported_mask, struct nla_bitfield32 *p_flags, unsigned char *p_userbits, struct netlink_ext_ack *extack) { struct nla_bitfield32 flags; if (qopt_flags && flags_attr) { NL_SET_ERR_MSG_MOD(extack, "flags should be passed either through qopt, or through a dedicated attribute"); return -EINVAL; } if (flags_attr) { flags = nla_get_bitfield32(flags_attr); } else { flags.selector = historic_mask; flags.value = qopt_flags & historic_mask; } *p_flags = flags; *p_userbits = qopt_flags & ~historic_mask; return 0; } static inline int red_validate_flags(unsigned char flags, struct netlink_ext_ack *extack) { if ((flags & TC_RED_NODROP) && !(flags & TC_RED_ECN)) { NL_SET_ERR_MSG_MOD(extack, "nodrop mode is only meaningful with ECN"); return -EINVAL; } return 0; } static inline void red_set_parms(struct red_parms *p, u32 qth_min, u32 qth_max, u8 Wlog, u8 Plog, u8 Scell_log, u8 *stab, u32 max_P) { int delta = qth_max - qth_min; u32 max_p_delta; WRITE_ONCE(p->qth_min, qth_min << Wlog); WRITE_ONCE(p->qth_max, qth_max << Wlog); WRITE_ONCE(p->Wlog, Wlog); WRITE_ONCE(p->Plog, Plog); if (delta <= 0) delta = 1; p->qth_delta = delta; if (!max_P) { max_P = red_maxp(Plog); max_P *= delta; /* max_P = (qth_max - qth_min)/2^Plog */ } WRITE_ONCE(p->max_P, max_P); max_p_delta = max_P / delta; max_p_delta = max(max_p_delta, 1U); p->max_P_reciprocal = reciprocal_value(max_p_delta); /* RED Adaptative target : * [min_th + 0.4*(min_th - max_th), * min_th + 0.6*(min_th - max_th)]. */ delta /= 5; p->target_min = qth_min + 2*delta; p->target_max = qth_min + 3*delta; WRITE_ONCE(p->Scell_log, Scell_log); p->Scell_max = (255 << Scell_log); if (stab) memcpy(p->Stab, stab, sizeof(p->Stab)); } static inline int red_is_idling(const struct red_vars *v) { return v->qidlestart != 0; } static inline void red_start_of_idle_period(struct red_vars *v) { v->qidlestart = ktime_get(); } static inline void red_end_of_idle_period(struct red_vars *v) { v->qidlestart = 0; } static inline void red_restart(struct red_vars *v) { red_end_of_idle_period(v); v->qavg = 0; v->qcount = -1; } static inline unsigned long red_calc_qavg_from_idle_time(const struct red_parms *p, const struct red_vars *v) { s64 delta = ktime_us_delta(ktime_get(), v->qidlestart); long us_idle = min_t(s64, delta, p->Scell_max); int shift; /* * The problem: ideally, average length queue recalculation should * be done over constant clock intervals. This is too expensive, so * that the calculation is driven by outgoing packets. * When the queue is idle we have to model this clock by hand. * * SF+VJ proposed to "generate": * * m = idletime / (average_pkt_size / bandwidth) * * dummy packets as a burst after idle time, i.e. * * v->qavg *= (1-W)^m * * This is an apparently overcomplicated solution (f.e. we have to * precompute a table to make this calculation in reasonable time) * I believe that a simpler model may be used here, * but it is field for experiments. */ shift = p->Stab[(us_idle >> p->Scell_log) & RED_STAB_MASK]; if (shift) return v->qavg >> shift; else { /* Approximate initial part of exponent with linear function: * * (1-W)^m ~= 1-mW + ... * * Seems, it is the best solution to * problem of too coarse exponent tabulation. */ us_idle = (v->qavg * (u64)us_idle) >> p->Scell_log; if (us_idle < (v->qavg >> 1)) return v->qavg - us_idle; else return v->qavg >> 1; } } static inline unsigned long red_calc_qavg_no_idle_time(const struct red_parms *p, const struct red_vars *v, unsigned int backlog) { /* * NOTE: v->qavg is fixed point number with point at Wlog. * The formula below is equivalent to floating point * version: * * qavg = qavg*(1-W) + backlog*W; * * --ANK (980924) */ return v->qavg + (backlog - (v->qavg >> p->Wlog)); } static inline unsigned long red_calc_qavg(const struct red_parms *p, const struct red_vars *v, unsigned int backlog) { if (!red_is_idling(v)) return red_calc_qavg_no_idle_time(p, v, backlog); else return red_calc_qavg_from_idle_time(p, v); } static inline u32 red_random(const struct red_parms *p) { return reciprocal_divide(get_random_u32(), p->max_P_reciprocal); } static inline int red_mark_probability(const struct red_parms *p, const struct red_vars *v, unsigned long qavg) { /* The formula used below causes questions. OK. qR is random number in the interval (0..1/max_P)*(qth_max-qth_min) i.e. 0..(2^Plog). If we used floating point arithmetic, it would be: (2^Plog)*rnd_num, where rnd_num is less 1. Taking into account, that qavg have fixed point at Wlog, two lines below have the following floating point equivalent: max_P*(qavg - qth_min)/(qth_max-qth_min) < rnd/qcount Any questions? --ANK (980924) */ return !(((qavg - p->qth_min) >> p->Wlog) * v->qcount < v->qR); } enum { RED_BELOW_MIN_THRESH, RED_BETWEEN_TRESH, RED_ABOVE_MAX_TRESH, }; static inline int red_cmp_thresh(const struct red_parms *p, unsigned long qavg) { if (qavg < p->qth_min) return RED_BELOW_MIN_THRESH; else if (qavg >= p->qth_max) return RED_ABOVE_MAX_TRESH; else return RED_BETWEEN_TRESH; } enum { RED_DONT_MARK, RED_PROB_MARK, RED_HARD_MARK, }; static inline int red_action(const struct red_parms *p, struct red_vars *v, unsigned long qavg) { switch (red_cmp_thresh(p, qavg)) { case RED_BELOW_MIN_THRESH: v->qcount = -1; return RED_DONT_MARK; case RED_BETWEEN_TRESH: if (++v->qcount) { if (red_mark_probability(p, v, qavg)) { v->qcount = 0; v->qR = red_random(p); return RED_PROB_MARK; } } else v->qR = red_random(p); return RED_DONT_MARK; case RED_ABOVE_MAX_TRESH: v->qcount = -1; return RED_HARD_MARK; } BUG(); return RED_DONT_MARK; } static inline void red_adaptative_algo(struct red_parms *p, struct red_vars *v) { unsigned long qavg; u32 max_p_delta; qavg = v->qavg; if (red_is_idling(v)) qavg = red_calc_qavg_from_idle_time(p, v); /* v->qavg is fixed point number with point at Wlog */ qavg >>= p->Wlog; if (qavg > p->target_max && p->max_P <= MAX_P_MAX) p->max_P += MAX_P_ALPHA(p->max_P); /* maxp = maxp + alpha */ else if (qavg < p->target_min && p->max_P >= MAX_P_MIN) p->max_P = (p->max_P/10)*9; /* maxp = maxp * Beta */ max_p_delta = DIV_ROUND_CLOSEST(p->max_P, p->qth_delta); max_p_delta = max(max_p_delta, 1U); p->max_P_reciprocal = reciprocal_value(max_p_delta); } #endif
38 1 19 18 1 2 2 34 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 7 1 3 1 6 2 7 1 22 7 21 8 24 4 17 16 2 16 2 20 6 12 2 7 2 5 24 2 13 10 1 1 13 12 7 3 17 23 7 36 3 26 7 32 8 8 8 8 29 3 25 7 7 26 5 22 2 2 1 4 8 6 1 19 1 1 27 11 33 26 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "netlink.h" #include "device.h" #include "peer.h" #include "socket.h" #include "queueing.h" #include "messages.h" #include <uapi/linux/wireguard.h> #include <linux/if.h> #include <net/genetlink.h> #include <net/sock.h> #include <crypto/utils.h> static struct genl_family genl_family; static const struct nla_policy device_policy[WGDEVICE_A_MAX + 1] = { [WGDEVICE_A_IFINDEX] = { .type = NLA_U32 }, [WGDEVICE_A_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [WGDEVICE_A_PRIVATE_KEY] = NLA_POLICY_EXACT_LEN(NOISE_PUBLIC_KEY_LEN), [WGDEVICE_A_PUBLIC_KEY] = NLA_POLICY_EXACT_LEN(NOISE_PUBLIC_KEY_LEN), [WGDEVICE_A_FLAGS] = { .type = NLA_U32 }, [WGDEVICE_A_LISTEN_PORT] = { .type = NLA_U16 }, [WGDEVICE_A_FWMARK] = { .type = NLA_U32 }, [WGDEVICE_A_PEERS] = { .type = NLA_NESTED } }; static const struct nla_policy peer_policy[WGPEER_A_MAX + 1] = { [WGPEER_A_PUBLIC_KEY] = NLA_POLICY_EXACT_LEN(NOISE_PUBLIC_KEY_LEN), [WGPEER_A_PRESHARED_KEY] = NLA_POLICY_EXACT_LEN(NOISE_SYMMETRIC_KEY_LEN), [WGPEER_A_FLAGS] = { .type = NLA_U32 }, [WGPEER_A_ENDPOINT] = NLA_POLICY_MIN_LEN(sizeof(struct sockaddr)), [WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL] = { .type = NLA_U16 }, [WGPEER_A_LAST_HANDSHAKE_TIME] = NLA_POLICY_EXACT_LEN(sizeof(struct __kernel_timespec)), [WGPEER_A_RX_BYTES] = { .type = NLA_U64 }, [WGPEER_A_TX_BYTES] = { .type = NLA_U64 }, [WGPEER_A_ALLOWEDIPS] = { .type = NLA_NESTED }, [WGPEER_A_PROTOCOL_VERSION] = { .type = NLA_U32 } }; static const struct nla_policy allowedip_policy[WGALLOWEDIP_A_MAX + 1] = { [WGALLOWEDIP_A_FAMILY] = { .type = NLA_U16 }, [WGALLOWEDIP_A_IPADDR] = NLA_POLICY_MIN_LEN(sizeof(struct in_addr)), [WGALLOWEDIP_A_CIDR_MASK] = { .type = NLA_U8 } }; static struct wg_device *lookup_interface(struct nlattr **attrs, struct sk_buff *skb) { struct net_device *dev = NULL; if (!attrs[WGDEVICE_A_IFINDEX] == !attrs[WGDEVICE_A_IFNAME]) return ERR_PTR(-EBADR); if (attrs[WGDEVICE_A_IFINDEX]) dev = dev_get_by_index(sock_net(skb->sk), nla_get_u32(attrs[WGDEVICE_A_IFINDEX])); else if (attrs[WGDEVICE_A_IFNAME]) dev = dev_get_by_name(sock_net(skb->sk), nla_data(attrs[WGDEVICE_A_IFNAME])); if (!dev) return ERR_PTR(-ENODEV); if (!dev->rtnl_link_ops || !dev->rtnl_link_ops->kind || strcmp(dev->rtnl_link_ops->kind, KBUILD_MODNAME)) { dev_put(dev); return ERR_PTR(-EOPNOTSUPP); } return netdev_priv(dev); } static int get_allowedips(struct sk_buff *skb, const u8 *ip, u8 cidr, int family) { struct nlattr *allowedip_nest; allowedip_nest = nla_nest_start(skb, 0); if (!allowedip_nest) return -EMSGSIZE; if (nla_put_u8(skb, WGALLOWEDIP_A_CIDR_MASK, cidr) || nla_put_u16(skb, WGALLOWEDIP_A_FAMILY, family) || nla_put(skb, WGALLOWEDIP_A_IPADDR, family == AF_INET6 ? sizeof(struct in6_addr) : sizeof(struct in_addr), ip)) { nla_nest_cancel(skb, allowedip_nest); return -EMSGSIZE; } nla_nest_end(skb, allowedip_nest); return 0; } struct dump_ctx { struct wg_device *wg; struct wg_peer *next_peer; u64 allowedips_seq; struct allowedips_node *next_allowedip; }; #define DUMP_CTX(cb) ((struct dump_ctx *)(cb)->args) static int get_peer(struct wg_peer *peer, struct sk_buff *skb, struct dump_ctx *ctx) { struct nlattr *allowedips_nest, *peer_nest = nla_nest_start(skb, 0); struct allowedips_node *allowedips_node = ctx->next_allowedip; bool fail; if (!peer_nest) return -EMSGSIZE; down_read(&peer->handshake.lock); fail = nla_put(skb, WGPEER_A_PUBLIC_KEY, NOISE_PUBLIC_KEY_LEN, peer->handshake.remote_static); up_read(&peer->handshake.lock); if (fail) goto err; if (!allowedips_node) { const struct __kernel_timespec last_handshake = { .tv_sec = peer->walltime_last_handshake.tv_sec, .tv_nsec = peer->walltime_last_handshake.tv_nsec }; down_read(&peer->handshake.lock); fail = nla_put(skb, WGPEER_A_PRESHARED_KEY, NOISE_SYMMETRIC_KEY_LEN, peer->handshake.preshared_key); up_read(&peer->handshake.lock); if (fail) goto err; if (nla_put(skb, WGPEER_A_LAST_HANDSHAKE_TIME, sizeof(last_handshake), &last_handshake) || nla_put_u16(skb, WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL, peer->persistent_keepalive_interval) || nla_put_u64_64bit(skb, WGPEER_A_TX_BYTES, peer->tx_bytes, WGPEER_A_UNSPEC) || nla_put_u64_64bit(skb, WGPEER_A_RX_BYTES, peer->rx_bytes, WGPEER_A_UNSPEC) || nla_put_u32(skb, WGPEER_A_PROTOCOL_VERSION, 1)) goto err; read_lock_bh(&peer->endpoint_lock); if (peer->endpoint.addr.sa_family == AF_INET) fail = nla_put(skb, WGPEER_A_ENDPOINT, sizeof(peer->endpoint.addr4), &peer->endpoint.addr4); else if (peer->endpoint.addr.sa_family == AF_INET6) fail = nla_put(skb, WGPEER_A_ENDPOINT, sizeof(peer->endpoint.addr6), &peer->endpoint.addr6); read_unlock_bh(&peer->endpoint_lock); if (fail) goto err; allowedips_node = list_first_entry_or_null(&peer->allowedips_list, struct allowedips_node, peer_list); } if (!allowedips_node) goto no_allowedips; if (!ctx->allowedips_seq) ctx->allowedips_seq = ctx->wg->peer_allowedips.seq; else if (ctx->allowedips_seq != ctx->wg->peer_allowedips.seq) goto no_allowedips; allowedips_nest = nla_nest_start(skb, WGPEER_A_ALLOWEDIPS); if (!allowedips_nest) goto err; list_for_each_entry_from(allowedips_node, &peer->allowedips_list, peer_list) { u8 cidr, ip[16] __aligned(__alignof(u64)); int family; family = wg_allowedips_read_node(allowedips_node, ip, &cidr); if (get_allowedips(skb, ip, cidr, family)) { nla_nest_end(skb, allowedips_nest); nla_nest_end(skb, peer_nest); ctx->next_allowedip = allowedips_node; return -EMSGSIZE; } } nla_nest_end(skb, allowedips_nest); no_allowedips: nla_nest_end(skb, peer_nest); ctx->next_allowedip = NULL; ctx->allowedips_seq = 0; return 0; err: nla_nest_cancel(skb, peer_nest); return -EMSGSIZE; } static int wg_get_device_start(struct netlink_callback *cb) { struct wg_device *wg; wg = lookup_interface(genl_info_dump(cb)->attrs, cb->skb); if (IS_ERR(wg)) return PTR_ERR(wg); DUMP_CTX(cb)->wg = wg; return 0; } static int wg_get_device_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct wg_peer *peer, *next_peer_cursor; struct dump_ctx *ctx = DUMP_CTX(cb); struct wg_device *wg = ctx->wg; struct nlattr *peers_nest; int ret = -EMSGSIZE; bool done = true; void *hdr; rtnl_lock(); mutex_lock(&wg->device_update_lock); cb->seq = wg->device_update_gen; next_peer_cursor = ctx->next_peer; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &genl_family, NLM_F_MULTI, WG_CMD_GET_DEVICE); if (!hdr) goto out; genl_dump_check_consistent(cb, hdr); if (!ctx->next_peer) { if (nla_put_u16(skb, WGDEVICE_A_LISTEN_PORT, wg->incoming_port) || nla_put_u32(skb, WGDEVICE_A_FWMARK, wg->fwmark) || nla_put_u32(skb, WGDEVICE_A_IFINDEX, wg->dev->ifindex) || nla_put_string(skb, WGDEVICE_A_IFNAME, wg->dev->name)) goto out; down_read(&wg->static_identity.lock); if (wg->static_identity.has_identity) { if (nla_put(skb, WGDEVICE_A_PRIVATE_KEY, NOISE_PUBLIC_KEY_LEN, wg->static_identity.static_private) || nla_put(skb, WGDEVICE_A_PUBLIC_KEY, NOISE_PUBLIC_KEY_LEN, wg->static_identity.static_public)) { up_read(&wg->static_identity.lock); goto out; } } up_read(&wg->static_identity.lock); } peers_nest = nla_nest_start(skb, WGDEVICE_A_PEERS); if (!peers_nest) goto out; ret = 0; lockdep_assert_held(&wg->device_update_lock); /* If the last cursor was removed in peer_remove or peer_remove_all, then * we just treat this the same as there being no more peers left. The * reason is that seq_nr should indicate to userspace that this isn't a * coherent dump anyway, so they'll try again. */ if (list_empty(&wg->peer_list) || (ctx->next_peer && ctx->next_peer->is_dead)) { nla_nest_cancel(skb, peers_nest); goto out; } peer = list_prepare_entry(ctx->next_peer, &wg->peer_list, peer_list); list_for_each_entry_continue(peer, &wg->peer_list, peer_list) { if (get_peer(peer, skb, ctx)) { done = false; break; } next_peer_cursor = peer; } nla_nest_end(skb, peers_nest); out: if (!ret && !done && next_peer_cursor) wg_peer_get(next_peer_cursor); wg_peer_put(ctx->next_peer); mutex_unlock(&wg->device_update_lock); rtnl_unlock(); if (ret) { genlmsg_cancel(skb, hdr); return ret; } genlmsg_end(skb, hdr); if (done) { ctx->next_peer = NULL; return 0; } ctx->next_peer = next_peer_cursor; return skb->len; /* At this point, we can't really deal ourselves with safely zeroing out * the private key material after usage. This will need an additional API * in the kernel for marking skbs as zero_on_free. */ } static int wg_get_device_done(struct netlink_callback *cb) { struct dump_ctx *ctx = DUMP_CTX(cb); if (ctx->wg) dev_put(ctx->wg->dev); wg_peer_put(ctx->next_peer); return 0; } static int set_port(struct wg_device *wg, u16 port) { struct wg_peer *peer; if (wg->incoming_port == port) return 0; list_for_each_entry(peer, &wg->peer_list, peer_list) wg_socket_clear_peer_endpoint_src(peer); if (!netif_running(wg->dev)) { wg->incoming_port = port; return 0; } return wg_socket_init(wg, port); } static int set_allowedip(struct wg_peer *peer, struct nlattr **attrs) { int ret = -EINVAL; u16 family; u8 cidr; if (!attrs[WGALLOWEDIP_A_FAMILY] || !attrs[WGALLOWEDIP_A_IPADDR] || !attrs[WGALLOWEDIP_A_CIDR_MASK]) return ret; family = nla_get_u16(attrs[WGALLOWEDIP_A_FAMILY]); cidr = nla_get_u8(attrs[WGALLOWEDIP_A_CIDR_MASK]); if (family == AF_INET && cidr <= 32 && nla_len(attrs[WGALLOWEDIP_A_IPADDR]) == sizeof(struct in_addr)) ret = wg_allowedips_insert_v4( &peer->device->peer_allowedips, nla_data(attrs[WGALLOWEDIP_A_IPADDR]), cidr, peer, &peer->device->device_update_lock); else if (family == AF_INET6 && cidr <= 128 && nla_len(attrs[WGALLOWEDIP_A_IPADDR]) == sizeof(struct in6_addr)) ret = wg_allowedips_insert_v6( &peer->device->peer_allowedips, nla_data(attrs[WGALLOWEDIP_A_IPADDR]), cidr, peer, &peer->device->device_update_lock); return ret; } static int set_peer(struct wg_device *wg, struct nlattr **attrs) { u8 *public_key = NULL, *preshared_key = NULL; struct wg_peer *peer = NULL; u32 flags = 0; int ret; ret = -EINVAL; if (attrs[WGPEER_A_PUBLIC_KEY] && nla_len(attrs[WGPEER_A_PUBLIC_KEY]) == NOISE_PUBLIC_KEY_LEN) public_key = nla_data(attrs[WGPEER_A_PUBLIC_KEY]); else goto out; if (attrs[WGPEER_A_PRESHARED_KEY] && nla_len(attrs[WGPEER_A_PRESHARED_KEY]) == NOISE_SYMMETRIC_KEY_LEN) preshared_key = nla_data(attrs[WGPEER_A_PRESHARED_KEY]); if (attrs[WGPEER_A_FLAGS]) flags = nla_get_u32(attrs[WGPEER_A_FLAGS]); ret = -EOPNOTSUPP; if (flags & ~__WGPEER_F_ALL) goto out; ret = -EPFNOSUPPORT; if (attrs[WGPEER_A_PROTOCOL_VERSION]) { if (nla_get_u32(attrs[WGPEER_A_PROTOCOL_VERSION]) != 1) goto out; } peer = wg_pubkey_hashtable_lookup(wg->peer_hashtable, nla_data(attrs[WGPEER_A_PUBLIC_KEY])); ret = 0; if (!peer) { /* Peer doesn't exist yet. Add a new one. */ if (flags & (WGPEER_F_REMOVE_ME | WGPEER_F_UPDATE_ONLY)) goto out; /* The peer is new, so there aren't allowed IPs to remove. */ flags &= ~WGPEER_F_REPLACE_ALLOWEDIPS; down_read(&wg->static_identity.lock); if (wg->static_identity.has_identity && !memcmp(nla_data(attrs[WGPEER_A_PUBLIC_KEY]), wg->static_identity.static_public, NOISE_PUBLIC_KEY_LEN)) { /* We silently ignore peers that have the same public * key as the device. The reason we do it silently is * that we'd like for people to be able to reuse the * same set of API calls across peers. */ up_read(&wg->static_identity.lock); ret = 0; goto out; } up_read(&wg->static_identity.lock); peer = wg_peer_create(wg, public_key, preshared_key); if (IS_ERR(peer)) { ret = PTR_ERR(peer); peer = NULL; goto out; } /* Take additional reference, as though we've just been * looked up. */ wg_peer_get(peer); } if (flags & WGPEER_F_REMOVE_ME) { wg_peer_remove(peer); goto out; } if (preshared_key) { down_write(&peer->handshake.lock); memcpy(&peer->handshake.preshared_key, preshared_key, NOISE_SYMMETRIC_KEY_LEN); up_write(&peer->handshake.lock); } if (attrs[WGPEER_A_ENDPOINT]) { struct sockaddr *addr = nla_data(attrs[WGPEER_A_ENDPOINT]); size_t len = nla_len(attrs[WGPEER_A_ENDPOINT]); struct endpoint endpoint = { { { 0 } } }; if (len == sizeof(struct sockaddr_in) && addr->sa_family == AF_INET) { endpoint.addr4 = *(struct sockaddr_in *)addr; wg_socket_set_peer_endpoint(peer, &endpoint); } else if (len == sizeof(struct sockaddr_in6) && addr->sa_family == AF_INET6) { endpoint.addr6 = *(struct sockaddr_in6 *)addr; wg_socket_set_peer_endpoint(peer, &endpoint); } } if (flags & WGPEER_F_REPLACE_ALLOWEDIPS) wg_allowedips_remove_by_peer(&wg->peer_allowedips, peer, &wg->device_update_lock); if (attrs[WGPEER_A_ALLOWEDIPS]) { struct nlattr *attr, *allowedip[WGALLOWEDIP_A_MAX + 1]; int rem; nla_for_each_nested(attr, attrs[WGPEER_A_ALLOWEDIPS], rem) { ret = nla_parse_nested(allowedip, WGALLOWEDIP_A_MAX, attr, allowedip_policy, NULL); if (ret < 0) goto out; ret = set_allowedip(peer, allowedip); if (ret < 0) goto out; } } if (attrs[WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL]) { const u16 persistent_keepalive_interval = nla_get_u16( attrs[WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL]); const bool send_keepalive = !peer->persistent_keepalive_interval && persistent_keepalive_interval && netif_running(wg->dev); peer->persistent_keepalive_interval = persistent_keepalive_interval; if (send_keepalive) wg_packet_send_keepalive(peer); } if (netif_running(wg->dev)) wg_packet_send_staged_packets(peer); out: wg_peer_put(peer); if (attrs[WGPEER_A_PRESHARED_KEY]) memzero_explicit(nla_data(attrs[WGPEER_A_PRESHARED_KEY]), nla_len(attrs[WGPEER_A_PRESHARED_KEY])); return ret; } static int wg_set_device(struct sk_buff *skb, struct genl_info *info) { struct wg_device *wg = lookup_interface(info->attrs, skb); u32 flags = 0; int ret; if (IS_ERR(wg)) { ret = PTR_ERR(wg); goto out_nodev; } rtnl_lock(); mutex_lock(&wg->device_update_lock); if (info->attrs[WGDEVICE_A_FLAGS]) flags = nla_get_u32(info->attrs[WGDEVICE_A_FLAGS]); ret = -EOPNOTSUPP; if (flags & ~__WGDEVICE_F_ALL) goto out; if (info->attrs[WGDEVICE_A_LISTEN_PORT] || info->attrs[WGDEVICE_A_FWMARK]) { struct net *net; rcu_read_lock(); net = rcu_dereference(wg->creating_net); ret = !net || !ns_capable(net->user_ns, CAP_NET_ADMIN) ? -EPERM : 0; rcu_read_unlock(); if (ret) goto out; } ++wg->device_update_gen; if (info->attrs[WGDEVICE_A_FWMARK]) { struct wg_peer *peer; wg->fwmark = nla_get_u32(info->attrs[WGDEVICE_A_FWMARK]); list_for_each_entry(peer, &wg->peer_list, peer_list) wg_socket_clear_peer_endpoint_src(peer); } if (info->attrs[WGDEVICE_A_LISTEN_PORT]) { ret = set_port(wg, nla_get_u16(info->attrs[WGDEVICE_A_LISTEN_PORT])); if (ret) goto out; } if (flags & WGDEVICE_F_REPLACE_PEERS) wg_peer_remove_all(wg); if (info->attrs[WGDEVICE_A_PRIVATE_KEY] && nla_len(info->attrs[WGDEVICE_A_PRIVATE_KEY]) == NOISE_PUBLIC_KEY_LEN) { u8 *private_key = nla_data(info->attrs[WGDEVICE_A_PRIVATE_KEY]); u8 public_key[NOISE_PUBLIC_KEY_LEN]; struct wg_peer *peer, *temp; bool send_staged_packets; if (!crypto_memneq(wg->static_identity.static_private, private_key, NOISE_PUBLIC_KEY_LEN)) goto skip_set_private_key; /* We remove before setting, to prevent race, which means doing * two 25519-genpub ops. */ if (curve25519_generate_public(public_key, private_key)) { peer = wg_pubkey_hashtable_lookup(wg->peer_hashtable, public_key); if (peer) { wg_peer_put(peer); wg_peer_remove(peer); } } down_write(&wg->static_identity.lock); send_staged_packets = !wg->static_identity.has_identity && netif_running(wg->dev); wg_noise_set_static_identity_private_key(&wg->static_identity, private_key); send_staged_packets = send_staged_packets && wg->static_identity.has_identity; wg_cookie_checker_precompute_device_keys(&wg->cookie_checker); list_for_each_entry_safe(peer, temp, &wg->peer_list, peer_list) { wg_noise_precompute_static_static(peer); wg_noise_expire_current_peer_keypairs(peer); if (send_staged_packets) wg_packet_send_staged_packets(peer); } up_write(&wg->static_identity.lock); } skip_set_private_key: if (info->attrs[WGDEVICE_A_PEERS]) { struct nlattr *attr, *peer[WGPEER_A_MAX + 1]; int rem; nla_for_each_nested(attr, info->attrs[WGDEVICE_A_PEERS], rem) { ret = nla_parse_nested(peer, WGPEER_A_MAX, attr, peer_policy, NULL); if (ret < 0) goto out; ret = set_peer(wg, peer); if (ret < 0) goto out; } } ret = 0; out: mutex_unlock(&wg->device_update_lock); rtnl_unlock(); dev_put(wg->dev); out_nodev: if (info->attrs[WGDEVICE_A_PRIVATE_KEY]) memzero_explicit(nla_data(info->attrs[WGDEVICE_A_PRIVATE_KEY]), nla_len(info->attrs[WGDEVICE_A_PRIVATE_KEY])); return ret; } static const struct genl_ops genl_ops[] = { { .cmd = WG_CMD_GET_DEVICE, .start = wg_get_device_start, .dumpit = wg_get_device_dump, .done = wg_get_device_done, .flags = GENL_UNS_ADMIN_PERM }, { .cmd = WG_CMD_SET_DEVICE, .doit = wg_set_device, .flags = GENL_UNS_ADMIN_PERM } }; static struct genl_family genl_family __ro_after_init = { .ops = genl_ops, .n_ops = ARRAY_SIZE(genl_ops), .resv_start_op = WG_CMD_SET_DEVICE + 1, .name = WG_GENL_NAME, .version = WG_GENL_VERSION, .maxattr = WGDEVICE_A_MAX, .module = THIS_MODULE, .policy = device_policy, .netnsok = true }; int __init wg_genetlink_init(void) { return genl_register_family(&genl_family); } void __exit wg_genetlink_uninit(void) { genl_unregister_family(&genl_family); }
6 8 3 3 2 2 3 3 3 5 3 3 3 3 3 3 3 3 3 3 3 3 2 2 1 1 1 1 1 4 1 4 1 2 5 2 2 3 3 3 1 2 2 3 3 3 3 3 3 2 2 1 1 1 1 1 2 2 2 3 2 1 4 2 2 2 7 2 2 5 5 1 1 2 4 3 3 2 3 3 3 2 2 1 1 1 1 1 3 2 2 3 1 1 1 1 1 7 4 2 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include <trace/events/devlink.h> #include "devl_internal.h" struct devlink_stats { u64_stats_t rx_bytes; u64_stats_t rx_packets; struct u64_stats_sync syncp; }; /** * struct devlink_trap_policer_item - Packet trap policer attributes. * @policer: Immutable packet trap policer attributes. * @rate: Rate in packets / sec. * @burst: Burst size in packets. * @list: trap_policer_list member. * * Describes packet trap policer attributes. Created by devlink during trap * policer registration. */ struct devlink_trap_policer_item { const struct devlink_trap_policer *policer; u64 rate; u64 burst; struct list_head list; }; /** * struct devlink_trap_group_item - Packet trap group attributes. * @group: Immutable packet trap group attributes. * @policer_item: Associated policer item. Can be NULL. * @list: trap_group_list member. * @stats: Trap group statistics. * * Describes packet trap group attributes. Created by devlink during trap * group registration. */ struct devlink_trap_group_item { const struct devlink_trap_group *group; struct devlink_trap_policer_item *policer_item; struct list_head list; struct devlink_stats __percpu *stats; }; /** * struct devlink_trap_item - Packet trap attributes. * @trap: Immutable packet trap attributes. * @group_item: Associated group item. * @list: trap_list member. * @action: Trap action. * @stats: Trap statistics. * @priv: Driver private information. * * Describes both mutable and immutable packet trap attributes. Created by * devlink during trap registration and used for all trap related operations. */ struct devlink_trap_item { const struct devlink_trap *trap; struct devlink_trap_group_item *group_item; struct list_head list; enum devlink_trap_action action; struct devlink_stats __percpu *stats; void *priv; }; static struct devlink_trap_policer_item * devlink_trap_policer_item_lookup(struct devlink *devlink, u32 id) { struct devlink_trap_policer_item *policer_item; list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { if (policer_item->policer->id == id) return policer_item; } return NULL; } static struct devlink_trap_item * devlink_trap_item_lookup(struct devlink *devlink, const char *name) { struct devlink_trap_item *trap_item; list_for_each_entry(trap_item, &devlink->trap_list, list) { if (!strcmp(trap_item->trap->name, name)) return trap_item; } return NULL; } static struct devlink_trap_item * devlink_trap_item_get_from_info(struct devlink *devlink, struct genl_info *info) { struct nlattr *attr; if (!info->attrs[DEVLINK_ATTR_TRAP_NAME]) return NULL; attr = info->attrs[DEVLINK_ATTR_TRAP_NAME]; return devlink_trap_item_lookup(devlink, nla_data(attr)); } static int devlink_trap_action_get_from_info(struct genl_info *info, enum devlink_trap_action *p_trap_action) { u8 val; val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]); switch (val) { case DEVLINK_TRAP_ACTION_DROP: case DEVLINK_TRAP_ACTION_TRAP: case DEVLINK_TRAP_ACTION_MIRROR: *p_trap_action = val; break; default: return -EINVAL; } return 0; } static int devlink_trap_metadata_put(struct sk_buff *msg, const struct devlink_trap *trap) { struct nlattr *attr; attr = nla_nest_start(msg, DEVLINK_ATTR_TRAP_METADATA); if (!attr) return -EMSGSIZE; if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) && nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT)) goto nla_put_failure; if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE) && nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_FA_COOKIE)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats, struct devlink_stats *stats) { int i; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(i) { struct devlink_stats *cpu_stats; u64 rx_packets, rx_bytes; unsigned int start; cpu_stats = per_cpu_ptr(trap_stats, i); do { start = u64_stats_fetch_begin(&cpu_stats->syncp); rx_packets = u64_stats_read(&cpu_stats->rx_packets); rx_bytes = u64_stats_read(&cpu_stats->rx_bytes); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->rx_packets, rx_packets); u64_stats_add(&stats->rx_bytes, rx_bytes); } } static int devlink_trap_group_stats_put(struct sk_buff *msg, struct devlink_stats __percpu *trap_stats) { struct devlink_stats stats; struct nlattr *attr; devlink_trap_stats_read(trap_stats, &stats); attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); if (!attr) return -EMSGSIZE; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_PACKETS, u64_stats_read(&stats.rx_packets))) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_BYTES, u64_stats_read(&stats.rx_bytes))) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int devlink_trap_stats_put(struct sk_buff *msg, struct devlink *devlink, const struct devlink_trap_item *trap_item) { struct devlink_stats stats; struct nlattr *attr; u64 drops = 0; int err; if (devlink->ops->trap_drop_counter_get) { err = devlink->ops->trap_drop_counter_get(devlink, trap_item->trap, &drops); if (err) return err; } devlink_trap_stats_read(trap_item->stats, &stats); attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); if (!attr) return -EMSGSIZE; if (devlink->ops->trap_drop_counter_get && devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops)) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_PACKETS, u64_stats_read(&stats.rx_packets))) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_BYTES, u64_stats_read(&stats.rx_bytes))) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink, const struct devlink_trap_item *trap_item, enum devlink_command cmd, u32 portid, u32 seq, int flags) { struct devlink_trap_group_item *group_item = trap_item->group_item; void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME, group_item->group->name)) goto nla_put_failure; if (nla_put_string(msg, DEVLINK_ATTR_TRAP_NAME, trap_item->trap->name)) goto nla_put_failure; if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_TYPE, trap_item->trap->type)) goto nla_put_failure; if (trap_item->trap->generic && nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC)) goto nla_put_failure; if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_ACTION, trap_item->action)) goto nla_put_failure; err = devlink_trap_metadata_put(msg, trap_item->trap); if (err) goto nla_put_failure; err = devlink_trap_stats_put(msg, devlink, trap_item); if (err) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } int devlink_nl_trap_get_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_trap_item *trap_item; struct sk_buff *msg; int err; if (list_empty(&devlink->trap_list)) return -EOPNOTSUPP; trap_item = devlink_trap_item_get_from_info(devlink, info); if (!trap_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap"); return -ENOENT; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_trap_fill(msg, devlink, trap_item, DEVLINK_CMD_TRAP_NEW, info->snd_portid, info->snd_seq, 0); if (err) goto err_trap_fill; return genlmsg_reply(msg, info); err_trap_fill: nlmsg_free(msg); return err; } static int devlink_nl_trap_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_item *trap_item; int idx = 0; int err = 0; list_for_each_entry(trap_item, &devlink->trap_list, list) { if (idx < state->idx) { idx++; continue; } err = devlink_nl_trap_fill(msg, devlink, trap_item, DEVLINK_CMD_TRAP_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; break; } idx++; } return err; } int devlink_nl_trap_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_trap_get_dump_one); } static int __devlink_trap_action_set(struct devlink *devlink, struct devlink_trap_item *trap_item, enum devlink_trap_action trap_action, struct netlink_ext_ack *extack) { int err; if (trap_item->action != trap_action && trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) { NL_SET_ERR_MSG(extack, "Cannot change action of non-drop traps. Skipping"); return 0; } err = devlink->ops->trap_action_set(devlink, trap_item->trap, trap_action, extack); if (err) return err; trap_item->action = trap_action; return 0; } static int devlink_trap_action_set(struct devlink *devlink, struct devlink_trap_item *trap_item, struct genl_info *info) { enum devlink_trap_action trap_action; int err; if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION]) return 0; err = devlink_trap_action_get_from_info(info, &trap_action); if (err) { NL_SET_ERR_MSG(info->extack, "Invalid trap action"); return -EINVAL; } return __devlink_trap_action_set(devlink, trap_item, trap_action, info->extack); } int devlink_nl_trap_set_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_trap_item *trap_item; if (list_empty(&devlink->trap_list)) return -EOPNOTSUPP; trap_item = devlink_trap_item_get_from_info(devlink, info); if (!trap_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap"); return -ENOENT; } return devlink_trap_action_set(devlink, trap_item, info); } static struct devlink_trap_group_item * devlink_trap_group_item_lookup(struct devlink *devlink, const char *name) { struct devlink_trap_group_item *group_item; list_for_each_entry(group_item, &devlink->trap_group_list, list) { if (!strcmp(group_item->group->name, name)) return group_item; } return NULL; } static struct devlink_trap_group_item * devlink_trap_group_item_lookup_by_id(struct devlink *devlink, u16 id) { struct devlink_trap_group_item *group_item; list_for_each_entry(group_item, &devlink->trap_group_list, list) { if (group_item->group->id == id) return group_item; } return NULL; } static struct devlink_trap_group_item * devlink_trap_group_item_get_from_info(struct devlink *devlink, struct genl_info *info) { char *name; if (!info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]) return NULL; name = nla_data(info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]); return devlink_trap_group_item_lookup(devlink, name); } static int devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink, const struct devlink_trap_group_item *group_item, enum devlink_command cmd, u32 portid, u32 seq, int flags) { void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME, group_item->group->name)) goto nla_put_failure; if (group_item->group->generic && nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC)) goto nla_put_failure; if (group_item->policer_item && nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID, group_item->policer_item->policer->id)) goto nla_put_failure; err = devlink_trap_group_stats_put(msg, group_item->stats); if (err) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } int devlink_nl_trap_group_get_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_trap_group_item *group_item; struct sk_buff *msg; int err; if (list_empty(&devlink->trap_group_list)) return -EOPNOTSUPP; group_item = devlink_trap_group_item_get_from_info(devlink, info); if (!group_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap group"); return -ENOENT; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_trap_group_fill(msg, devlink, group_item, DEVLINK_CMD_TRAP_GROUP_NEW, info->snd_portid, info->snd_seq, 0); if (err) goto err_trap_group_fill; return genlmsg_reply(msg, info); err_trap_group_fill: nlmsg_free(msg); return err; } static int devlink_nl_trap_group_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_group_item *group_item; int idx = 0; int err = 0; list_for_each_entry(group_item, &devlink->trap_group_list, list) { if (idx < state->idx) { idx++; continue; } err = devlink_nl_trap_group_fill(msg, devlink, group_item, DEVLINK_CMD_TRAP_GROUP_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; break; } idx++; } return err; } int devlink_nl_trap_group_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_trap_group_get_dump_one); } static int __devlink_trap_group_action_set(struct devlink *devlink, struct devlink_trap_group_item *group_item, enum devlink_trap_action trap_action, struct netlink_ext_ack *extack) { const char *group_name = group_item->group->name; struct devlink_trap_item *trap_item; int err; if (devlink->ops->trap_group_action_set) { err = devlink->ops->trap_group_action_set(devlink, group_item->group, trap_action, extack); if (err) return err; list_for_each_entry(trap_item, &devlink->trap_list, list) { if (strcmp(trap_item->group_item->group->name, group_name)) continue; if (trap_item->action != trap_action && trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) continue; trap_item->action = trap_action; } return 0; } list_for_each_entry(trap_item, &devlink->trap_list, list) { if (strcmp(trap_item->group_item->group->name, group_name)) continue; err = __devlink_trap_action_set(devlink, trap_item, trap_action, extack); if (err) return err; } return 0; } static int devlink_trap_group_action_set(struct devlink *devlink, struct devlink_trap_group_item *group_item, struct genl_info *info, bool *p_modified) { enum devlink_trap_action trap_action; int err; if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION]) return 0; err = devlink_trap_action_get_from_info(info, &trap_action); if (err) { NL_SET_ERR_MSG(info->extack, "Invalid trap action"); return -EINVAL; } err = __devlink_trap_group_action_set(devlink, group_item, trap_action, info->extack); if (err) return err; *p_modified = true; return 0; } static int devlink_trap_group_set(struct devlink *devlink, struct devlink_trap_group_item *group_item, struct genl_info *info) { struct devlink_trap_policer_item *policer_item; struct netlink_ext_ack *extack = info->extack; const struct devlink_trap_policer *policer; struct nlattr **attrs = info->attrs; u32 policer_id; int err; if (!attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) return 0; if (!devlink->ops->trap_group_set) return -EOPNOTSUPP; policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); policer_item = devlink_trap_policer_item_lookup(devlink, policer_id); if (policer_id && !policer_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap policer"); return -ENOENT; } policer = policer_item ? policer_item->policer : NULL; err = devlink->ops->trap_group_set(devlink, group_item->group, policer, extack); if (err) return err; group_item->policer_item = policer_item; return 0; } int devlink_nl_trap_group_set_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_trap_group_item *group_item; bool modified = false; int err; if (list_empty(&devlink->trap_group_list)) return -EOPNOTSUPP; group_item = devlink_trap_group_item_get_from_info(devlink, info); if (!group_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap group"); return -ENOENT; } err = devlink_trap_group_action_set(devlink, group_item, info, &modified); if (err) return err; err = devlink_trap_group_set(devlink, group_item, info); if (err) goto err_trap_group_set; return 0; err_trap_group_set: if (modified) NL_SET_ERR_MSG(extack, "Trap group set failed, but some changes were committed already"); return err; } static struct devlink_trap_policer_item * devlink_trap_policer_item_get_from_info(struct devlink *devlink, struct genl_info *info) { u32 id; if (!info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) return NULL; id = nla_get_u32(info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); return devlink_trap_policer_item_lookup(devlink, id); } static int devlink_trap_policer_stats_put(struct sk_buff *msg, struct devlink *devlink, const struct devlink_trap_policer *policer) { struct nlattr *attr; u64 drops; int err; if (!devlink->ops->trap_policer_counter_get) return 0; err = devlink->ops->trap_policer_counter_get(devlink, policer, &drops); if (err) return err; attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); if (!attr) return -EMSGSIZE; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int devlink_nl_trap_policer_fill(struct sk_buff *msg, struct devlink *devlink, const struct devlink_trap_policer_item *policer_item, enum devlink_command cmd, u32 portid, u32 seq, int flags) { void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID, policer_item->policer->id)) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_TRAP_POLICER_RATE, policer_item->rate)) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_TRAP_POLICER_BURST, policer_item->burst)) goto nla_put_failure; err = devlink_trap_policer_stats_put(msg, devlink, policer_item->policer); if (err) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } int devlink_nl_trap_policer_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_trap_policer_item *policer_item; struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct sk_buff *msg; int err; if (list_empty(&devlink->trap_policer_list)) return -EOPNOTSUPP; policer_item = devlink_trap_policer_item_get_from_info(devlink, info); if (!policer_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap policer"); return -ENOENT; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_NEW, info->snd_portid, info->snd_seq, 0); if (err) goto err_trap_policer_fill; return genlmsg_reply(msg, info); err_trap_policer_fill: nlmsg_free(msg); return err; } static int devlink_nl_trap_policer_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_policer_item *policer_item; int idx = 0; int err = 0; list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { if (idx < state->idx) { idx++; continue; } err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; break; } idx++; } return err; } int devlink_nl_trap_policer_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_trap_policer_get_dump_one); } static int devlink_trap_policer_set(struct devlink *devlink, struct devlink_trap_policer_item *policer_item, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct nlattr **attrs = info->attrs; u64 rate, burst; int err; rate = policer_item->rate; burst = policer_item->burst; if (attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]) rate = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]); if (attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]) burst = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]); if (rate < policer_item->policer->min_rate) { NL_SET_ERR_MSG(extack, "Policer rate lower than limit"); return -EINVAL; } if (rate > policer_item->policer->max_rate) { NL_SET_ERR_MSG(extack, "Policer rate higher than limit"); return -EINVAL; } if (burst < policer_item->policer->min_burst) { NL_SET_ERR_MSG(extack, "Policer burst size lower than limit"); return -EINVAL; } if (burst > policer_item->policer->max_burst) { NL_SET_ERR_MSG(extack, "Policer burst size higher than limit"); return -EINVAL; } err = devlink->ops->trap_policer_set(devlink, policer_item->policer, rate, burst, info->extack); if (err) return err; policer_item->rate = rate; policer_item->burst = burst; return 0; } int devlink_nl_trap_policer_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_trap_policer_item *policer_item; struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; if (list_empty(&devlink->trap_policer_list)) return -EOPNOTSUPP; if (!devlink->ops->trap_policer_set) return -EOPNOTSUPP; policer_item = devlink_trap_policer_item_get_from_info(devlink, info); if (!policer_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap policer"); return -ENOENT; } return devlink_trap_policer_set(devlink, policer_item, info); } #define DEVLINK_TRAP(_id, _type) \ { \ .type = DEVLINK_TRAP_TYPE_##_type, \ .id = DEVLINK_TRAP_GENERIC_ID_##_id, \ .name = DEVLINK_TRAP_GENERIC_NAME_##_id, \ } static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(SMAC_MC, DROP), DEVLINK_TRAP(VLAN_TAG_MISMATCH, DROP), DEVLINK_TRAP(INGRESS_VLAN_FILTER, DROP), DEVLINK_TRAP(INGRESS_STP_FILTER, DROP), DEVLINK_TRAP(EMPTY_TX_LIST, DROP), DEVLINK_TRAP(PORT_LOOPBACK_FILTER, DROP), DEVLINK_TRAP(BLACKHOLE_ROUTE, DROP), DEVLINK_TRAP(TTL_ERROR, EXCEPTION), DEVLINK_TRAP(TAIL_DROP, DROP), DEVLINK_TRAP(NON_IP_PACKET, DROP), DEVLINK_TRAP(UC_DIP_MC_DMAC, DROP), DEVLINK_TRAP(DIP_LB, DROP), DEVLINK_TRAP(SIP_MC, DROP), DEVLINK_TRAP(SIP_LB, DROP), DEVLINK_TRAP(CORRUPTED_IP_HDR, DROP), DEVLINK_TRAP(IPV4_SIP_BC, DROP), DEVLINK_TRAP(IPV6_MC_DIP_RESERVED_SCOPE, DROP), DEVLINK_TRAP(IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, DROP), DEVLINK_TRAP(MTU_ERROR, EXCEPTION), DEVLINK_TRAP(UNRESOLVED_NEIGH, EXCEPTION), DEVLINK_TRAP(RPF, EXCEPTION), DEVLINK_TRAP(REJECT_ROUTE, EXCEPTION), DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION), DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION), DEVLINK_TRAP(NON_ROUTABLE, DROP), DEVLINK_TRAP(DECAP_ERROR, EXCEPTION), DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP), DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, DROP), DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP), DEVLINK_TRAP(STP, CONTROL), DEVLINK_TRAP(LACP, CONTROL), DEVLINK_TRAP(LLDP, CONTROL), DEVLINK_TRAP(IGMP_QUERY, CONTROL), DEVLINK_TRAP(IGMP_V1_REPORT, CONTROL), DEVLINK_TRAP(IGMP_V2_REPORT, CONTROL), DEVLINK_TRAP(IGMP_V3_REPORT, CONTROL), DEVLINK_TRAP(IGMP_V2_LEAVE, CONTROL), DEVLINK_TRAP(MLD_QUERY, CONTROL), DEVLINK_TRAP(MLD_V1_REPORT, CONTROL), DEVLINK_TRAP(MLD_V2_REPORT, CONTROL), DEVLINK_TRAP(MLD_V1_DONE, CONTROL), DEVLINK_TRAP(IPV4_DHCP, CONTROL), DEVLINK_TRAP(IPV6_DHCP, CONTROL), DEVLINK_TRAP(ARP_REQUEST, CONTROL), DEVLINK_TRAP(ARP_RESPONSE, CONTROL), DEVLINK_TRAP(ARP_OVERLAY, CONTROL), DEVLINK_TRAP(IPV6_NEIGH_SOLICIT, CONTROL), DEVLINK_TRAP(IPV6_NEIGH_ADVERT, CONTROL), DEVLINK_TRAP(IPV4_BFD, CONTROL), DEVLINK_TRAP(IPV6_BFD, CONTROL), DEVLINK_TRAP(IPV4_OSPF, CONTROL), DEVLINK_TRAP(IPV6_OSPF, CONTROL), DEVLINK_TRAP(IPV4_BGP, CONTROL), DEVLINK_TRAP(IPV6_BGP, CONTROL), DEVLINK_TRAP(IPV4_VRRP, CONTROL), DEVLINK_TRAP(IPV6_VRRP, CONTROL), DEVLINK_TRAP(IPV4_PIM, CONTROL), DEVLINK_TRAP(IPV6_PIM, CONTROL), DEVLINK_TRAP(UC_LB, CONTROL), DEVLINK_TRAP(LOCAL_ROUTE, CONTROL), DEVLINK_TRAP(EXTERNAL_ROUTE, CONTROL), DEVLINK_TRAP(IPV6_UC_DIP_LINK_LOCAL_SCOPE, CONTROL), DEVLINK_TRAP(IPV6_DIP_ALL_NODES, CONTROL), DEVLINK_TRAP(IPV6_DIP_ALL_ROUTERS, CONTROL), DEVLINK_TRAP(IPV6_ROUTER_SOLICIT, CONTROL), DEVLINK_TRAP(IPV6_ROUTER_ADVERT, CONTROL), DEVLINK_TRAP(IPV6_REDIRECT, CONTROL), DEVLINK_TRAP(IPV4_ROUTER_ALERT, CONTROL), DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL), DEVLINK_TRAP(PTP_EVENT, CONTROL), DEVLINK_TRAP(PTP_GENERAL, CONTROL), DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL), DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL), DEVLINK_TRAP(EARLY_DROP, DROP), DEVLINK_TRAP(VXLAN_PARSING, DROP), DEVLINK_TRAP(LLC_SNAP_PARSING, DROP), DEVLINK_TRAP(VLAN_PARSING, DROP), DEVLINK_TRAP(PPPOE_PPP_PARSING, DROP), DEVLINK_TRAP(MPLS_PARSING, DROP), DEVLINK_TRAP(ARP_PARSING, DROP), DEVLINK_TRAP(IP_1_PARSING, DROP), DEVLINK_TRAP(IP_N_PARSING, DROP), DEVLINK_TRAP(GRE_PARSING, DROP), DEVLINK_TRAP(UDP_PARSING, DROP), DEVLINK_TRAP(TCP_PARSING, DROP), DEVLINK_TRAP(IPSEC_PARSING, DROP), DEVLINK_TRAP(SCTP_PARSING, DROP), DEVLINK_TRAP(DCCP_PARSING, DROP), DEVLINK_TRAP(GTP_PARSING, DROP), DEVLINK_TRAP(ESP_PARSING, DROP), DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP), DEVLINK_TRAP(DMAC_FILTER, DROP), DEVLINK_TRAP(EAPOL, CONTROL), DEVLINK_TRAP(LOCKED_PORT, DROP), }; #define DEVLINK_TRAP_GROUP(_id) \ { \ .id = DEVLINK_TRAP_GROUP_GENERIC_ID_##_id, \ .name = DEVLINK_TRAP_GROUP_GENERIC_NAME_##_id, \ } static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(L2_DROPS), DEVLINK_TRAP_GROUP(L3_DROPS), DEVLINK_TRAP_GROUP(L3_EXCEPTIONS), DEVLINK_TRAP_GROUP(BUFFER_DROPS), DEVLINK_TRAP_GROUP(TUNNEL_DROPS), DEVLINK_TRAP_GROUP(ACL_DROPS), DEVLINK_TRAP_GROUP(STP), DEVLINK_TRAP_GROUP(LACP), DEVLINK_TRAP_GROUP(LLDP), DEVLINK_TRAP_GROUP(MC_SNOOPING), DEVLINK_TRAP_GROUP(DHCP), DEVLINK_TRAP_GROUP(NEIGH_DISCOVERY), DEVLINK_TRAP_GROUP(BFD), DEVLINK_TRAP_GROUP(OSPF), DEVLINK_TRAP_GROUP(BGP), DEVLINK_TRAP_GROUP(VRRP), DEVLINK_TRAP_GROUP(PIM), DEVLINK_TRAP_GROUP(UC_LB), DEVLINK_TRAP_GROUP(LOCAL_DELIVERY), DEVLINK_TRAP_GROUP(EXTERNAL_DELIVERY), DEVLINK_TRAP_GROUP(IPV6), DEVLINK_TRAP_GROUP(PTP_EVENT), DEVLINK_TRAP_GROUP(PTP_GENERAL), DEVLINK_TRAP_GROUP(ACL_SAMPLE), DEVLINK_TRAP_GROUP(ACL_TRAP), DEVLINK_TRAP_GROUP(PARSER_ERROR_DROPS), DEVLINK_TRAP_GROUP(EAPOL), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) { if (trap->id > DEVLINK_TRAP_GENERIC_ID_MAX) return -EINVAL; if (strcmp(trap->name, devlink_trap_generic[trap->id].name)) return -EINVAL; if (trap->type != devlink_trap_generic[trap->id].type) return -EINVAL; return 0; } static int devlink_trap_driver_verify(const struct devlink_trap *trap) { int i; if (trap->id <= DEVLINK_TRAP_GENERIC_ID_MAX) return -EINVAL; for (i = 0; i < ARRAY_SIZE(devlink_trap_generic); i++) { if (!strcmp(trap->name, devlink_trap_generic[i].name)) return -EEXIST; } return 0; } static int devlink_trap_verify(const struct devlink_trap *trap) { if (!trap || !trap->name) return -EINVAL; if (trap->generic) return devlink_trap_generic_verify(trap); else return devlink_trap_driver_verify(trap); } static int devlink_trap_group_generic_verify(const struct devlink_trap_group *group) { if (group->id > DEVLINK_TRAP_GROUP_GENERIC_ID_MAX) return -EINVAL; if (strcmp(group->name, devlink_trap_group_generic[group->id].name)) return -EINVAL; return 0; } static int devlink_trap_group_driver_verify(const struct devlink_trap_group *group) { int i; if (group->id <= DEVLINK_TRAP_GROUP_GENERIC_ID_MAX) return -EINVAL; for (i = 0; i < ARRAY_SIZE(devlink_trap_group_generic); i++) { if (!strcmp(group->name, devlink_trap_group_generic[i].name)) return -EEXIST; } return 0; } static int devlink_trap_group_verify(const struct devlink_trap_group *group) { if (group->generic) return devlink_trap_group_generic_verify(group); else return devlink_trap_group_driver_verify(group); } static void devlink_trap_group_notify(struct devlink *devlink, const struct devlink_trap_group_item *group_item, enum devlink_command cmd) { struct sk_buff *msg; int err; WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_GROUP_NEW && cmd != DEVLINK_CMD_TRAP_GROUP_DEL); if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_trap_group_fill(msg, devlink, group_item, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; } devlink_nl_notify_send(devlink, msg); } void devlink_trap_groups_notify_register(struct devlink *devlink) { struct devlink_trap_group_item *group_item; list_for_each_entry(group_item, &devlink->trap_group_list, list) devlink_trap_group_notify(devlink, group_item, DEVLINK_CMD_TRAP_GROUP_NEW); } void devlink_trap_groups_notify_unregister(struct devlink *devlink) { struct devlink_trap_group_item *group_item; list_for_each_entry_reverse(group_item, &devlink->trap_group_list, list) devlink_trap_group_notify(devlink, group_item, DEVLINK_CMD_TRAP_GROUP_DEL); } static int devlink_trap_item_group_link(struct devlink *devlink, struct devlink_trap_item *trap_item) { u16 group_id = trap_item->trap->init_group_id; struct devlink_trap_group_item *group_item; group_item = devlink_trap_group_item_lookup_by_id(devlink, group_id); if (WARN_ON_ONCE(!group_item)) return -EINVAL; trap_item->group_item = group_item; return 0; } static void devlink_trap_notify(struct devlink *devlink, const struct devlink_trap_item *trap_item, enum devlink_command cmd) { struct sk_buff *msg; int err; WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_NEW && cmd != DEVLINK_CMD_TRAP_DEL); if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_trap_fill(msg, devlink, trap_item, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; } devlink_nl_notify_send(devlink, msg); } void devlink_traps_notify_register(struct devlink *devlink) { struct devlink_trap_item *trap_item; list_for_each_entry(trap_item, &devlink->trap_list, list) devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW); } void devlink_traps_notify_unregister(struct devlink *devlink) { struct devlink_trap_item *trap_item; list_for_each_entry_reverse(trap_item, &devlink->trap_list, list) devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL); } static int devlink_trap_register(struct devlink *devlink, const struct devlink_trap *trap, void *priv) { struct devlink_trap_item *trap_item; int err; if (devlink_trap_item_lookup(devlink, trap->name)) return -EEXIST; trap_item = kzalloc(sizeof(*trap_item), GFP_KERNEL); if (!trap_item) return -ENOMEM; trap_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); if (!trap_item->stats) { err = -ENOMEM; goto err_stats_alloc; } trap_item->trap = trap; trap_item->action = trap->init_action; trap_item->priv = priv; err = devlink_trap_item_group_link(devlink, trap_item); if (err) goto err_group_link; err = devlink->ops->trap_init(devlink, trap, trap_item); if (err) goto err_trap_init; list_add_tail(&trap_item->list, &devlink->trap_list); devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW); return 0; err_trap_init: err_group_link: free_percpu(trap_item->stats); err_stats_alloc: kfree(trap_item); return err; } static void devlink_trap_unregister(struct devlink *devlink, const struct devlink_trap *trap) { struct devlink_trap_item *trap_item; trap_item = devlink_trap_item_lookup(devlink, trap->name); if (WARN_ON_ONCE(!trap_item)) return; devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL); list_del(&trap_item->list); if (devlink->ops->trap_fini) devlink->ops->trap_fini(devlink, trap, trap_item); free_percpu(trap_item->stats); kfree(trap_item); } static void devlink_trap_disable(struct devlink *devlink, const struct devlink_trap *trap) { struct devlink_trap_item *trap_item; trap_item = devlink_trap_item_lookup(devlink, trap->name); if (WARN_ON_ONCE(!trap_item)) return; devlink->ops->trap_action_set(devlink, trap, DEVLINK_TRAP_ACTION_DROP, NULL); trap_item->action = DEVLINK_TRAP_ACTION_DROP; } /** * devl_traps_register - Register packet traps with devlink. * @devlink: devlink. * @traps: Packet traps. * @traps_count: Count of provided packet traps. * @priv: Driver private information. * * Return: Non-zero value on failure. */ int devl_traps_register(struct devlink *devlink, const struct devlink_trap *traps, size_t traps_count, void *priv) { int i, err; if (!devlink->ops->trap_init || !devlink->ops->trap_action_set) return -EINVAL; devl_assert_locked(devlink); for (i = 0; i < traps_count; i++) { const struct devlink_trap *trap = &traps[i]; err = devlink_trap_verify(trap); if (err) goto err_trap_verify; err = devlink_trap_register(devlink, trap, priv); if (err) goto err_trap_register; } return 0; err_trap_register: err_trap_verify: for (i--; i >= 0; i--) devlink_trap_unregister(devlink, &traps[i]); return err; } EXPORT_SYMBOL_GPL(devl_traps_register); /** * devlink_traps_register - Register packet traps with devlink. * @devlink: devlink. * @traps: Packet traps. * @traps_count: Count of provided packet traps. * @priv: Driver private information. * * Context: Takes and release devlink->lock <mutex>. * * Return: Non-zero value on failure. */ int devlink_traps_register(struct devlink *devlink, const struct devlink_trap *traps, size_t traps_count, void *priv) { int err; devl_lock(devlink); err = devl_traps_register(devlink, traps, traps_count, priv); devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_traps_register); /** * devl_traps_unregister - Unregister packet traps from devlink. * @devlink: devlink. * @traps: Packet traps. * @traps_count: Count of provided packet traps. */ void devl_traps_unregister(struct devlink *devlink, const struct devlink_trap *traps, size_t traps_count) { int i; devl_assert_locked(devlink); /* Make sure we do not have any packets in-flight while unregistering * traps by disabling all of them and waiting for a grace period. */ for (i = traps_count - 1; i >= 0; i--) devlink_trap_disable(devlink, &traps[i]); synchronize_rcu(); for (i = traps_count - 1; i >= 0; i--) devlink_trap_unregister(devlink, &traps[i]); } EXPORT_SYMBOL_GPL(devl_traps_unregister); /** * devlink_traps_unregister - Unregister packet traps from devlink. * @devlink: devlink. * @traps: Packet traps. * @traps_count: Count of provided packet traps. * * Context: Takes and release devlink->lock <mutex>. */ void devlink_traps_unregister(struct devlink *devlink, const struct devlink_trap *traps, size_t traps_count) { devl_lock(devlink); devl_traps_unregister(devlink, traps, traps_count); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_traps_unregister); static void devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats, size_t skb_len) { struct devlink_stats *stats; stats = this_cpu_ptr(trap_stats); u64_stats_update_begin(&stats->syncp); u64_stats_add(&stats->rx_bytes, skb_len); u64_stats_inc(&stats->rx_packets); u64_stats_update_end(&stats->syncp); } static void devlink_trap_report_metadata_set(struct devlink_trap_metadata *metadata, const struct devlink_trap_item *trap_item, struct devlink_port *in_devlink_port, const struct flow_action_cookie *fa_cookie) { metadata->trap_name = trap_item->trap->name; metadata->trap_group_name = trap_item->group_item->group->name; metadata->fa_cookie = fa_cookie; metadata->trap_type = trap_item->trap->type; spin_lock(&in_devlink_port->type_lock); if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH) metadata->input_dev = in_devlink_port->type_eth.netdev; spin_unlock(&in_devlink_port->type_lock); } /** * devlink_trap_report - Report trapped packet to drop monitor. * @devlink: devlink. * @skb: Trapped packet. * @trap_ctx: Trap context. * @in_devlink_port: Input devlink port. * @fa_cookie: Flow action cookie. Could be NULL. */ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, void *trap_ctx, struct devlink_port *in_devlink_port, const struct flow_action_cookie *fa_cookie) { struct devlink_trap_item *trap_item = trap_ctx; devlink_trap_stats_update(trap_item->stats, skb->len); devlink_trap_stats_update(trap_item->group_item->stats, skb->len); if (tracepoint_enabled(devlink_trap_report)) { struct devlink_trap_metadata metadata = {}; devlink_trap_report_metadata_set(&metadata, trap_item, in_devlink_port, fa_cookie); trace_devlink_trap_report(devlink, skb, &metadata); } } EXPORT_SYMBOL_GPL(devlink_trap_report); /** * devlink_trap_ctx_priv - Trap context to driver private information. * @trap_ctx: Trap context. * * Return: Driver private information passed during registration. */ void *devlink_trap_ctx_priv(void *trap_ctx) { struct devlink_trap_item *trap_item = trap_ctx; return trap_item->priv; } EXPORT_SYMBOL_GPL(devlink_trap_ctx_priv); static int devlink_trap_group_item_policer_link(struct devlink *devlink, struct devlink_trap_group_item *group_item) { u32 policer_id = group_item->group->init_policer_id; struct devlink_trap_policer_item *policer_item; if (policer_id == 0) return 0; policer_item = devlink_trap_policer_item_lookup(devlink, policer_id); if (WARN_ON_ONCE(!policer_item)) return -EINVAL; group_item->policer_item = policer_item; return 0; } static int devlink_trap_group_register(struct devlink *devlink, const struct devlink_trap_group *group) { struct devlink_trap_group_item *group_item; int err; if (devlink_trap_group_item_lookup(devlink, group->name)) return -EEXIST; group_item = kzalloc(sizeof(*group_item), GFP_KERNEL); if (!group_item) return -ENOMEM; group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); if (!group_item->stats) { err = -ENOMEM; goto err_stats_alloc; } group_item->group = group; err = devlink_trap_group_item_policer_link(devlink, group_item); if (err) goto err_policer_link; if (devlink->ops->trap_group_init) { err = devlink->ops->trap_group_init(devlink, group); if (err) goto err_group_init; } list_add_tail(&group_item->list, &devlink->trap_group_list); devlink_trap_group_notify(devlink, group_item, DEVLINK_CMD_TRAP_GROUP_NEW); return 0; err_group_init: err_policer_link: free_percpu(group_item->stats); err_stats_alloc: kfree(group_item); return err; } static void devlink_trap_group_unregister(struct devlink *devlink, const struct devlink_trap_group *group) { struct devlink_trap_group_item *group_item; group_item = devlink_trap_group_item_lookup(devlink, group->name); if (WARN_ON_ONCE(!group_item)) return; devlink_trap_group_notify(devlink, group_item, DEVLINK_CMD_TRAP_GROUP_DEL); list_del(&group_item->list); free_percpu(group_item->stats); kfree(group_item); } /** * devl_trap_groups_register - Register packet trap groups with devlink. * @devlink: devlink. * @groups: Packet trap groups. * @groups_count: Count of provided packet trap groups. * * Return: Non-zero value on failure. */ int devl_trap_groups_register(struct devlink *devlink, const struct devlink_trap_group *groups, size_t groups_count) { int i, err; devl_assert_locked(devlink); for (i = 0; i < groups_count; i++) { const struct devlink_trap_group *group = &groups[i]; err = devlink_trap_group_verify(group); if (err) goto err_trap_group_verify; err = devlink_trap_group_register(devlink, group); if (err) goto err_trap_group_register; } return 0; err_trap_group_register: err_trap_group_verify: for (i--; i >= 0; i--) devlink_trap_group_unregister(devlink, &groups[i]); return err; } EXPORT_SYMBOL_GPL(devl_trap_groups_register); /** * devlink_trap_groups_register - Register packet trap groups with devlink. * @devlink: devlink. * @groups: Packet trap groups. * @groups_count: Count of provided packet trap groups. * * Context: Takes and release devlink->lock <mutex>. * * Return: Non-zero value on failure. */ int devlink_trap_groups_register(struct devlink *devlink, const struct devlink_trap_group *groups, size_t groups_count) { int err; devl_lock(devlink); err = devl_trap_groups_register(devlink, groups, groups_count); devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_trap_groups_register); /** * devl_trap_groups_unregister - Unregister packet trap groups from devlink. * @devlink: devlink. * @groups: Packet trap groups. * @groups_count: Count of provided packet trap groups. */ void devl_trap_groups_unregister(struct devlink *devlink, const struct devlink_trap_group *groups, size_t groups_count) { int i; devl_assert_locked(devlink); for (i = groups_count - 1; i >= 0; i--) devlink_trap_group_unregister(devlink, &groups[i]); } EXPORT_SYMBOL_GPL(devl_trap_groups_unregister); /** * devlink_trap_groups_unregister - Unregister packet trap groups from devlink. * @devlink: devlink. * @groups: Packet trap groups. * @groups_count: Count of provided packet trap groups. * * Context: Takes and release devlink->lock <mutex>. */ void devlink_trap_groups_unregister(struct devlink *devlink, const struct devlink_trap_group *groups, size_t groups_count) { devl_lock(devlink); devl_trap_groups_unregister(devlink, groups, groups_count); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_trap_groups_unregister); static void devlink_trap_policer_notify(struct devlink *devlink, const struct devlink_trap_policer_item *policer_item, enum devlink_command cmd) { struct sk_buff *msg; int err; WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_POLICER_NEW && cmd != DEVLINK_CMD_TRAP_POLICER_DEL); if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; } devlink_nl_notify_send(devlink, msg); } void devlink_trap_policers_notify_register(struct devlink *devlink) { struct devlink_trap_policer_item *policer_item; list_for_each_entry(policer_item, &devlink->trap_policer_list, list) devlink_trap_policer_notify(devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_NEW); } void devlink_trap_policers_notify_unregister(struct devlink *devlink) { struct devlink_trap_policer_item *policer_item; list_for_each_entry_reverse(policer_item, &devlink->trap_policer_list, list) devlink_trap_policer_notify(devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_DEL); } static int devlink_trap_policer_register(struct devlink *devlink, const struct devlink_trap_policer *policer) { struct devlink_trap_policer_item *policer_item; int err; if (devlink_trap_policer_item_lookup(devlink, policer->id)) return -EEXIST; policer_item = kzalloc(sizeof(*policer_item), GFP_KERNEL); if (!policer_item) return -ENOMEM; policer_item->policer = policer; policer_item->rate = policer->init_rate; policer_item->burst = policer->init_burst; if (devlink->ops->trap_policer_init) { err = devlink->ops->trap_policer_init(devlink, policer); if (err) goto err_policer_init; } list_add_tail(&policer_item->list, &devlink->trap_policer_list); devlink_trap_policer_notify(devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_NEW); return 0; err_policer_init: kfree(policer_item); return err; } static void devlink_trap_policer_unregister(struct devlink *devlink, const struct devlink_trap_policer *policer) { struct devlink_trap_policer_item *policer_item; policer_item = devlink_trap_policer_item_lookup(devlink, policer->id); if (WARN_ON_ONCE(!policer_item)) return; devlink_trap_policer_notify(devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_DEL); list_del(&policer_item->list); if (devlink->ops->trap_policer_fini) devlink->ops->trap_policer_fini(devlink, policer); kfree(policer_item); } /** * devl_trap_policers_register - Register packet trap policers with devlink. * @devlink: devlink. * @policers: Packet trap policers. * @policers_count: Count of provided packet trap policers. * * Return: Non-zero value on failure. */ int devl_trap_policers_register(struct devlink *devlink, const struct devlink_trap_policer *policers, size_t policers_count) { int i, err; devl_assert_locked(devlink); for (i = 0; i < policers_count; i++) { const struct devlink_trap_policer *policer = &policers[i]; if (WARN_ON(policer->id == 0 || policer->max_rate < policer->min_rate || policer->max_burst < policer->min_burst)) { err = -EINVAL; goto err_trap_policer_verify; } err = devlink_trap_policer_register(devlink, policer); if (err) goto err_trap_policer_register; } return 0; err_trap_policer_register: err_trap_policer_verify: for (i--; i >= 0; i--) devlink_trap_policer_unregister(devlink, &policers[i]); return err; } EXPORT_SYMBOL_GPL(devl_trap_policers_register); /** * devl_trap_policers_unregister - Unregister packet trap policers from devlink. * @devlink: devlink. * @policers: Packet trap policers. * @policers_count: Count of provided packet trap policers. */ void devl_trap_policers_unregister(struct devlink *devlink, const struct devlink_trap_policer *policers, size_t policers_count) { int i; devl_assert_locked(devlink); for (i = policers_count - 1; i >= 0; i--) devlink_trap_policer_unregister(devlink, &policers[i]); } EXPORT_SYMBOL_GPL(devl_trap_policers_unregister);
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 // SPDX-License-Identifier: GPL-2.0 /* * linux/net/sunrpc/xprtsock.c * * Client-side transport implementation for sockets. * * TCP callback races fixes (C) 1998 Red Hat * TCP send fixes (C) 1998 Red Hat * TCP NFS related read + write fixes * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie> * * Rewrite of larges part of the code in order to stabilize TCP stuff. * Fix behaviour when socket buffer is full. * (C) 1999 Trond Myklebust <trond.myklebust@fys.uio.no> * * IP socket transport implementation, (C) 2005 Chuck Lever <cel@netapp.com> * * IPv6 support contributed by Gilles Quillard, Bull Open Source, 2005. * <gilles.quillard@bull.net> */ #include <linux/types.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/capability.h> #include <linux/pagemap.h> #include <linux/errno.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/net.h> #include <linux/mm.h> #include <linux/un.h> #include <linux/udp.h> #include <linux/tcp.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/sched.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/xprtsock.h> #include <linux/file.h> #ifdef CONFIG_SUNRPC_BACKCHANNEL #include <linux/sunrpc/bc_xprt.h> #endif #include <net/sock.h> #include <net/checksum.h> #include <net/udp.h> #include <net/tcp.h> #include <net/tls_prot.h> #include <net/handshake.h> #include <linux/bvec.h> #include <linux/highmem.h> #include <linux/uio.h> #include <linux/sched/mm.h> #include <trace/events/sock.h> #include <trace/events/sunrpc.h> #include "socklib.h" #include "sunrpc.h" static void xs_close(struct rpc_xprt *xprt); static void xs_reset_srcport(struct sock_xprt *transport); static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock); static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct socket *sock); /* * xprtsock tunables */ static unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE; static unsigned int xprt_tcp_slot_table_entries = RPC_MIN_SLOT_TABLE; static unsigned int xprt_max_tcp_slot_table_entries = RPC_MAX_SLOT_TABLE; static unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; static unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; #define XS_TCP_LINGER_TO (15U * HZ) static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO; /* * We can register our own files under /proc/sys/sunrpc by * calling register_sysctl() again. The files in that * directory become the union of all files registered there. * * We simply need to make sure that we don't collide with * someone else's file names! */ static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE; static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; static unsigned int max_tcp_slot_table_limit = RPC_MAX_SLOT_TABLE_LIMIT; static unsigned int xprt_min_resvport_limit = RPC_MIN_RESVPORT; static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT; static struct ctl_table_header *sunrpc_table_header; static struct xprt_class xs_local_transport; static struct xprt_class xs_udp_transport; static struct xprt_class xs_tcp_transport; static struct xprt_class xs_tcp_tls_transport; static struct xprt_class xs_bc_tcp_transport; /* * FIXME: changing the UDP slot table size should also resize the UDP * socket buffers for existing UDP transports */ static struct ctl_table xs_tunables_table[] = { { .procname = "udp_slot_table_entries", .data = &xprt_udp_slot_table_entries, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_slot_table_size, .extra2 = &max_slot_table_size }, { .procname = "tcp_slot_table_entries", .data = &xprt_tcp_slot_table_entries, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_slot_table_size, .extra2 = &max_slot_table_size }, { .procname = "tcp_max_slot_table_entries", .data = &xprt_max_tcp_slot_table_entries, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_slot_table_size, .extra2 = &max_tcp_slot_table_limit }, { .procname = "min_resvport", .data = &xprt_min_resvport, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &xprt_min_resvport_limit, .extra2 = &xprt_max_resvport_limit }, { .procname = "max_resvport", .data = &xprt_max_resvport, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &xprt_min_resvport_limit, .extra2 = &xprt_max_resvport_limit }, { .procname = "tcp_fin_timeout", .data = &xs_tcp_fin_timeout, .maxlen = sizeof(xs_tcp_fin_timeout), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, }; /* * Wait duration for a reply from the RPC portmapper. */ #define XS_BIND_TO (60U * HZ) /* * Delay if a UDP socket connect error occurs. This is most likely some * kind of resource problem on the local host. */ #define XS_UDP_REEST_TO (2U * HZ) /* * The reestablish timeout allows clients to delay for a bit before attempting * to reconnect to a server that just dropped our connection. * * We implement an exponential backoff when trying to reestablish a TCP * transport connection with the server. Some servers like to drop a TCP * connection when they are overworked, so we start with a short timeout and * increase over time if the server is down or not responding. */ #define XS_TCP_INIT_REEST_TO (3U * HZ) /* * TCP idle timeout; client drops the transport socket if it is idle * for this long. Note that we also timeout UDP sockets to prevent * holding port numbers when there is no RPC traffic. */ #define XS_IDLE_DISC_TO (5U * 60 * HZ) /* * TLS handshake timeout. */ #define XS_TLS_HANDSHAKE_TO (10U * HZ) #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # undef RPC_DEBUG_DATA # define RPCDBG_FACILITY RPCDBG_TRANS #endif #ifdef RPC_DEBUG_DATA static void xs_pktdump(char *msg, u32 *packet, unsigned int count) { u8 *buf = (u8 *) packet; int j; dprintk("RPC: %s\n", msg); for (j = 0; j < count && j < 128; j += 4) { if (!(j & 31)) { if (j) dprintk("\n"); dprintk("0x%04x ", j); } dprintk("%02x%02x%02x%02x ", buf[j], buf[j+1], buf[j+2], buf[j+3]); } dprintk("\n"); } #else static inline void xs_pktdump(char *msg, u32 *packet, unsigned int count) { /* NOP */ } #endif static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) { return (struct rpc_xprt *) sk->sk_user_data; } static inline struct sockaddr *xs_addr(struct rpc_xprt *xprt) { return (struct sockaddr *) &xprt->addr; } static inline struct sockaddr_un *xs_addr_un(struct rpc_xprt *xprt) { return (struct sockaddr_un *) &xprt->addr; } static inline struct sockaddr_in *xs_addr_in(struct rpc_xprt *xprt) { return (struct sockaddr_in *) &xprt->addr; } static inline struct sockaddr_in6 *xs_addr_in6(struct rpc_xprt *xprt) { return (struct sockaddr_in6 *) &xprt->addr; } static void xs_format_common_peer_addresses(struct rpc_xprt *xprt) { struct sockaddr *sap = xs_addr(xprt); struct sockaddr_in6 *sin6; struct sockaddr_in *sin; struct sockaddr_un *sun; char buf[128]; switch (sap->sa_family) { case AF_LOCAL: sun = xs_addr_un(xprt); if (sun->sun_path[0]) { strscpy(buf, sun->sun_path, sizeof(buf)); } else { buf[0] = '@'; strscpy(buf+1, sun->sun_path+1, sizeof(buf)-1); } xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); break; case AF_INET: (void)rpc_ntop(sap, buf, sizeof(buf)); xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); sin = xs_addr_in(xprt); snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); break; case AF_INET6: (void)rpc_ntop(sap, buf, sizeof(buf)); xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); sin6 = xs_addr_in6(xprt); snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr); break; default: BUG(); } xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); } static void xs_format_common_peer_ports(struct rpc_xprt *xprt) { struct sockaddr *sap = xs_addr(xprt); char buf[128]; snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap)); xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap)); xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); } static void xs_format_peer_addresses(struct rpc_xprt *xprt, const char *protocol, const char *netid) { xprt->address_strings[RPC_DISPLAY_PROTO] = protocol; xprt->address_strings[RPC_DISPLAY_NETID] = netid; xs_format_common_peer_addresses(xprt); xs_format_common_peer_ports(xprt); } static void xs_update_peer_port(struct rpc_xprt *xprt) { kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]); kfree(xprt->address_strings[RPC_DISPLAY_PORT]); xs_format_common_peer_ports(xprt); } static void xs_free_peer_addresses(struct rpc_xprt *xprt) { unsigned int i; for (i = 0; i < RPC_DISPLAY_MAX; i++) switch (i) { case RPC_DISPLAY_PROTO: case RPC_DISPLAY_NETID: continue; default: kfree(xprt->address_strings[i]); } } static size_t xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp) { size_t i,n; if (!want || !(buf->flags & XDRBUF_SPARSE_PAGES)) return want; n = (buf->page_base + want + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < n; i++) { if (buf->pages[i]) continue; buf->bvec[i].bv_page = buf->pages[i] = alloc_page(gfp); if (!buf->pages[i]) { i *= PAGE_SIZE; return i > buf->page_base ? i - buf->page_base : 0; } } return want; } static int xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg, struct cmsghdr *cmsg, int ret) { u8 content_type = tls_get_record_type(sock->sk, cmsg); u8 level, description; switch (content_type) { case 0: break; case TLS_RECORD_TYPE_DATA: /* TLS sets EOR at the end of each application data * record, even though there might be more frames * waiting to be decrypted. */ msg->msg_flags &= ~MSG_EOR; break; case TLS_RECORD_TYPE_ALERT: tls_alert_recv(sock->sk, msg, &level, &description); ret = (level == TLS_ALERT_LEVEL_FATAL) ? -EACCES : -EAGAIN; break; default: /* discard this record type */ ret = -EAGAIN; } return ret; } static int xs_sock_recv_cmsg(struct socket *sock, struct msghdr *msg, int flags) { union { struct cmsghdr cmsg; u8 buf[CMSG_SPACE(sizeof(u8))]; } u; int ret; msg->msg_control = &u; msg->msg_controllen = sizeof(u); ret = sock_recvmsg(sock, msg, flags); if (msg->msg_controllen != sizeof(u)) ret = xs_sock_process_cmsg(sock, msg, &u.cmsg, ret); return ret; } static ssize_t xs_sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags, size_t seek) { ssize_t ret; if (seek != 0) iov_iter_advance(&msg->msg_iter, seek); ret = xs_sock_recv_cmsg(sock, msg, flags); return ret > 0 ? ret + seek : ret; } static ssize_t xs_read_kvec(struct socket *sock, struct msghdr *msg, int flags, struct kvec *kvec, size_t count, size_t seek) { iov_iter_kvec(&msg->msg_iter, ITER_DEST, kvec, 1, count); return xs_sock_recvmsg(sock, msg, flags, seek); } static ssize_t xs_read_bvec(struct socket *sock, struct msghdr *msg, int flags, struct bio_vec *bvec, unsigned long nr, size_t count, size_t seek) { iov_iter_bvec(&msg->msg_iter, ITER_DEST, bvec, nr, count); return xs_sock_recvmsg(sock, msg, flags, seek); } static ssize_t xs_read_discard(struct socket *sock, struct msghdr *msg, int flags, size_t count) { iov_iter_discard(&msg->msg_iter, ITER_DEST, count); return xs_sock_recv_cmsg(sock, msg, flags); } #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE static void xs_flush_bvec(const struct bio_vec *bvec, size_t count, size_t seek) { struct bvec_iter bi = { .bi_size = count, }; struct bio_vec bv; bvec_iter_advance(bvec, &bi, seek & PAGE_MASK); for_each_bvec(bv, bvec, bi, bi) flush_dcache_page(bv.bv_page); } #else static inline void xs_flush_bvec(const struct bio_vec *bvec, size_t count, size_t seek) { } #endif static ssize_t xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, struct xdr_buf *buf, size_t count, size_t seek, size_t *read) { size_t want, seek_init = seek, offset = 0; ssize_t ret; want = min_t(size_t, count, buf->head[0].iov_len); if (seek < want) { ret = xs_read_kvec(sock, msg, flags, &buf->head[0], want, seek); if (ret <= 0) goto sock_err; offset += ret; if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; if (ret != want) goto out; seek = 0; } else { seek -= want; offset += want; } want = xs_alloc_sparse_pages( buf, min_t(size_t, count - offset, buf->page_len), GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); if (seek < want) { ret = xs_read_bvec(sock, msg, flags, buf->bvec, xdr_buf_pagecount(buf), want + buf->page_base, seek + buf->page_base); if (ret <= 0) goto sock_err; xs_flush_bvec(buf->bvec, ret, seek + buf->page_base); ret -= buf->page_base; offset += ret; if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; if (ret != want) goto out; seek = 0; } else { seek -= want; offset += want; } want = min_t(size_t, count - offset, buf->tail[0].iov_len); if (seek < want) { ret = xs_read_kvec(sock, msg, flags, &buf->tail[0], want, seek); if (ret <= 0) goto sock_err; offset += ret; if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; if (ret != want) goto out; } else if (offset < seek_init) offset = seek_init; ret = -EMSGSIZE; out: *read = offset - seek_init; return ret; sock_err: offset += seek; goto out; } static void xs_read_header(struct sock_xprt *transport, struct xdr_buf *buf) { if (!transport->recv.copied) { if (buf->head[0].iov_len >= transport->recv.offset) memcpy(buf->head[0].iov_base, &transport->recv.xid, transport->recv.offset); transport->recv.copied = transport->recv.offset; } } static bool xs_read_stream_request_done(struct sock_xprt *transport) { return transport->recv.fraghdr & cpu_to_be32(RPC_LAST_STREAM_FRAGMENT); } static void xs_read_stream_check_eor(struct sock_xprt *transport, struct msghdr *msg) { if (xs_read_stream_request_done(transport)) msg->msg_flags |= MSG_EOR; } static ssize_t xs_read_stream_request(struct sock_xprt *transport, struct msghdr *msg, int flags, struct rpc_rqst *req) { struct xdr_buf *buf = &req->rq_private_buf; size_t want, read; ssize_t ret; xs_read_header(transport, buf); want = transport->recv.len - transport->recv.offset; if (want != 0) { ret = xs_read_xdr_buf(transport->sock, msg, flags, buf, transport->recv.copied + want, transport->recv.copied, &read); transport->recv.offset += read; transport->recv.copied += read; } if (transport->recv.offset == transport->recv.len) xs_read_stream_check_eor(transport, msg); if (want == 0) return 0; switch (ret) { default: break; case -EFAULT: case -EMSGSIZE: msg->msg_flags |= MSG_TRUNC; return read; case 0: return -ESHUTDOWN; } return ret < 0 ? ret : read; } static size_t xs_read_stream_headersize(bool isfrag) { if (isfrag) return sizeof(__be32); return 3 * sizeof(__be32); } static ssize_t xs_read_stream_header(struct sock_xprt *transport, struct msghdr *msg, int flags, size_t want, size_t seek) { struct kvec kvec = { .iov_base = &transport->recv.fraghdr, .iov_len = want, }; return xs_read_kvec(transport->sock, msg, flags, &kvec, want, seek); } #if defined(CONFIG_SUNRPC_BACKCHANNEL) static ssize_t xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) { struct rpc_xprt *xprt = &transport->xprt; struct rpc_rqst *req; ssize_t ret; /* Is this transport associated with the backchannel? */ if (!xprt->bc_serv) return -ESHUTDOWN; /* Look up and lock the request corresponding to the given XID */ req = xprt_lookup_bc_request(xprt, transport->recv.xid); if (!req) { printk(KERN_WARNING "Callback slot table overflowed\n"); return -ESHUTDOWN; } if (transport->recv.copied && !req->rq_private_buf.len) return -ESHUTDOWN; ret = xs_read_stream_request(transport, msg, flags, req); if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) xprt_complete_bc_request(req, transport->recv.copied); else req->rq_private_buf.len = transport->recv.copied; return ret; } #else /* CONFIG_SUNRPC_BACKCHANNEL */ static ssize_t xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) { return -ESHUTDOWN; } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ static ssize_t xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags) { struct rpc_xprt *xprt = &transport->xprt; struct rpc_rqst *req; ssize_t ret = 0; /* Look up and lock the request corresponding to the given XID */ spin_lock(&xprt->queue_lock); req = xprt_lookup_rqst(xprt, transport->recv.xid); if (!req || (transport->recv.copied && !req->rq_private_buf.len)) { msg->msg_flags |= MSG_TRUNC; goto out; } xprt_pin_rqst(req); spin_unlock(&xprt->queue_lock); ret = xs_read_stream_request(transport, msg, flags, req); spin_lock(&xprt->queue_lock); if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) xprt_complete_rqst(req->rq_task, transport->recv.copied); else req->rq_private_buf.len = transport->recv.copied; xprt_unpin_rqst(req); out: spin_unlock(&xprt->queue_lock); return ret; } static ssize_t xs_read_stream(struct sock_xprt *transport, int flags) { struct msghdr msg = { 0 }; size_t want, read = 0; ssize_t ret = 0; if (transport->recv.len == 0) { want = xs_read_stream_headersize(transport->recv.copied != 0); ret = xs_read_stream_header(transport, &msg, flags, want, transport->recv.offset); if (ret <= 0) goto out_err; transport->recv.offset = ret; if (transport->recv.offset != want) return transport->recv.offset; transport->recv.len = be32_to_cpu(transport->recv.fraghdr) & RPC_FRAGMENT_SIZE_MASK; transport->recv.offset -= sizeof(transport->recv.fraghdr); read = ret; } switch (be32_to_cpu(transport->recv.calldir)) { default: msg.msg_flags |= MSG_TRUNC; break; case RPC_CALL: ret = xs_read_stream_call(transport, &msg, flags); break; case RPC_REPLY: ret = xs_read_stream_reply(transport, &msg, flags); } if (msg.msg_flags & MSG_TRUNC) { transport->recv.calldir = cpu_to_be32(-1); transport->recv.copied = -1; } if (ret < 0) goto out_err; read += ret; if (transport->recv.offset < transport->recv.len) { if (!(msg.msg_flags & MSG_TRUNC)) return read; msg.msg_flags = 0; ret = xs_read_discard(transport->sock, &msg, flags, transport->recv.len - transport->recv.offset); if (ret <= 0) goto out_err; transport->recv.offset += ret; read += ret; if (transport->recv.offset != transport->recv.len) return read; } if (xs_read_stream_request_done(transport)) { trace_xs_stream_read_request(transport); transport->recv.copied = 0; } transport->recv.offset = 0; transport->recv.len = 0; return read; out_err: return ret != 0 ? ret : -ESHUTDOWN; } static __poll_t xs_poll_socket(struct sock_xprt *transport) { return transport->sock->ops->poll(transport->file, transport->sock, NULL); } static bool xs_poll_socket_readable(struct sock_xprt *transport) { __poll_t events = xs_poll_socket(transport); return (events & (EPOLLIN | EPOLLRDNORM)) && !(events & EPOLLRDHUP); } static void xs_poll_check_readable(struct sock_xprt *transport) { clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); if (test_bit(XPRT_SOCK_IGNORE_RECV, &transport->sock_state)) return; if (!xs_poll_socket_readable(transport)) return; if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) queue_work(xprtiod_workqueue, &transport->recv_worker); } static void xs_stream_data_receive(struct sock_xprt *transport) { size_t read = 0; ssize_t ret = 0; mutex_lock(&transport->recv_mutex); if (transport->sock == NULL) goto out; for (;;) { ret = xs_read_stream(transport, MSG_DONTWAIT); if (ret < 0) break; read += ret; cond_resched(); } if (ret == -ESHUTDOWN) kernel_sock_shutdown(transport->sock, SHUT_RDWR); else if (ret == -EACCES) xprt_wake_pending_tasks(&transport->xprt, -EACCES); else xs_poll_check_readable(transport); out: mutex_unlock(&transport->recv_mutex); trace_xs_stream_read_data(&transport->xprt, ret, read); } static void xs_stream_data_receive_workfn(struct work_struct *work) { struct sock_xprt *transport = container_of(work, struct sock_xprt, recv_worker); unsigned int pflags = memalloc_nofs_save(); xs_stream_data_receive(transport); memalloc_nofs_restore(pflags); } static void xs_stream_reset_connect(struct sock_xprt *transport) { transport->recv.offset = 0; transport->recv.len = 0; transport->recv.copied = 0; transport->xmit.offset = 0; } static void xs_stream_start_connect(struct sock_xprt *transport) { transport->xprt.stat.connect_count++; transport->xprt.stat.connect_start = jiffies; } #define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) /** * xs_nospace - handle transmit was incomplete * @req: pointer to RPC request * @transport: pointer to struct sock_xprt * */ static int xs_nospace(struct rpc_rqst *req, struct sock_xprt *transport) { struct rpc_xprt *xprt = &transport->xprt; struct sock *sk = transport->inet; int ret = -EAGAIN; trace_rpc_socket_nospace(req, transport); /* Protect against races with write_space */ spin_lock(&xprt->transport_lock); /* Don't race with disconnect */ if (xprt_connected(xprt)) { /* wait for more buffer space */ set_bit(XPRT_SOCK_NOSPACE, &transport->sock_state); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; xprt_wait_for_buffer_space(xprt); } else ret = -ENOTCONN; spin_unlock(&xprt->transport_lock); return ret; } static int xs_sock_nospace(struct rpc_rqst *req) { struct sock_xprt *transport = container_of(req->rq_xprt, struct sock_xprt, xprt); struct sock *sk = transport->inet; int ret = -EAGAIN; lock_sock(sk); if (!sock_writeable(sk)) ret = xs_nospace(req, transport); release_sock(sk); return ret; } static int xs_stream_nospace(struct rpc_rqst *req, bool vm_wait) { struct sock_xprt *transport = container_of(req->rq_xprt, struct sock_xprt, xprt); struct sock *sk = transport->inet; int ret = -EAGAIN; if (vm_wait) return -ENOBUFS; lock_sock(sk); if (!sk_stream_memory_free(sk)) ret = xs_nospace(req, transport); release_sock(sk); return ret; } static int xs_stream_prepare_request(struct rpc_rqst *req, struct xdr_buf *buf) { return xdr_alloc_bvec(buf, rpc_task_gfp_mask()); } static void xs_stream_abort_send_request(struct rpc_rqst *req) { struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); if (transport->xmit.offset != 0 && !test_bit(XPRT_CLOSE_WAIT, &xprt->state)) xprt_force_disconnect(xprt); } /* * Determine if the previous message in the stream was aborted before it * could complete transmission. */ static bool xs_send_request_was_aborted(struct sock_xprt *transport, struct rpc_rqst *req) { return transport->xmit.offset != 0 && req->rq_bytes_sent == 0; } /* * Return the stream record marker field for a record of length < 2^31-1 */ static rpc_fraghdr xs_stream_record_marker(struct xdr_buf *xdr) { if (!xdr->len) return 0; return cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | (u32)xdr->len); } /** * xs_local_send_request - write an RPC request to an AF_LOCAL socket * @req: pointer to RPC request * * Return values: * 0: The request has been sent * EAGAIN: The socket was blocked, please call again later to * complete the request * ENOTCONN: Caller needs to invoke connect logic then call again * other: Some other error occurred, the request was not sent */ static int xs_local_send_request(struct rpc_rqst *req) { struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; rpc_fraghdr rm = xs_stream_record_marker(xdr); unsigned int msglen = rm ? req->rq_slen + sizeof(rm) : req->rq_slen; struct msghdr msg = { .msg_flags = XS_SENDMSG_FLAGS, }; bool vm_wait; unsigned int sent; int status; /* Close the stream if the previous transmission was incomplete */ if (xs_send_request_was_aborted(transport, req)) { xprt_force_disconnect(xprt); return -ENOTCONN; } xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); vm_wait = sk_stream_is_writeable(transport->inet) ? true : false; req->rq_xtime = ktime_get(); status = xprt_sock_sendmsg(transport->sock, &msg, xdr, transport->xmit.offset, rm, &sent); dprintk("RPC: %s(%u) = %d\n", __func__, xdr->len - transport->xmit.offset, status); if (likely(sent > 0) || status == 0) { transport->xmit.offset += sent; req->rq_bytes_sent = transport->xmit.offset; if (likely(req->rq_bytes_sent >= msglen)) { req->rq_xmit_bytes_sent += transport->xmit.offset; transport->xmit.offset = 0; return 0; } status = -EAGAIN; vm_wait = false; } switch (status) { case -EAGAIN: status = xs_stream_nospace(req, vm_wait); break; default: dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); fallthrough; case -EPIPE: xprt_force_disconnect(xprt); status = -ENOTCONN; } return status; } /** * xs_udp_send_request - write an RPC request to a UDP socket * @req: pointer to RPC request * * Return values: * 0: The request has been sent * EAGAIN: The socket was blocked, please call again later to * complete the request * ENOTCONN: Caller needs to invoke connect logic then call again * other: Some other error occurred, the request was not sent */ static int xs_udp_send_request(struct rpc_rqst *req) { struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; struct msghdr msg = { .msg_name = xs_addr(xprt), .msg_namelen = xprt->addrlen, .msg_flags = XS_SENDMSG_FLAGS, }; unsigned int sent; int status; xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); if (!xprt_bound(xprt)) return -ENOTCONN; if (!xprt_request_get_cong(xprt, req)) return -EBADSLT; status = xdr_alloc_bvec(xdr, rpc_task_gfp_mask()); if (status < 0) return status; req->rq_xtime = ktime_get(); status = xprt_sock_sendmsg(transport->sock, &msg, xdr, 0, 0, &sent); dprintk("RPC: xs_udp_send_request(%u) = %d\n", xdr->len, status); /* firewall is blocking us, don't return -EAGAIN or we end up looping */ if (status == -EPERM) goto process_status; if (status == -EAGAIN && sock_writeable(transport->inet)) status = -ENOBUFS; if (sent > 0 || status == 0) { req->rq_xmit_bytes_sent += sent; if (sent >= req->rq_slen) return 0; /* Still some bytes left; set up for a retry later. */ status = -EAGAIN; } process_status: switch (status) { case -ENOTSOCK: status = -ENOTCONN; /* Should we call xs_close() here? */ break; case -EAGAIN: status = xs_sock_nospace(req); break; case -ENETUNREACH: case -ENOBUFS: case -EPIPE: case -ECONNREFUSED: case -EPERM: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. */ break; default: dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); } return status; } /** * xs_tcp_send_request - write an RPC request to a TCP socket * @req: pointer to RPC request * * Return values: * 0: The request has been sent * EAGAIN: The socket was blocked, please call again later to * complete the request * ENOTCONN: Caller needs to invoke connect logic then call again * other: Some other error occurred, the request was not sent * * XXX: In the case of soft timeouts, should we eventually give up * if sendmsg is not able to make progress? */ static int xs_tcp_send_request(struct rpc_rqst *req) { struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; rpc_fraghdr rm = xs_stream_record_marker(xdr); unsigned int msglen = rm ? req->rq_slen + sizeof(rm) : req->rq_slen; struct msghdr msg = { .msg_flags = XS_SENDMSG_FLAGS, }; bool vm_wait; unsigned int sent; int status; /* Close the stream if the previous transmission was incomplete */ if (xs_send_request_was_aborted(transport, req)) { if (transport->sock != NULL) kernel_sock_shutdown(transport->sock, SHUT_RDWR); return -ENOTCONN; } if (!transport->inet) return -ENOTCONN; xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state)) xs_tcp_set_socket_timeouts(xprt, transport->sock); xs_set_srcport(transport, transport->sock); /* Continue transmitting the packet/record. We must be careful * to cope with writespace callbacks arriving _after_ we have * called sendmsg(). */ req->rq_xtime = ktime_get(); tcp_sock_set_cork(transport->inet, true); vm_wait = sk_stream_is_writeable(transport->inet) ? true : false; do { status = xprt_sock_sendmsg(transport->sock, &msg, xdr, transport->xmit.offset, rm, &sent); dprintk("RPC: xs_tcp_send_request(%u) = %d\n", xdr->len - transport->xmit.offset, status); /* If we've sent the entire packet, immediately * reset the count of bytes sent. */ transport->xmit.offset += sent; req->rq_bytes_sent = transport->xmit.offset; if (likely(req->rq_bytes_sent >= msglen)) { req->rq_xmit_bytes_sent += transport->xmit.offset; transport->xmit.offset = 0; if (atomic_long_read(&xprt->xmit_queuelen) == 1) tcp_sock_set_cork(transport->inet, false); return 0; } WARN_ON_ONCE(sent == 0 && status == 0); if (sent > 0) vm_wait = false; } while (status == 0); switch (status) { case -ENOTSOCK: status = -ENOTCONN; /* Should we call xs_close() here? */ break; case -EAGAIN: status = xs_stream_nospace(req, vm_wait); break; case -ECONNRESET: case -ECONNREFUSED: case -ENOTCONN: case -EADDRINUSE: case -ENOBUFS: case -EPIPE: break; default: dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); } return status; } static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk) { transport->old_data_ready = sk->sk_data_ready; transport->old_state_change = sk->sk_state_change; transport->old_write_space = sk->sk_write_space; transport->old_error_report = sk->sk_error_report; } static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *sk) { sk->sk_data_ready = transport->old_data_ready; sk->sk_state_change = transport->old_state_change; sk->sk_write_space = transport->old_write_space; sk->sk_error_report = transport->old_error_report; } static void xs_sock_reset_state_flags(struct rpc_xprt *xprt) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); transport->xprt_err = 0; clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state); clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state); clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state); clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state); clear_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state); } static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr) { set_bit(nr, &transport->sock_state); queue_work(xprtiod_workqueue, &transport->error_worker); } static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) { xprt->connect_cookie++; smp_mb__before_atomic(); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); xs_sock_reset_state_flags(xprt); smp_mb__after_atomic(); } /** * xs_error_report - callback to handle TCP socket state errors * @sk: socket * * Note: we don't call sock_error() since there may be a rpc_task * using the socket, and so we don't want to clear sk->sk_err. */ static void xs_error_report(struct sock *sk) { struct sock_xprt *transport; struct rpc_xprt *xprt; if (!(xprt = xprt_from_sock(sk))) return; transport = container_of(xprt, struct sock_xprt, xprt); transport->xprt_err = -sk->sk_err; if (transport->xprt_err == 0) return; dprintk("RPC: xs_error_report client %p, error=%d...\n", xprt, -transport->xprt_err); trace_rpc_socket_error(xprt, sk->sk_socket, transport->xprt_err); /* barrier ensures xprt_err is set before XPRT_SOCK_WAKE_ERROR */ smp_mb__before_atomic(); xs_run_error_worker(transport, XPRT_SOCK_WAKE_ERROR); } static void xs_reset_transport(struct sock_xprt *transport) { struct socket *sock = transport->sock; struct sock *sk = transport->inet; struct rpc_xprt *xprt = &transport->xprt; struct file *filp = transport->file; if (sk == NULL) return; /* * Make sure we're calling this in a context from which it is safe * to call __fput_sync(). In practice that means rpciod and the * system workqueue. */ if (!(current->flags & PF_WQ_WORKER)) { WARN_ON_ONCE(1); set_bit(XPRT_CLOSE_WAIT, &xprt->state); return; } if (atomic_read(&transport->xprt.swapper)) sk_clear_memalloc(sk); tls_handshake_cancel(sk); kernel_sock_shutdown(sock, SHUT_RDWR); mutex_lock(&transport->recv_mutex); lock_sock(sk); transport->inet = NULL; transport->sock = NULL; transport->file = NULL; sk->sk_user_data = NULL; sk->sk_sndtimeo = 0; xs_restore_old_callbacks(transport, sk); xprt_clear_connected(xprt); xs_sock_reset_connection_flags(xprt); /* Reset stream record info */ xs_stream_reset_connect(transport); release_sock(sk); mutex_unlock(&transport->recv_mutex); trace_rpc_socket_close(xprt, sock); __fput_sync(filp); xprt_disconnect_done(xprt); } /** * xs_close - close a socket * @xprt: transport * * This is used when all requests are complete; ie, no DRC state remains * on the server we want to save. * * The caller _must_ be holding XPRT_LOCKED in order to avoid issues with * xs_reset_transport() zeroing the socket from underneath a writer. */ static void xs_close(struct rpc_xprt *xprt) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); dprintk("RPC: xs_close xprt %p\n", xprt); if (transport->sock) tls_handshake_close(transport->sock); xs_reset_transport(transport); xprt->reestablish_timeout = 0; } static void xs_inject_disconnect(struct rpc_xprt *xprt) { dprintk("RPC: injecting transport disconnect on xprt=%p\n", xprt); xprt_disconnect_done(xprt); } static void xs_xprt_free(struct rpc_xprt *xprt) { xs_free_peer_addresses(xprt); xprt_free(xprt); } /** * xs_destroy - prepare to shutdown a transport * @xprt: doomed transport * */ static void xs_destroy(struct rpc_xprt *xprt) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); dprintk("RPC: xs_destroy xprt %p\n", xprt); cancel_delayed_work_sync(&transport->connect_worker); xs_close(xprt); cancel_work_sync(&transport->recv_worker); cancel_work_sync(&transport->error_worker); xs_xprt_free(xprt); module_put(THIS_MODULE); } /** * xs_udp_data_read_skb - receive callback for UDP sockets * @xprt: transport * @sk: socket * @skb: skbuff * */ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, struct sock *sk, struct sk_buff *skb) { struct rpc_task *task; struct rpc_rqst *rovr; int repsize, copied; u32 _xid; __be32 *xp; repsize = skb->len; if (repsize < 4) { dprintk("RPC: impossible RPC reply size %d!\n", repsize); return; } /* Copy the XID from the skb... */ xp = skb_header_pointer(skb, 0, sizeof(_xid), &_xid); if (xp == NULL) return; /* Look up and lock the request corresponding to the given XID */ spin_lock(&xprt->queue_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; xprt_pin_rqst(rovr); xprt_update_rtt(rovr->rq_task); spin_unlock(&xprt->queue_lock); task = rovr->rq_task; if ((copied = rovr->rq_private_buf.buflen) > repsize) copied = repsize; /* Suck it into the iovec, verify checksum if not done by hw. */ if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) { spin_lock(&xprt->queue_lock); __UDPX_INC_STATS(sk, UDP_MIB_INERRORS); goto out_unpin; } spin_lock(&xprt->transport_lock); xprt_adjust_cwnd(xprt, task, copied); spin_unlock(&xprt->transport_lock); spin_lock(&xprt->queue_lock); xprt_complete_rqst(task, copied); __UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS); out_unpin: xprt_unpin_rqst(rovr); out_unlock: spin_unlock(&xprt->queue_lock); } static void xs_udp_data_receive(struct sock_xprt *transport) { struct sk_buff *skb; struct sock *sk; int err; mutex_lock(&transport->recv_mutex); sk = transport->inet; if (sk == NULL) goto out; for (;;) { skb = skb_recv_udp(sk, MSG_DONTWAIT, &err); if (skb == NULL) break; xs_udp_data_read_skb(&transport->xprt, sk, skb); consume_skb(skb); cond_resched(); } xs_poll_check_readable(transport); out: mutex_unlock(&transport->recv_mutex); } static void xs_udp_data_receive_workfn(struct work_struct *work) { struct sock_xprt *transport = container_of(work, struct sock_xprt, recv_worker); unsigned int pflags = memalloc_nofs_save(); xs_udp_data_receive(transport); memalloc_nofs_restore(pflags); } /** * xs_data_ready - "data ready" callback for sockets * @sk: socket with data to read * */ static void xs_data_ready(struct sock *sk) { struct rpc_xprt *xprt; trace_sk_data_ready(sk); xprt = xprt_from_sock(sk); if (xprt != NULL) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); trace_xs_data_ready(xprt); transport->old_data_ready(sk); if (test_bit(XPRT_SOCK_IGNORE_RECV, &transport->sock_state)) return; /* Any data means we had a useful conversation, so * then we don't need to delay the next reconnect */ if (xprt->reestablish_timeout) xprt->reestablish_timeout = 0; if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) queue_work(xprtiod_workqueue, &transport->recv_worker); } } /* * Helper function to force a TCP close if the server is sending * junk and/or it has put us in CLOSE_WAIT */ static void xs_tcp_force_close(struct rpc_xprt *xprt) { xprt_force_disconnect(xprt); } #if defined(CONFIG_SUNRPC_BACKCHANNEL) static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt) { return PAGE_SIZE; } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ /** * xs_local_state_change - callback to handle AF_LOCAL socket state changes * @sk: socket whose state has changed * */ static void xs_local_state_change(struct sock *sk) { struct rpc_xprt *xprt; struct sock_xprt *transport; if (!(xprt = xprt_from_sock(sk))) return; transport = container_of(xprt, struct sock_xprt, xprt); if (sk->sk_shutdown & SHUTDOWN_MASK) { clear_bit(XPRT_CONNECTED, &xprt->state); /* Trigger the socket release */ xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT); } } /** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed * */ static void xs_tcp_state_change(struct sock *sk) { struct rpc_xprt *xprt; struct sock_xprt *transport; if (!(xprt = xprt_from_sock(sk))) return; dprintk("RPC: xs_tcp_state_change client %p...\n", xprt); dprintk("RPC: state %x conn %d dead %d zapped %d sk_shutdown %d\n", sk->sk_state, xprt_connected(xprt), sock_flag(sk, SOCK_DEAD), sock_flag(sk, SOCK_ZAPPED), sk->sk_shutdown); transport = container_of(xprt, struct sock_xprt, xprt); trace_rpc_socket_state_change(xprt, sk->sk_socket); switch (sk->sk_state) { case TCP_ESTABLISHED: if (!xprt_test_and_set_connected(xprt)) { xprt->connect_cookie++; clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); xprt_clear_connecting(xprt); xprt->stat.connect_count++; xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; xs_run_error_worker(transport, XPRT_SOCK_WAKE_PENDING); } break; case TCP_FIN_WAIT1: /* The client initiated a shutdown of the socket */ xprt->connect_cookie++; xprt->reestablish_timeout = 0; set_bit(XPRT_CLOSING, &xprt->state); smp_mb__before_atomic(); clear_bit(XPRT_CONNECTED, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); smp_mb__after_atomic(); break; case TCP_CLOSE_WAIT: /* The server initiated a shutdown of the socket */ xprt->connect_cookie++; clear_bit(XPRT_CONNECTED, &xprt->state); xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT); fallthrough; case TCP_CLOSING: /* * If the server closed down the connection, make sure that * we back off before reconnecting */ if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; break; case TCP_LAST_ACK: set_bit(XPRT_CLOSING, &xprt->state); smp_mb__before_atomic(); clear_bit(XPRT_CONNECTED, &xprt->state); smp_mb__after_atomic(); break; case TCP_CLOSE: if (test_and_clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state)) { xs_reset_srcport(transport); xprt_clear_connecting(xprt); } clear_bit(XPRT_CLOSING, &xprt->state); /* Trigger the socket release */ xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT); } } static void xs_write_space(struct sock *sk) { struct sock_xprt *transport; struct rpc_xprt *xprt; if (!sk->sk_socket) return; clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); if (unlikely(!(xprt = xprt_from_sock(sk)))) return; transport = container_of(xprt, struct sock_xprt, xprt); if (!test_and_clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state)) return; xs_run_error_worker(transport, XPRT_SOCK_WAKE_WRITE); sk->sk_write_pending--; } /** * xs_udp_write_space - callback invoked when socket buffer space * becomes available * @sk: socket whose state has changed * * Called when more output buffer space is available for this socket. * We try not to wake our writers until they can make "significant" * progress, otherwise we'll waste resources thrashing kernel_sendmsg * with a bunch of small requests. */ static void xs_udp_write_space(struct sock *sk) { /* from net/core/sock.c:sock_def_write_space */ if (sock_writeable(sk)) xs_write_space(sk); } /** * xs_tcp_write_space - callback invoked when socket buffer space * becomes available * @sk: socket whose state has changed * * Called when more output buffer space is available for this socket. * We try not to wake our writers until they can make "significant" * progress, otherwise we'll waste resources thrashing kernel_sendmsg * with a bunch of small requests. */ static void xs_tcp_write_space(struct sock *sk) { /* from net/core/stream.c:sk_stream_write_space */ if (sk_stream_is_writeable(sk)) xs_write_space(sk); } static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct sock *sk = transport->inet; if (transport->rcvsize) { sk->sk_userlocks |= SOCK_RCVBUF_LOCK; sk->sk_rcvbuf = transport->rcvsize * xprt->max_reqs * 2; } if (transport->sndsize) { sk->sk_userlocks |= SOCK_SNDBUF_LOCK; sk->sk_sndbuf = transport->sndsize * xprt->max_reqs * 2; sk->sk_write_space(sk); } } /** * xs_udp_set_buffer_size - set send and receive limits * @xprt: generic transport * @sndsize: requested size of send buffer, in bytes * @rcvsize: requested size of receive buffer, in bytes * * Set socket send and receive buffer size limits. */ static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); transport->sndsize = 0; if (sndsize) transport->sndsize = sndsize + 1024; transport->rcvsize = 0; if (rcvsize) transport->rcvsize = rcvsize + 1024; xs_udp_do_set_buffer_size(xprt); } /** * xs_udp_timer - called when a retransmit timeout occurs on a UDP transport * @xprt: controlling transport * @task: task that timed out * * Adjust the congestion window after a retransmit timeout has occurred. */ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task) { spin_lock(&xprt->transport_lock); xprt_adjust_cwnd(xprt, task, -ETIMEDOUT); spin_unlock(&xprt->transport_lock); } static int xs_get_random_port(void) { unsigned short min = xprt_min_resvport, max = xprt_max_resvport; unsigned short range; unsigned short rand; if (max < min) return -EADDRINUSE; range = max - min + 1; rand = get_random_u32_below(range); return rand + min; } static unsigned short xs_sock_getport(struct socket *sock) { struct sockaddr_storage buf; unsigned short port = 0; if (kernel_getsockname(sock, (struct sockaddr *)&buf) < 0) goto out; switch (buf.ss_family) { case AF_INET6: port = ntohs(((struct sockaddr_in6 *)&buf)->sin6_port); break; case AF_INET: port = ntohs(((struct sockaddr_in *)&buf)->sin_port); } out: return port; } /** * xs_set_port - reset the port number in the remote endpoint address * @xprt: generic transport * @port: new port number * */ static void xs_set_port(struct rpc_xprt *xprt, unsigned short port) { dprintk("RPC: setting port for xprt %p to %u\n", xprt, port); rpc_set_port(xs_addr(xprt), port); xs_update_peer_port(xprt); } static void xs_reset_srcport(struct sock_xprt *transport) { transport->srcport = 0; } static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock) { if (transport->srcport == 0 && transport->xprt.reuseport) transport->srcport = xs_sock_getport(sock); } static int xs_get_srcport(struct sock_xprt *transport) { int port = transport->srcport; if (port == 0 && transport->xprt.resvport) port = xs_get_random_port(); return port; } static unsigned short xs_sock_srcport(struct rpc_xprt *xprt) { struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt); unsigned short ret = 0; mutex_lock(&sock->recv_mutex); if (sock->sock) ret = xs_sock_getport(sock->sock); mutex_unlock(&sock->recv_mutex); return ret; } static int xs_sock_srcaddr(struct rpc_xprt *xprt, char *buf, size_t buflen) { struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt); union { struct sockaddr sa; struct sockaddr_storage st; } saddr; int ret = -ENOTCONN; mutex_lock(&sock->recv_mutex); if (sock->sock) { ret = kernel_getsockname(sock->sock, &saddr.sa); if (ret >= 0) ret = snprintf(buf, buflen, "%pISc", &saddr.sa); } mutex_unlock(&sock->recv_mutex); return ret; } static unsigned short xs_next_srcport(struct sock_xprt *transport, unsigned short port) { if (transport->srcport != 0) transport->srcport = 0; if (!transport->xprt.resvport) return 0; if (port <= xprt_min_resvport || port > xprt_max_resvport) return xprt_max_resvport; return --port; } static int xs_bind(struct sock_xprt *transport, struct socket *sock) { struct sockaddr_storage myaddr; int err, nloop = 0; int port = xs_get_srcport(transport); unsigned short last; /* * If we are asking for any ephemeral port (i.e. port == 0 && * transport->xprt.resvport == 0), don't bind. Let the local * port selection happen implicitly when the socket is used * (for example at connect time). * * This ensures that we can continue to establish TCP * connections even when all local ephemeral ports are already * a part of some TCP connection. This makes no difference * for UDP sockets, but also doesn't harm them. * * If we're asking for any reserved port (i.e. port == 0 && * transport->xprt.resvport == 1) xs_get_srcport above will * ensure that port is non-zero and we will bind as needed. */ if (port <= 0) return port; memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen); do { rpc_set_port((struct sockaddr *)&myaddr, port); err = kernel_bind(sock, (struct sockaddr *)&myaddr, transport->xprt.addrlen); if (err == 0) { if (transport->xprt.reuseport) transport->srcport = port; break; } last = port; port = xs_next_srcport(transport, port); if (port > last) nloop++; } while (err == -EADDRINUSE && nloop != 2); if (myaddr.ss_family == AF_INET) dprintk("RPC: %s %pI4:%u: %s (%d)\n", __func__, &((struct sockaddr_in *)&myaddr)->sin_addr, port, err ? "failed" : "ok", err); else dprintk("RPC: %s %pI6:%u: %s (%d)\n", __func__, &((struct sockaddr_in6 *)&myaddr)->sin6_addr, port, err ? "failed" : "ok", err); return err; } /* * We don't support autobind on AF_LOCAL sockets */ static void xs_local_rpcbind(struct rpc_task *task) { xprt_set_bound(task->tk_xprt); } static void xs_local_set_port(struct rpc_xprt *xprt, unsigned short port) { } #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key xs_key[3]; static struct lock_class_key xs_slock_key[3]; static inline void xs_reclassify_socketu(struct socket *sock) { struct sock *sk = sock->sk; sock_lock_init_class_and_name(sk, "slock-AF_LOCAL-RPC", &xs_slock_key[0], "sk_lock-AF_LOCAL-RPC", &xs_key[0]); } static inline void xs_reclassify_socket4(struct socket *sock) { struct sock *sk = sock->sk; sock_lock_init_class_and_name(sk, "slock-AF_INET-RPC", &xs_slock_key[1], "sk_lock-AF_INET-RPC", &xs_key[1]); } static inline void xs_reclassify_socket6(struct socket *sock) { struct sock *sk = sock->sk; sock_lock_init_class_and_name(sk, "slock-AF_INET6-RPC", &xs_slock_key[2], "sk_lock-AF_INET6-RPC", &xs_key[2]); } static inline void xs_reclassify_socket(int family, struct socket *sock) { if (WARN_ON_ONCE(!sock_allow_reclassification(sock->sk))) return; switch (family) { case AF_LOCAL: xs_reclassify_socketu(sock); break; case AF_INET: xs_reclassify_socket4(sock); break; case AF_INET6: xs_reclassify_socket6(sock); break; } } #else static inline void xs_reclassify_socket(int family, struct socket *sock) { } #endif static void xs_dummy_setup_socket(struct work_struct *work) { } static struct socket *xs_create_sock(struct rpc_xprt *xprt, struct sock_xprt *transport, int family, int type, int protocol, bool reuseport) { struct file *filp; struct socket *sock; int err; err = __sock_create(xprt->xprt_net, family, type, protocol, &sock, 1); if (err < 0) { dprintk("RPC: can't create %d transport socket (%d).\n", protocol, -err); goto out; } xs_reclassify_socket(family, sock); if (reuseport) sock_set_reuseport(sock->sk); err = xs_bind(transport, sock); if (err) { sock_release(sock); goto out; } if (protocol == IPPROTO_TCP) { __netns_tracker_free(xprt->xprt_net, &sock->sk->ns_tracker, false); sock->sk->sk_net_refcnt = 1; get_net_track(xprt->xprt_net, &sock->sk->ns_tracker, GFP_KERNEL); sock_inuse_add(xprt->xprt_net, 1); } filp = sock_alloc_file(sock, O_NONBLOCK, NULL); if (IS_ERR(filp)) return ERR_CAST(filp); transport->file = filp; return sock; out: return ERR_PTR(err); } static int xs_local_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); if (!transport->inet) { struct sock *sk = sock->sk; lock_sock(sk); xs_save_old_callbacks(transport, sk); sk->sk_user_data = xprt; sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; sk->sk_state_change = xs_local_state_change; sk->sk_error_report = xs_error_report; sk->sk_use_task_frag = false; xprt_clear_connected(xprt); /* Reset to new socket */ transport->sock = sock; transport->inet = sk; release_sock(sk); } xs_stream_start_connect(transport); return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0); } /** * xs_local_setup_socket - create AF_LOCAL socket, connect to a local endpoint * @transport: socket transport to connect */ static int xs_local_setup_socket(struct sock_xprt *transport) { struct rpc_xprt *xprt = &transport->xprt; struct file *filp; struct socket *sock; int status; status = __sock_create(xprt->xprt_net, AF_LOCAL, SOCK_STREAM, 0, &sock, 1); if (status < 0) { dprintk("RPC: can't create AF_LOCAL " "transport socket (%d).\n", -status); goto out; } xs_reclassify_socket(AF_LOCAL, sock); filp = sock_alloc_file(sock, O_NONBLOCK, NULL); if (IS_ERR(filp)) { status = PTR_ERR(filp); goto out; } transport->file = filp; dprintk("RPC: worker connecting xprt %p via AF_LOCAL to %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); status = xs_local_finish_connecting(xprt, sock); trace_rpc_socket_connect(xprt, sock, status); switch (status) { case 0: dprintk("RPC: xprt %p connected to %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); xprt->stat.connect_count++; xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; xprt_set_connected(xprt); break; case -ENOBUFS: break; case -ENOENT: dprintk("RPC: xprt %p: socket %s does not exist\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); break; case -ECONNREFUSED: dprintk("RPC: xprt %p: connection refused for %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); break; default: printk(KERN_ERR "%s: unhandled error (%d) connecting to %s\n", __func__, -status, xprt->address_strings[RPC_DISPLAY_ADDR]); } out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); return status; } static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); int ret; if (transport->file) goto force_disconnect; if (RPC_IS_ASYNC(task)) { /* * We want the AF_LOCAL connect to be resolved in the * filesystem namespace of the process making the rpc * call. Thus we connect synchronously. * * If we want to support asynchronous AF_LOCAL calls, * we'll need to figure out how to pass a namespace to * connect. */ rpc_task_set_rpc_status(task, -ENOTCONN); goto out_wake; } ret = xs_local_setup_socket(transport); if (ret && !RPC_IS_SOFTCONN(task)) msleep_interruptible(15000); return; force_disconnect: xprt_force_disconnect(xprt); out_wake: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, -ENOTCONN); } #if IS_ENABLED(CONFIG_SUNRPC_SWAP) /* * Note that this should be called with XPRT_LOCKED held, or recv_mutex * held, or when we otherwise know that we have exclusive access to the * socket, to guard against races with xs_reset_transport. */ static void xs_set_memalloc(struct rpc_xprt *xprt) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); /* * If there's no sock, then we have nothing to set. The * reconnecting process will get it for us. */ if (!transport->inet) return; if (atomic_read(&xprt->swapper)) sk_set_memalloc(transport->inet); } /** * xs_enable_swap - Tag this transport as being used for swap. * @xprt: transport to tag * * Take a reference to this transport on behalf of the rpc_clnt, and * optionally mark it for swapping if it wasn't already. */ static int xs_enable_swap(struct rpc_xprt *xprt) { struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt); mutex_lock(&xs->recv_mutex); if (atomic_inc_return(&xprt->swapper) == 1 && xs->inet) sk_set_memalloc(xs->inet); mutex_unlock(&xs->recv_mutex); return 0; } /** * xs_disable_swap - Untag this transport as being used for swap. * @xprt: transport to tag * * Drop a "swapper" reference to this xprt on behalf of the rpc_clnt. If the * swapper refcount goes to 0, untag the socket as a memalloc socket. */ static void xs_disable_swap(struct rpc_xprt *xprt) { struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt); mutex_lock(&xs->recv_mutex); if (atomic_dec_and_test(&xprt->swapper) && xs->inet) sk_clear_memalloc(xs->inet); mutex_unlock(&xs->recv_mutex); } #else static void xs_set_memalloc(struct rpc_xprt *xprt) { } static int xs_enable_swap(struct rpc_xprt *xprt) { return -EINVAL; } static void xs_disable_swap(struct rpc_xprt *xprt) { } #endif static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); if (!transport->inet) { struct sock *sk = sock->sk; lock_sock(sk); xs_save_old_callbacks(transport, sk); sk->sk_user_data = xprt; sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; sk->sk_use_task_frag = false; xprt_set_connected(xprt); /* Reset to new socket */ transport->sock = sock; transport->inet = sk; xs_set_memalloc(xprt); release_sock(sk); } xs_udp_do_set_buffer_size(xprt); xprt->stat.connect_start = jiffies; } static void xs_udp_setup_socket(struct work_struct *work) { struct sock_xprt *transport = container_of(work, struct sock_xprt, connect_worker.work); struct rpc_xprt *xprt = &transport->xprt; struct socket *sock; int status = -EIO; unsigned int pflags = current->flags; if (atomic_read(&xprt->swapper)) current->flags |= PF_MEMALLOC; sock = xs_create_sock(xprt, transport, xs_addr(xprt)->sa_family, SOCK_DGRAM, IPPROTO_UDP, false); if (IS_ERR(sock)) goto out; dprintk("RPC: worker connecting xprt %p via %s to " "%s (port %s)\n", xprt, xprt->address_strings[RPC_DISPLAY_PROTO], xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PORT]); xs_udp_finish_connecting(xprt, sock); trace_rpc_socket_connect(xprt, sock, 0); status = 0; out: xprt_clear_connecting(xprt); xprt_unlock_connect(xprt, transport); xprt_wake_pending_tasks(xprt, status); current_restore_flags(pflags, PF_MEMALLOC); } /** * xs_tcp_shutdown - gracefully shut down a TCP socket * @xprt: transport * * Initiates a graceful shutdown of the TCP socket by calling the * equivalent of shutdown(SHUT_RDWR); */ static void xs_tcp_shutdown(struct rpc_xprt *xprt) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct socket *sock = transport->sock; int skst = transport->inet ? transport->inet->sk_state : TCP_CLOSE; if (sock == NULL) return; if (!xprt->reuseport) { xs_close(xprt); return; } switch (skst) { case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: case TCP_LAST_ACK: break; case TCP_ESTABLISHED: case TCP_CLOSE_WAIT: kernel_sock_shutdown(sock, SHUT_RDWR); trace_rpc_socket_shutdown(xprt, sock); break; default: xs_reset_transport(transport); } } static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct net *net = sock_net(sock->sk); unsigned long connect_timeout; unsigned long syn_retries; unsigned int keepidle; unsigned int keepcnt; unsigned int timeo; unsigned long t; spin_lock(&xprt->transport_lock); keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ); keepcnt = xprt->timeout->to_retries + 1; timeo = jiffies_to_msecs(xprt->timeout->to_initval) * (xprt->timeout->to_retries + 1); clear_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state); spin_unlock(&xprt->transport_lock); /* TCP Keepalive options */ sock_set_keepalive(sock->sk); tcp_sock_set_keepidle(sock->sk, keepidle); tcp_sock_set_keepintvl(sock->sk, keepidle); tcp_sock_set_keepcnt(sock->sk, keepcnt); /* TCP user timeout (see RFC5482) */ tcp_sock_set_user_timeout(sock->sk, timeo); /* Connect timeout */ connect_timeout = max_t(unsigned long, DIV_ROUND_UP(xprt->connect_timeout, HZ), 1); syn_retries = max_t(unsigned long, READ_ONCE(net->ipv4.sysctl_tcp_syn_retries), 1); for (t = 0; t <= syn_retries && (1UL << t) < connect_timeout; t++) ; if (t <= syn_retries) tcp_sock_set_syncnt(sock->sk, t - 1); } static void xs_tcp_do_set_connect_timeout(struct rpc_xprt *xprt, unsigned long connect_timeout) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct rpc_timeout to; unsigned long initval; memcpy(&to, xprt->timeout, sizeof(to)); /* Arbitrary lower limit */ initval = max_t(unsigned long, connect_timeout, XS_TCP_INIT_REEST_TO); to.to_initval = initval; to.to_maxval = initval; to.to_retries = 0; memcpy(&transport->tcp_timeout, &to, sizeof(transport->tcp_timeout)); xprt->timeout = &transport->tcp_timeout; xprt->connect_timeout = connect_timeout; } static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt, unsigned long connect_timeout, unsigned long reconnect_timeout) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); spin_lock(&xprt->transport_lock); if (reconnect_timeout < xprt->max_reconnect_timeout) xprt->max_reconnect_timeout = reconnect_timeout; if (connect_timeout < xprt->connect_timeout) xs_tcp_do_set_connect_timeout(xprt, connect_timeout); set_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state); spin_unlock(&xprt->transport_lock); } static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); if (!transport->inet) { struct sock *sk = sock->sk; /* Avoid temporary address, they are bad for long-lived * connections such as NFS mounts. * RFC4941, section 3.6 suggests that: * Individual applications, which have specific * knowledge about the normal duration of connections, * MAY override this as appropriate. */ if (xs_addr(xprt)->sa_family == PF_INET6) { ip6_sock_set_addr_preferences(sk, IPV6_PREFER_SRC_PUBLIC); } xs_tcp_set_socket_timeouts(xprt, sock); tcp_sock_set_nodelay(sk); lock_sock(sk); xs_save_old_callbacks(transport, sk); sk->sk_user_data = xprt; sk->sk_data_ready = xs_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; sk->sk_error_report = xs_error_report; sk->sk_use_task_frag = false; /* socket options */ sock_reset_flag(sk, SOCK_LINGER); xprt_clear_connected(xprt); /* Reset to new socket */ transport->sock = sock; transport->inet = sk; release_sock(sk); } if (!xprt_bound(xprt)) return -ENOTCONN; xs_set_memalloc(xprt); xs_stream_start_connect(transport); /* Tell the socket layer to start connecting... */ set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); } /** * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint * @work: queued work item * * Invoked by a work queue tasklet. */ static void xs_tcp_setup_socket(struct work_struct *work) { struct sock_xprt *transport = container_of(work, struct sock_xprt, connect_worker.work); struct socket *sock = transport->sock; struct rpc_xprt *xprt = &transport->xprt; int status; unsigned int pflags = current->flags; if (atomic_read(&xprt->swapper)) current->flags |= PF_MEMALLOC; if (xprt_connected(xprt)) goto out; if (test_and_clear_bit(XPRT_SOCK_CONNECT_SENT, &transport->sock_state) || !sock) { xs_reset_transport(transport); sock = xs_create_sock(xprt, transport, xs_addr(xprt)->sa_family, SOCK_STREAM, IPPROTO_TCP, true); if (IS_ERR(sock)) { xprt_wake_pending_tasks(xprt, PTR_ERR(sock)); goto out; } } dprintk("RPC: worker connecting xprt %p via %s to " "%s (port %s)\n", xprt, xprt->address_strings[RPC_DISPLAY_PROTO], xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PORT]); status = xs_tcp_finish_connecting(xprt, sock); trace_rpc_socket_connect(xprt, sock, status); dprintk("RPC: %p connect status %d connected %d sock state %d\n", xprt, -status, xprt_connected(xprt), sock->sk->sk_state); switch (status) { case 0: case -EINPROGRESS: /* SYN_SENT! */ set_bit(XPRT_SOCK_CONNECT_SENT, &transport->sock_state); if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; fallthrough; case -EALREADY: goto out_unlock; case -EADDRNOTAVAIL: /* Source port number is unavailable. Try a new one! */ transport->srcport = 0; status = -EAGAIN; break; case -EPERM: /* Happens, for instance, if a BPF program is preventing * the connect. Remap the error so upper layers can better * deal with it. */ status = -ECONNREFUSED; fallthrough; case -EINVAL: /* Happens, for instance, if the user specified a link * local IPv6 address without a scope-id. */ case -ECONNREFUSED: case -ECONNRESET: case -ENETDOWN: case -ENETUNREACH: case -EHOSTUNREACH: case -EADDRINUSE: case -ENOBUFS: case -ENOTCONN: break; default: printk("%s: connect returned unhandled error %d\n", __func__, status); status = -EAGAIN; } /* xs_tcp_force_close() wakes tasks with a fixed error code. * We need to wake them first to ensure the correct error code. */ xprt_wake_pending_tasks(xprt, status); xs_tcp_force_close(xprt); out: xprt_clear_connecting(xprt); out_unlock: xprt_unlock_connect(xprt, transport); current_restore_flags(pflags, PF_MEMALLOC); } /* * Transfer the connected socket to @upper_transport, then mark that * xprt CONNECTED. */ static int xs_tcp_tls_finish_connecting(struct rpc_xprt *lower_xprt, struct sock_xprt *upper_transport) { struct sock_xprt *lower_transport = container_of(lower_xprt, struct sock_xprt, xprt); struct rpc_xprt *upper_xprt = &upper_transport->xprt; if (!upper_transport->inet) { struct socket *sock = lower_transport->sock; struct sock *sk = sock->sk; /* Avoid temporary address, they are bad for long-lived * connections such as NFS mounts. * RFC4941, section 3.6 suggests that: * Individual applications, which have specific * knowledge about the normal duration of connections, * MAY override this as appropriate. */ if (xs_addr(upper_xprt)->sa_family == PF_INET6) ip6_sock_set_addr_preferences(sk, IPV6_PREFER_SRC_PUBLIC); xs_tcp_set_socket_timeouts(upper_xprt, sock); tcp_sock_set_nodelay(sk); lock_sock(sk); /* @sk is already connected, so it now has the RPC callbacks. * Reach into @lower_transport to save the original ones. */ upper_transport->old_data_ready = lower_transport->old_data_ready; upper_transport->old_state_change = lower_transport->old_state_change; upper_transport->old_write_space = lower_transport->old_write_space; upper_transport->old_error_report = lower_transport->old_error_report; sk->sk_user_data = upper_xprt; /* socket options */ sock_reset_flag(sk, SOCK_LINGER); xprt_clear_connected(upper_xprt); upper_transport->sock = sock; upper_transport->inet = sk; upper_transport->file = lower_transport->file; release_sock(sk); /* Reset lower_transport before shutting down its clnt */ mutex_lock(&lower_transport->recv_mutex); lower_transport->inet = NULL; lower_transport->sock = NULL; lower_transport->file = NULL; xprt_clear_connected(lower_xprt); xs_sock_reset_connection_flags(lower_xprt); xs_stream_reset_connect(lower_transport); mutex_unlock(&lower_transport->recv_mutex); } if (!xprt_bound(upper_xprt)) return -ENOTCONN; xs_set_memalloc(upper_xprt); if (!xprt_test_and_set_connected(upper_xprt)) { upper_xprt->connect_cookie++; clear_bit(XPRT_SOCK_CONNECTING, &upper_transport->sock_state); xprt_clear_connecting(upper_xprt); upper_xprt->stat.connect_count++; upper_xprt->stat.connect_time += (long)jiffies - upper_xprt->stat.connect_start; xs_run_error_worker(upper_transport, XPRT_SOCK_WAKE_PENDING); } return 0; } /** * xs_tls_handshake_done - TLS handshake completion handler * @data: address of xprt to wake * @status: status of handshake * @peerid: serial number of key containing the remote's identity * */ static void xs_tls_handshake_done(void *data, int status, key_serial_t peerid) { struct rpc_xprt *lower_xprt = data; struct sock_xprt *lower_transport = container_of(lower_xprt, struct sock_xprt, xprt); lower_transport->xprt_err = status ? -EACCES : 0; complete(&lower_transport->handshake_done); xprt_put(lower_xprt); } static int xs_tls_handshake_sync(struct rpc_xprt *lower_xprt, struct xprtsec_parms *xprtsec) { struct sock_xprt *lower_transport = container_of(lower_xprt, struct sock_xprt, xprt); struct tls_handshake_args args = { .ta_sock = lower_transport->sock, .ta_done = xs_tls_handshake_done, .ta_data = xprt_get(lower_xprt), .ta_peername = lower_xprt->servername, }; struct sock *sk = lower_transport->inet; int rc; init_completion(&lower_transport->handshake_done); set_bit(XPRT_SOCK_IGNORE_RECV, &lower_transport->sock_state); lower_transport->xprt_err = -ETIMEDOUT; switch (xprtsec->policy) { case RPC_XPRTSEC_TLS_ANON: rc = tls_client_hello_anon(&args, GFP_KERNEL); if (rc) goto out_put_xprt; break; case RPC_XPRTSEC_TLS_X509: args.ta_my_cert = xprtsec->cert_serial; args.ta_my_privkey = xprtsec->privkey_serial; rc = tls_client_hello_x509(&args, GFP_KERNEL); if (rc) goto out_put_xprt; break; default: rc = -EACCES; goto out_put_xprt; } rc = wait_for_completion_interruptible_timeout(&lower_transport->handshake_done, XS_TLS_HANDSHAKE_TO); if (rc <= 0) { tls_handshake_cancel(sk); if (rc == 0) rc = -ETIMEDOUT; goto out_put_xprt; } rc = lower_transport->xprt_err; out: xs_stream_reset_connect(lower_transport); clear_bit(XPRT_SOCK_IGNORE_RECV, &lower_transport->sock_state); return rc; out_put_xprt: xprt_put(lower_xprt); goto out; } /** * xs_tcp_tls_setup_socket - establish a TLS session on a TCP socket * @work: queued work item * * Invoked by a work queue tasklet. * * For RPC-with-TLS, there is a two-stage connection process. * * The "upper-layer xprt" is visible to the RPC consumer. Once it has * been marked connected, the consumer knows that a TCP connection and * a TLS session have been established. * * A "lower-layer xprt", created in this function, handles the mechanics * of connecting the TCP socket, performing the RPC_AUTH_TLS probe, and * then driving the TLS handshake. Once all that is complete, the upper * layer xprt is marked connected. */ static void xs_tcp_tls_setup_socket(struct work_struct *work) { struct sock_xprt *upper_transport = container_of(work, struct sock_xprt, connect_worker.work); struct rpc_clnt *upper_clnt = upper_transport->clnt; struct rpc_xprt *upper_xprt = &upper_transport->xprt; struct rpc_create_args args = { .net = upper_xprt->xprt_net, .protocol = upper_xprt->prot, .address = (struct sockaddr *)&upper_xprt->addr, .addrsize = upper_xprt->addrlen, .timeout = upper_clnt->cl_timeout, .servername = upper_xprt->servername, .program = upper_clnt->cl_program, .prognumber = upper_clnt->cl_prog, .version = upper_clnt->cl_vers, .authflavor = RPC_AUTH_TLS, .cred = upper_clnt->cl_cred, .xprtsec = { .policy = RPC_XPRTSEC_NONE, }, .stats = upper_clnt->cl_stats, }; unsigned int pflags = current->flags; struct rpc_clnt *lower_clnt; struct rpc_xprt *lower_xprt; int status; if (atomic_read(&upper_xprt->swapper)) current->flags |= PF_MEMALLOC; xs_stream_start_connect(upper_transport); /* This implicitly sends an RPC_AUTH_TLS probe */ lower_clnt = rpc_create(&args); if (IS_ERR(lower_clnt)) { trace_rpc_tls_unavailable(upper_clnt, upper_xprt); clear_bit(XPRT_SOCK_CONNECTING, &upper_transport->sock_state); xprt_clear_connecting(upper_xprt); xprt_wake_pending_tasks(upper_xprt, PTR_ERR(lower_clnt)); xs_run_error_worker(upper_transport, XPRT_SOCK_WAKE_PENDING); goto out_unlock; } /* RPC_AUTH_TLS probe was successful. Try a TLS handshake on * the lower xprt. */ rcu_read_lock(); lower_xprt = rcu_dereference(lower_clnt->cl_xprt); rcu_read_unlock(); if (wait_on_bit_lock(&lower_xprt->state, XPRT_LOCKED, TASK_KILLABLE)) goto out_unlock; status = xs_tls_handshake_sync(lower_xprt, &upper_xprt->xprtsec); if (status) { trace_rpc_tls_not_started(upper_clnt, upper_xprt); goto out_close; } status = xs_tcp_tls_finish_connecting(lower_xprt, upper_transport); if (status) goto out_close; xprt_release_write(lower_xprt, NULL); trace_rpc_socket_connect(upper_xprt, upper_transport->sock, 0); if (!xprt_test_and_set_connected(upper_xprt)) { upper_xprt->connect_cookie++; clear_bit(XPRT_SOCK_CONNECTING, &upper_transport->sock_state); xprt_clear_connecting(upper_xprt); upper_xprt->stat.connect_count++; upper_xprt->stat.connect_time += (long)jiffies - upper_xprt->stat.connect_start; xs_run_error_worker(upper_transport, XPRT_SOCK_WAKE_PENDING); } rpc_shutdown_client(lower_clnt); out_unlock: current_restore_flags(pflags, PF_MEMALLOC); upper_transport->clnt = NULL; xprt_unlock_connect(upper_xprt, upper_transport); return; out_close: xprt_release_write(lower_xprt, NULL); rpc_shutdown_client(lower_clnt); /* xprt_force_disconnect() wakes tasks with a fixed tk_status code. * Wake them first here to ensure they get our tk_status code. */ xprt_wake_pending_tasks(upper_xprt, status); xs_tcp_force_close(upper_xprt); xprt_clear_connecting(upper_xprt); goto out_unlock; } /** * xs_connect - connect a socket to a remote endpoint * @xprt: pointer to transport structure * @task: address of RPC task that manages state of connect request * * TCP: If the remote end dropped the connection, delay reconnecting. * * UDP socket connects are synchronous, but we use a work queue anyway * to guarantee that even unprivileged user processes can set up a * socket on a privileged port. * * If a UDP socket connect fails, the delay behavior here prevents * retry floods (hard mounts). */ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); unsigned long delay = 0; WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport)); if (transport->sock != NULL) { dprintk("RPC: xs_connect delayed xprt %p for %lu " "seconds\n", xprt, xprt->reestablish_timeout / HZ); delay = xprt_reconnect_delay(xprt); xprt_reconnect_backoff(xprt, XS_TCP_INIT_REEST_TO); } else dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); transport->clnt = task->tk_client; queue_delayed_work(xprtiod_workqueue, &transport->connect_worker, delay); } static void xs_wake_disconnect(struct sock_xprt *transport) { if (test_and_clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state)) xs_tcp_force_close(&transport->xprt); } static void xs_wake_write(struct sock_xprt *transport) { if (test_and_clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state)) xprt_write_space(&transport->xprt); } static void xs_wake_error(struct sock_xprt *transport) { int sockerr; if (!test_and_clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) return; sockerr = xchg(&transport->xprt_err, 0); if (sockerr < 0) { xprt_wake_pending_tasks(&transport->xprt, sockerr); xs_tcp_force_close(&transport->xprt); } } static void xs_wake_pending(struct sock_xprt *transport) { if (test_and_clear_bit(XPRT_SOCK_WAKE_PENDING, &transport->sock_state)) xprt_wake_pending_tasks(&transport->xprt, -EAGAIN); } static void xs_error_handle(struct work_struct *work) { struct sock_xprt *transport = container_of(work, struct sock_xprt, error_worker); xs_wake_disconnect(transport); xs_wake_write(transport); xs_wake_error(transport); xs_wake_pending(transport); } /** * xs_local_print_stats - display AF_LOCAL socket-specific stats * @xprt: rpc_xprt struct containing statistics * @seq: output file * */ static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) { long idle_time = 0; if (xprt_connected(xprt)) idle_time = (long)(jiffies - xprt->last_used) / HZ; seq_printf(seq, "\txprt:\tlocal %lu %lu %lu %ld %lu %lu %lu " "%llu %llu %lu %llu %llu\n", xprt->stat.bind_count, xprt->stat.connect_count, xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, xprt->stat.bad_xids, xprt->stat.req_u, xprt->stat.bklog_u, xprt->stat.max_slots, xprt->stat.sending_u, xprt->stat.pending_u); } /** * xs_udp_print_stats - display UDP socket-specific stats * @xprt: rpc_xprt struct containing statistics * @seq: output file * */ static void xs_udp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); seq_printf(seq, "\txprt:\tudp %u %lu %lu %lu %lu %llu %llu " "%lu %llu %llu\n", transport->srcport, xprt->stat.bind_count, xprt->stat.sends, xprt->stat.recvs, xprt->stat.bad_xids, xprt->stat.req_u, xprt->stat.bklog_u, xprt->stat.max_slots, xprt->stat.sending_u, xprt->stat.pending_u); } /** * xs_tcp_print_stats - display TCP socket-specific stats * @xprt: rpc_xprt struct containing statistics * @seq: output file * */ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); long idle_time = 0; if (xprt_connected(xprt)) idle_time = (long)(jiffies - xprt->last_used) / HZ; seq_printf(seq, "\txprt:\ttcp %u %lu %lu %lu %ld %lu %lu %lu " "%llu %llu %lu %llu %llu\n", transport->srcport, xprt->stat.bind_count, xprt->stat.connect_count, xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, xprt->stat.bad_xids, xprt->stat.req_u, xprt->stat.bklog_u, xprt->stat.max_slots, xprt->stat.sending_u, xprt->stat.pending_u); } /* * Allocate a bunch of pages for a scratch buffer for the rpc code. The reason * we allocate pages instead doing a kmalloc like rpc_malloc is because we want * to use the server side send routines. */ static int bc_malloc(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; size_t size = rqst->rq_callsize; struct page *page; struct rpc_buffer *buf; if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) { WARN_ONCE(1, "xprtsock: large bc buffer request (size %zu)\n", size); return -EINVAL; } page = alloc_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); if (!page) return -ENOMEM; buf = page_address(page); buf->len = PAGE_SIZE; rqst->rq_buffer = buf->data; rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize; return 0; } /* * Free the space allocated in the bc_alloc routine */ static void bc_free(struct rpc_task *task) { void *buffer = task->tk_rqstp->rq_buffer; struct rpc_buffer *buf; buf = container_of(buffer, struct rpc_buffer, data); free_page((unsigned long)buf); } static int bc_sendto(struct rpc_rqst *req) { struct xdr_buf *xdr = &req->rq_snd_buf; struct sock_xprt *transport = container_of(req->rq_xprt, struct sock_xprt, xprt); struct msghdr msg = { .msg_flags = 0, }; rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | (u32)xdr->len); unsigned int sent = 0; int err; req->rq_xtime = ktime_get(); err = xdr_alloc_bvec(xdr, rpc_task_gfp_mask()); if (err < 0) return err; err = xprt_sock_sendmsg(transport->sock, &msg, xdr, 0, marker, &sent); xdr_free_bvec(xdr); if (err < 0 || sent != (xdr->len + sizeof(marker))) return -EAGAIN; return sent; } /** * bc_send_request - Send a backchannel Call on a TCP socket * @req: rpc_rqst containing Call message to be sent * * xpt_mutex ensures @rqstp's whole message is written to the socket * without interruption. * * Return values: * %0 if the message was sent successfully * %ENOTCONN if the message was not sent */ static int bc_send_request(struct rpc_rqst *req) { struct svc_xprt *xprt; int len; /* * Get the server socket associated with this callback xprt */ xprt = req->rq_xprt->bc_xprt; /* * Grab the mutex to serialize data as the connection is shared * with the fore channel */ mutex_lock(&xprt->xpt_mutex); if (test_bit(XPT_DEAD, &xprt->xpt_flags)) len = -ENOTCONN; else len = bc_sendto(req); mutex_unlock(&xprt->xpt_mutex); if (len > 0) len = 0; return len; } static void bc_close(struct rpc_xprt *xprt) { xprt_disconnect_done(xprt); } static void bc_destroy(struct rpc_xprt *xprt) { dprintk("RPC: bc_destroy xprt %p\n", xprt); xs_xprt_free(xprt); module_put(THIS_MODULE); } static const struct rpc_xprt_ops xs_local_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, .alloc_slot = xprt_alloc_slot, .free_slot = xprt_free_slot, .rpcbind = xs_local_rpcbind, .set_port = xs_local_set_port, .connect = xs_local_connect, .buf_alloc = rpc_malloc, .buf_free = rpc_free, .prepare_request = xs_stream_prepare_request, .send_request = xs_local_send_request, .abort_send_request = xs_stream_abort_send_request, .wait_for_reply_request = xprt_wait_for_reply_request_def, .close = xs_close, .destroy = xs_destroy, .print_stats = xs_local_print_stats, .enable_swap = xs_enable_swap, .disable_swap = xs_disable_swap, }; static const struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, .alloc_slot = xprt_alloc_slot, .free_slot = xprt_free_slot, .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, .get_srcaddr = xs_sock_srcaddr, .get_srcport = xs_sock_srcport, .buf_alloc = rpc_malloc, .buf_free = rpc_free, .send_request = xs_udp_send_request, .wait_for_reply_request = xprt_wait_for_reply_request_rtt, .timer = xs_udp_timer, .release_request = xprt_release_rqst_cong, .close = xs_close, .destroy = xs_destroy, .print_stats = xs_udp_print_stats, .enable_swap = xs_enable_swap, .disable_swap = xs_disable_swap, .inject_disconnect = xs_inject_disconnect, }; static const struct rpc_xprt_ops xs_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, .alloc_slot = xprt_alloc_slot, .free_slot = xprt_free_slot, .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, .get_srcaddr = xs_sock_srcaddr, .get_srcport = xs_sock_srcport, .buf_alloc = rpc_malloc, .buf_free = rpc_free, .prepare_request = xs_stream_prepare_request, .send_request = xs_tcp_send_request, .abort_send_request = xs_stream_abort_send_request, .wait_for_reply_request = xprt_wait_for_reply_request_def, .close = xs_tcp_shutdown, .destroy = xs_destroy, .set_connect_timeout = xs_tcp_set_connect_timeout, .print_stats = xs_tcp_print_stats, .enable_swap = xs_enable_swap, .disable_swap = xs_disable_swap, .inject_disconnect = xs_inject_disconnect, #ifdef CONFIG_SUNRPC_BACKCHANNEL .bc_setup = xprt_setup_bc, .bc_maxpayload = xs_tcp_bc_maxpayload, .bc_num_slots = xprt_bc_max_slots, .bc_free_rqst = xprt_free_bc_rqst, .bc_destroy = xprt_destroy_bc, #endif }; /* * The rpc_xprt_ops for the server backchannel */ static const struct rpc_xprt_ops bc_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, .alloc_slot = xprt_alloc_slot, .free_slot = xprt_free_slot, .buf_alloc = bc_malloc, .buf_free = bc_free, .send_request = bc_send_request, .wait_for_reply_request = xprt_wait_for_reply_request_def, .close = bc_close, .destroy = bc_destroy, .print_stats = xs_tcp_print_stats, .enable_swap = xs_enable_swap, .disable_swap = xs_disable_swap, .inject_disconnect = xs_inject_disconnect, }; static int xs_init_anyaddr(const int family, struct sockaddr *sap) { static const struct sockaddr_in sin = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), }; static const struct sockaddr_in6 sin6 = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_ANY_INIT, }; switch (family) { case AF_LOCAL: break; case AF_INET: memcpy(sap, &sin, sizeof(sin)); break; case AF_INET6: memcpy(sap, &sin6, sizeof(sin6)); break; default: dprintk("RPC: %s: Bad address family\n", __func__); return -EAFNOSUPPORT; } return 0; } static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args, unsigned int slot_table_size, unsigned int max_slot_table_size) { struct rpc_xprt *xprt; struct sock_xprt *new; if (args->addrlen > sizeof(xprt->addr)) { dprintk("RPC: xs_setup_xprt: address too large\n"); return ERR_PTR(-EBADF); } xprt = xprt_alloc(args->net, sizeof(*new), slot_table_size, max_slot_table_size); if (xprt == NULL) { dprintk("RPC: xs_setup_xprt: couldn't allocate " "rpc_xprt\n"); return ERR_PTR(-ENOMEM); } new = container_of(xprt, struct sock_xprt, xprt); mutex_init(&new->recv_mutex); memcpy(&xprt->addr, args->dstaddr, args->addrlen); xprt->addrlen = args->addrlen; if (args->srcaddr) memcpy(&new->srcaddr, args->srcaddr, args->addrlen); else { int err; err = xs_init_anyaddr(args->dstaddr->sa_family, (struct sockaddr *)&new->srcaddr); if (err != 0) { xprt_free(xprt); return ERR_PTR(err); } } return xprt; } static const struct rpc_timeout xs_local_default_timeout = { .to_initval = 10 * HZ, .to_maxval = 10 * HZ, .to_retries = 2, }; /** * xs_setup_local - Set up transport to use an AF_LOCAL socket * @args: rpc transport creation arguments * * AF_LOCAL is a "tpi_cots_ord" transport, just like TCP */ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) { struct sockaddr_un *sun = (struct sockaddr_un *)args->dstaddr; struct sock_xprt *transport; struct rpc_xprt *xprt; struct rpc_xprt *ret; xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries, xprt_max_tcp_slot_table_entries); if (IS_ERR(xprt)) return xprt; transport = container_of(xprt, struct sock_xprt, xprt); xprt->prot = 0; xprt->xprt_class = &xs_local_transport; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; xprt->bind_timeout = XS_BIND_TO; xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; xprt->idle_timeout = XS_IDLE_DISC_TO; xprt->ops = &xs_local_ops; xprt->timeout = &xs_local_default_timeout; INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); INIT_WORK(&transport->error_worker, xs_error_handle); INIT_DELAYED_WORK(&transport->connect_worker, xs_dummy_setup_socket); switch (sun->sun_family) { case AF_LOCAL: if (sun->sun_path[0] != '/' && sun->sun_path[0] != '\0') { dprintk("RPC: bad AF_LOCAL address: %s\n", sun->sun_path); ret = ERR_PTR(-EINVAL); goto out_err; } xprt_set_bound(xprt); xs_format_peer_addresses(xprt, "local", RPCBIND_NETID_LOCAL); break; default: ret = ERR_PTR(-EAFNOSUPPORT); goto out_err; } dprintk("RPC: set up xprt to %s via AF_LOCAL\n", xprt->address_strings[RPC_DISPLAY_ADDR]); if (try_module_get(THIS_MODULE)) return xprt; ret = ERR_PTR(-EINVAL); out_err: xs_xprt_free(xprt); return ret; } static const struct rpc_timeout xs_udp_default_timeout = { .to_initval = 5 * HZ, .to_maxval = 30 * HZ, .to_increment = 5 * HZ, .to_retries = 5, }; /** * xs_setup_udp - Set up transport to use a UDP socket * @args: rpc transport creation arguments * */ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args) { struct sockaddr *addr = args->dstaddr; struct rpc_xprt *xprt; struct sock_xprt *transport; struct rpc_xprt *ret; xprt = xs_setup_xprt(args, xprt_udp_slot_table_entries, xprt_udp_slot_table_entries); if (IS_ERR(xprt)) return xprt; transport = container_of(xprt, struct sock_xprt, xprt); xprt->prot = IPPROTO_UDP; xprt->xprt_class = &xs_udp_transport; /* XXX: header size can vary due to auth type, IPv6, etc. */ xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); xprt->bind_timeout = XS_BIND_TO; xprt->reestablish_timeout = XS_UDP_REEST_TO; xprt->idle_timeout = XS_IDLE_DISC_TO; xprt->ops = &xs_udp_ops; xprt->timeout = &xs_udp_default_timeout; INIT_WORK(&transport->recv_worker, xs_udp_data_receive_workfn); INIT_WORK(&transport->error_worker, xs_error_handle); INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_setup_socket); switch (addr->sa_family) { case AF_INET: if (((struct sockaddr_in *)addr)->sin_port != htons(0)) xprt_set_bound(xprt); xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP); break; case AF_INET6: if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) xprt_set_bound(xprt); xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP6); break; default: ret = ERR_PTR(-EAFNOSUPPORT); goto out_err; } if (xprt_bound(xprt)) dprintk("RPC: set up xprt to %s (port %s) via %s\n", xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PORT], xprt->address_strings[RPC_DISPLAY_PROTO]); else dprintk("RPC: set up xprt to %s (autobind) via %s\n", xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PROTO]); if (try_module_get(THIS_MODULE)) return xprt; ret = ERR_PTR(-EINVAL); out_err: xs_xprt_free(xprt); return ret; } static const struct rpc_timeout xs_tcp_default_timeout = { .to_initval = 60 * HZ, .to_maxval = 60 * HZ, .to_retries = 2, }; /** * xs_setup_tcp - Set up transport to use a TCP socket * @args: rpc transport creation arguments * */ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) { struct sockaddr *addr = args->dstaddr; struct rpc_xprt *xprt; struct sock_xprt *transport; struct rpc_xprt *ret; unsigned int max_slot_table_size = xprt_max_tcp_slot_table_entries; if (args->flags & XPRT_CREATE_INFINITE_SLOTS) max_slot_table_size = RPC_MAX_SLOT_TABLE_LIMIT; xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries, max_slot_table_size); if (IS_ERR(xprt)) return xprt; transport = container_of(xprt, struct sock_xprt, xprt); xprt->prot = IPPROTO_TCP; xprt->xprt_class = &xs_tcp_transport; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; xprt->bind_timeout = XS_BIND_TO; xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; xprt->idle_timeout = XS_IDLE_DISC_TO; xprt->ops = &xs_tcp_ops; xprt->timeout = &xs_tcp_default_timeout; xprt->max_reconnect_timeout = xprt->timeout->to_maxval; if (args->reconnect_timeout) xprt->max_reconnect_timeout = args->reconnect_timeout; xprt->connect_timeout = xprt->timeout->to_initval * (xprt->timeout->to_retries + 1); if (args->connect_timeout) xs_tcp_do_set_connect_timeout(xprt, args->connect_timeout); INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); INIT_WORK(&transport->error_worker, xs_error_handle); INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket); switch (addr->sa_family) { case AF_INET: if (((struct sockaddr_in *)addr)->sin_port != htons(0)) xprt_set_bound(xprt); xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP); break; case AF_INET6: if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) xprt_set_bound(xprt); xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6); break; default: ret = ERR_PTR(-EAFNOSUPPORT); goto out_err; } if (xprt_bound(xprt)) dprintk("RPC: set up xprt to %s (port %s) via %s\n", xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PORT], xprt->address_strings[RPC_DISPLAY_PROTO]); else dprintk("RPC: set up xprt to %s (autobind) via %s\n", xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PROTO]); if (try_module_get(THIS_MODULE)) return xprt; ret = ERR_PTR(-EINVAL); out_err: xs_xprt_free(xprt); return ret; } /** * xs_setup_tcp_tls - Set up transport to use a TCP with TLS * @args: rpc transport creation arguments * */ static struct rpc_xprt *xs_setup_tcp_tls(struct xprt_create *args) { struct sockaddr *addr = args->dstaddr; struct rpc_xprt *xprt; struct sock_xprt *transport; struct rpc_xprt *ret; unsigned int max_slot_table_size = xprt_max_tcp_slot_table_entries; if (args->flags & XPRT_CREATE_INFINITE_SLOTS) max_slot_table_size = RPC_MAX_SLOT_TABLE_LIMIT; xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries, max_slot_table_size); if (IS_ERR(xprt)) return xprt; transport = container_of(xprt, struct sock_xprt, xprt); xprt->prot = IPPROTO_TCP; xprt->xprt_class = &xs_tcp_transport; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; xprt->bind_timeout = XS_BIND_TO; xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; xprt->idle_timeout = XS_IDLE_DISC_TO; xprt->ops = &xs_tcp_ops; xprt->timeout = &xs_tcp_default_timeout; xprt->max_reconnect_timeout = xprt->timeout->to_maxval; xprt->connect_timeout = xprt->timeout->to_initval * (xprt->timeout->to_retries + 1); INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); INIT_WORK(&transport->error_worker, xs_error_handle); switch (args->xprtsec.policy) { case RPC_XPRTSEC_TLS_ANON: case RPC_XPRTSEC_TLS_X509: xprt->xprtsec = args->xprtsec; INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_tls_setup_socket); break; default: ret = ERR_PTR(-EACCES); goto out_err; } switch (addr->sa_family) { case AF_INET: if (((struct sockaddr_in *)addr)->sin_port != htons(0)) xprt_set_bound(xprt); xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP); break; case AF_INET6: if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) xprt_set_bound(xprt); xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6); break; default: ret = ERR_PTR(-EAFNOSUPPORT); goto out_err; } if (xprt_bound(xprt)) dprintk("RPC: set up xprt to %s (port %s) via %s\n", xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PORT], xprt->address_strings[RPC_DISPLAY_PROTO]); else dprintk("RPC: set up xprt to %s (autobind) via %s\n", xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PROTO]); if (try_module_get(THIS_MODULE)) return xprt; ret = ERR_PTR(-EINVAL); out_err: xs_xprt_free(xprt); return ret; } /** * xs_setup_bc_tcp - Set up transport to use a TCP backchannel socket * @args: rpc transport creation arguments * */ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) { struct sockaddr *addr = args->dstaddr; struct rpc_xprt *xprt; struct sock_xprt *transport; struct svc_sock *bc_sock; struct rpc_xprt *ret; xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries, xprt_tcp_slot_table_entries); if (IS_ERR(xprt)) return xprt; transport = container_of(xprt, struct sock_xprt, xprt); xprt->prot = IPPROTO_TCP; xprt->xprt_class = &xs_bc_tcp_transport; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; xprt->timeout = &xs_tcp_default_timeout; /* backchannel */ xprt_set_bound(xprt); xprt->bind_timeout = 0; xprt->reestablish_timeout = 0; xprt->idle_timeout = 0; xprt->ops = &bc_tcp_ops; switch (addr->sa_family) { case AF_INET: xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP); break; case AF_INET6: xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6); break; default: ret = ERR_PTR(-EAFNOSUPPORT); goto out_err; } dprintk("RPC: set up xprt to %s (port %s) via %s\n", xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PORT], xprt->address_strings[RPC_DISPLAY_PROTO]); /* * Once we've associated a backchannel xprt with a connection, * we want to keep it around as long as the connection lasts, * in case we need to start using it for a backchannel again; * this reference won't be dropped until bc_xprt is destroyed. */ xprt_get(xprt); args->bc_xprt->xpt_bc_xprt = xprt; xprt->bc_xprt = args->bc_xprt; bc_sock = container_of(args->bc_xprt, struct svc_sock, sk_xprt); transport->sock = bc_sock->sk_sock; transport->inet = bc_sock->sk_sk; /* * Since we don't want connections for the backchannel, we set * the xprt status to connected */ xprt_set_connected(xprt); if (try_module_get(THIS_MODULE)) return xprt; args->bc_xprt->xpt_bc_xprt = NULL; args->bc_xprt->xpt_bc_xps = NULL; xprt_put(xprt); ret = ERR_PTR(-EINVAL); out_err: xs_xprt_free(xprt); return ret; } static struct xprt_class xs_local_transport = { .list = LIST_HEAD_INIT(xs_local_transport.list), .name = "named UNIX socket", .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_LOCAL, .setup = xs_setup_local, .netid = { "" }, }; static struct xprt_class xs_udp_transport = { .list = LIST_HEAD_INIT(xs_udp_transport.list), .name = "udp", .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_UDP, .setup = xs_setup_udp, .netid = { "udp", "udp6", "" }, }; static struct xprt_class xs_tcp_transport = { .list = LIST_HEAD_INIT(xs_tcp_transport.list), .name = "tcp", .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_TCP, .setup = xs_setup_tcp, .netid = { "tcp", "tcp6", "" }, }; static struct xprt_class xs_tcp_tls_transport = { .list = LIST_HEAD_INIT(xs_tcp_tls_transport.list), .name = "tcp-with-tls", .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_TCP_TLS, .setup = xs_setup_tcp_tls, .netid = { "tcp", "tcp6", "" }, }; static struct xprt_class xs_bc_tcp_transport = { .list = LIST_HEAD_INIT(xs_bc_tcp_transport.list), .name = "tcp NFSv4.1 backchannel", .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_BC_TCP, .setup = xs_setup_bc_tcp, .netid = { "" }, }; /** * init_socket_xprt - set up xprtsock's sysctls, register with RPC client * */ int init_socket_xprt(void) { if (!sunrpc_table_header) sunrpc_table_header = register_sysctl("sunrpc", xs_tunables_table); xprt_register_transport(&xs_local_transport); xprt_register_transport(&xs_udp_transport); xprt_register_transport(&xs_tcp_transport); xprt_register_transport(&xs_tcp_tls_transport); xprt_register_transport(&xs_bc_tcp_transport); return 0; } /** * cleanup_socket_xprt - remove xprtsock's sysctls, unregister * */ void cleanup_socket_xprt(void) { if (sunrpc_table_header) { unregister_sysctl_table(sunrpc_table_header); sunrpc_table_header = NULL; } xprt_unregister_transport(&xs_local_transport); xprt_unregister_transport(&xs_udp_transport); xprt_unregister_transport(&xs_tcp_transport); xprt_unregister_transport(&xs_tcp_tls_transport); xprt_unregister_transport(&xs_bc_tcp_transport); } static int param_set_portnr(const char *val, const struct kernel_param *kp) { return param_set_uint_minmax(val, kp, RPC_MIN_RESVPORT, RPC_MAX_RESVPORT); } static const struct kernel_param_ops param_ops_portnr = { .set = param_set_portnr, .get = param_get_uint, }; #define param_check_portnr(name, p) \ __param_check(name, p, unsigned int); module_param_named(min_resvport, xprt_min_resvport, portnr, 0644); module_param_named(max_resvport, xprt_max_resvport, portnr, 0644); static int param_set_slot_table_size(const char *val, const struct kernel_param *kp) { return param_set_uint_minmax(val, kp, RPC_MIN_SLOT_TABLE, RPC_MAX_SLOT_TABLE); } static const struct kernel_param_ops param_ops_slot_table_size = { .set = param_set_slot_table_size, .get = param_get_uint, }; #define param_check_slot_table_size(name, p) \ __param_check(name, p, unsigned int); static int param_set_max_slot_table_size(const char *val, const struct kernel_param *kp) { return param_set_uint_minmax(val, kp, RPC_MIN_SLOT_TABLE, RPC_MAX_SLOT_TABLE_LIMIT); } static const struct kernel_param_ops param_ops_max_slot_table_size = { .set = param_set_max_slot_table_size, .get = param_get_uint, }; #define param_check_max_slot_table_size(name, p) \ __param_check(name, p, unsigned int); module_param_named(tcp_slot_table_entries, xprt_tcp_slot_table_entries, slot_table_size, 0644); module_param_named(tcp_max_slot_table_entries, xprt_max_tcp_slot_table_entries, max_slot_table_size, 0644); module_param_named(udp_slot_table_entries, xprt_udp_slot_table_entries, slot_table_size, 0644);
3 2 1 1 3 3 2 2 1 3 3 2 1 2 2 1 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 // SPDX-License-Identifier: GPL-2.0-only /* * GHASH: hash function for GCM (Galois/Counter Mode). * * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi> * Copyright (c) 2009 Intel Corp. * Author: Huang Ying <ying.huang@intel.com> */ /* * GHASH is a keyed hash function used in GCM authentication tag generation. * * The original GCM paper [1] presents GHASH as a function GHASH(H, A, C) which * takes a 16-byte hash key H, additional authenticated data A, and a ciphertext * C. It formats A and C into a single byte string X, interprets X as a * polynomial over GF(2^128), and evaluates this polynomial at the point H. * * However, the NIST standard for GCM [2] presents GHASH as GHASH(H, X) where X * is the already-formatted byte string containing both A and C. * * "ghash" in the Linux crypto API uses the 'X' (pre-formatted) convention, * since the API supports only a single data stream per hash. Thus, the * formatting of 'A' and 'C' is done in the "gcm" template, not in "ghash". * * The reason "ghash" is separate from "gcm" is to allow "gcm" to use an * accelerated "ghash" when a standalone accelerated "gcm(aes)" is unavailable. * It is generally inappropriate to use "ghash" for other purposes, since it is * an "ε-almost-XOR-universal hash function", not a cryptographic hash function. * It can only be used securely in crypto modes specially designed to use it. * * [1] The Galois/Counter Mode of Operation (GCM) * (http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.694.695&rep=rep1&type=pdf) * [2] Recommendation for Block Cipher Modes of Operation: Galois/Counter Mode (GCM) and GMAC * (https://csrc.nist.gov/publications/detail/sp/800-38d/final) */ #include <crypto/algapi.h> #include <crypto/gf128mul.h> #include <crypto/ghash.h> #include <crypto/internal/hash.h> #include <linux/crypto.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> static int ghash_init(struct shash_desc *desc) { struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); memset(dctx, 0, sizeof(*dctx)); return 0; } static int ghash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct ghash_ctx *ctx = crypto_shash_ctx(tfm); be128 k; if (keylen != GHASH_BLOCK_SIZE) return -EINVAL; if (ctx->gf128) gf128mul_free_4k(ctx->gf128); BUILD_BUG_ON(sizeof(k) != GHASH_BLOCK_SIZE); memcpy(&k, key, GHASH_BLOCK_SIZE); /* avoid violating alignment rules */ ctx->gf128 = gf128mul_init_4k_lle(&k); memzero_explicit(&k, GHASH_BLOCK_SIZE); if (!ctx->gf128) return -ENOMEM; return 0; } static int ghash_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); u8 *dst = dctx->buffer; if (dctx->bytes) { int n = min(srclen, dctx->bytes); u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes); dctx->bytes -= n; srclen -= n; while (n--) *pos++ ^= *src++; if (!dctx->bytes) gf128mul_4k_lle((be128 *)dst, ctx->gf128); } while (srclen >= GHASH_BLOCK_SIZE) { crypto_xor(dst, src, GHASH_BLOCK_SIZE); gf128mul_4k_lle((be128 *)dst, ctx->gf128); src += GHASH_BLOCK_SIZE; srclen -= GHASH_BLOCK_SIZE; } if (srclen) { dctx->bytes = GHASH_BLOCK_SIZE - srclen; while (srclen--) *dst++ ^= *src++; } return 0; } static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx) { u8 *dst = dctx->buffer; if (dctx->bytes) { u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes); while (dctx->bytes--) *tmp++ ^= 0; gf128mul_4k_lle((be128 *)dst, ctx->gf128); } dctx->bytes = 0; } static int ghash_final(struct shash_desc *desc, u8 *dst) { struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); u8 *buf = dctx->buffer; ghash_flush(ctx, dctx); memcpy(dst, buf, GHASH_BLOCK_SIZE); return 0; } static void ghash_exit_tfm(struct crypto_tfm *tfm) { struct ghash_ctx *ctx = crypto_tfm_ctx(tfm); if (ctx->gf128) gf128mul_free_4k(ctx->gf128); } static struct shash_alg ghash_alg = { .digestsize = GHASH_DIGEST_SIZE, .init = ghash_init, .update = ghash_update, .final = ghash_final, .setkey = ghash_setkey, .descsize = sizeof(struct ghash_desc_ctx), .base = { .cra_name = "ghash", .cra_driver_name = "ghash-generic", .cra_priority = 100, .cra_blocksize = GHASH_BLOCK_SIZE, .cra_ctxsize = sizeof(struct ghash_ctx), .cra_module = THIS_MODULE, .cra_exit = ghash_exit_tfm, }, }; static int __init ghash_mod_init(void) { return crypto_register_shash(&ghash_alg); } static void __exit ghash_mod_exit(void) { crypto_unregister_shash(&ghash_alg); } subsys_initcall(ghash_mod_init); module_exit(ghash_mod_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("GHASH hash function"); MODULE_ALIAS_CRYPTO("ghash"); MODULE_ALIAS_CRYPTO("ghash-generic");
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 /* * Key Wrapping: RFC3394 / NIST SP800-38F * * Copyright (C) 2015, Stephan Mueller <smueller@chronox.de> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * ALTERNATIVELY, this product may be distributed under the terms of * the GNU General Public License, in which case the provisions of the GPL2 * are required INSTEAD OF the above restrictions. (This clause is * necessary due to a potential bad interaction between the GPL and * the restrictions contained in a BSD-style copyright.) * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ /* * Note for using key wrapping: * * * The result of the encryption operation is the ciphertext starting * with the 2nd semiblock. The first semiblock is provided as the IV. * The IV used to start the encryption operation is the default IV. * * * The input for the decryption is the first semiblock handed in as an * IV. The ciphertext is the data starting with the 2nd semiblock. The * return code of the decryption operation will be EBADMSG in case an * integrity error occurs. * * To obtain the full result of an encryption as expected by SP800-38F, the * caller must allocate a buffer of plaintext + 8 bytes: * * unsigned int datalen = ptlen + crypto_skcipher_ivsize(tfm); * u8 data[datalen]; * u8 *iv = data; * u8 *pt = data + crypto_skcipher_ivsize(tfm); * <ensure that pt contains the plaintext of size ptlen> * sg_init_one(&sg, pt, ptlen); * skcipher_request_set_crypt(req, &sg, &sg, ptlen, iv); * * ==> After encryption, data now contains full KW result as per SP800-38F. * * In case of decryption, ciphertext now already has the expected length * and must be segmented appropriately: * * unsigned int datalen = CTLEN; * u8 data[datalen]; * <ensure that data contains full ciphertext> * u8 *iv = data; * u8 *ct = data + crypto_skcipher_ivsize(tfm); * unsigned int ctlen = datalen - crypto_skcipher_ivsize(tfm); * sg_init_one(&sg, ct, ctlen); * skcipher_request_set_crypt(req, &sg, &sg, ctlen, iv); * * ==> After decryption (which hopefully does not return EBADMSG), the ct * pointer now points to the plaintext of size ctlen. * * Note 2: KWP is not implemented as this would defy in-place operation. * If somebody wants to wrap non-aligned data, he should simply pad * the input with zeros to fill it up to the 8 byte boundary. */ #include <linux/module.h> #include <linux/crypto.h> #include <linux/scatterlist.h> #include <crypto/scatterwalk.h> #include <crypto/internal/cipher.h> #include <crypto/internal/skcipher.h> struct crypto_kw_block { #define SEMIBSIZE 8 __be64 A; __be64 R; }; /* * Fast forward the SGL to the "end" length minus SEMIBSIZE. * The start in the SGL defined by the fast-forward is returned with * the walk variable */ static void crypto_kw_scatterlist_ff(struct scatter_walk *walk, struct scatterlist *sg, unsigned int end) { unsigned int skip = 0; /* The caller should only operate on full SEMIBLOCKs. */ BUG_ON(end < SEMIBSIZE); skip = end - SEMIBSIZE; while (sg) { if (sg->length > skip) { scatterwalk_start(walk, sg); scatterwalk_advance(walk, skip); break; } skip -= sg->length; sg = sg_next(sg); } } static int crypto_kw_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_cipher *cipher = skcipher_cipher_simple(tfm); struct crypto_kw_block block; struct scatterlist *src, *dst; u64 t = 6 * ((req->cryptlen) >> 3); unsigned int i; int ret = 0; /* * Require at least 2 semiblocks (note, the 3rd semiblock that is * required by SP800-38F is the IV. */ if (req->cryptlen < (2 * SEMIBSIZE) || req->cryptlen % SEMIBSIZE) return -EINVAL; /* Place the IV into block A */ memcpy(&block.A, req->iv, SEMIBSIZE); /* * src scatterlist is read-only. dst scatterlist is r/w. During the * first loop, src points to req->src and dst to req->dst. For any * subsequent round, the code operates on req->dst only. */ src = req->src; dst = req->dst; for (i = 0; i < 6; i++) { struct scatter_walk src_walk, dst_walk; unsigned int nbytes = req->cryptlen; while (nbytes) { /* move pointer by nbytes in the SGL */ crypto_kw_scatterlist_ff(&src_walk, src, nbytes); /* get the source block */ scatterwalk_copychunks(&block.R, &src_walk, SEMIBSIZE, false); /* perform KW operation: modify IV with counter */ block.A ^= cpu_to_be64(t); t--; /* perform KW operation: decrypt block */ crypto_cipher_decrypt_one(cipher, (u8 *)&block, (u8 *)&block); /* move pointer by nbytes in the SGL */ crypto_kw_scatterlist_ff(&dst_walk, dst, nbytes); /* Copy block->R into place */ scatterwalk_copychunks(&block.R, &dst_walk, SEMIBSIZE, true); nbytes -= SEMIBSIZE; } /* we now start to operate on the dst SGL only */ src = req->dst; dst = req->dst; } /* Perform authentication check */ if (block.A != cpu_to_be64(0xa6a6a6a6a6a6a6a6ULL)) ret = -EBADMSG; memzero_explicit(&block, sizeof(struct crypto_kw_block)); return ret; } static int crypto_kw_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_cipher *cipher = skcipher_cipher_simple(tfm); struct crypto_kw_block block; struct scatterlist *src, *dst; u64 t = 1; unsigned int i; /* * Require at least 2 semiblocks (note, the 3rd semiblock that is * required by SP800-38F is the IV that occupies the first semiblock. * This means that the dst memory must be one semiblock larger than src. * Also ensure that the given data is aligned to semiblock. */ if (req->cryptlen < (2 * SEMIBSIZE) || req->cryptlen % SEMIBSIZE) return -EINVAL; /* * Place the predefined IV into block A -- for encrypt, the caller * does not need to provide an IV, but he needs to fetch the final IV. */ block.A = cpu_to_be64(0xa6a6a6a6a6a6a6a6ULL); /* * src scatterlist is read-only. dst scatterlist is r/w. During the * first loop, src points to req->src and dst to req->dst. For any * subsequent round, the code operates on req->dst only. */ src = req->src; dst = req->dst; for (i = 0; i < 6; i++) { struct scatter_walk src_walk, dst_walk; unsigned int nbytes = req->cryptlen; scatterwalk_start(&src_walk, src); scatterwalk_start(&dst_walk, dst); while (nbytes) { /* get the source block */ scatterwalk_copychunks(&block.R, &src_walk, SEMIBSIZE, false); /* perform KW operation: encrypt block */ crypto_cipher_encrypt_one(cipher, (u8 *)&block, (u8 *)&block); /* perform KW operation: modify IV with counter */ block.A ^= cpu_to_be64(t); t++; /* Copy block->R into place */ scatterwalk_copychunks(&block.R, &dst_walk, SEMIBSIZE, true); nbytes -= SEMIBSIZE; } /* we now start to operate on the dst SGL only */ src = req->dst; dst = req->dst; } /* establish the IV for the caller to pick up */ memcpy(req->iv, &block.A, SEMIBSIZE); memzero_explicit(&block, sizeof(struct crypto_kw_block)); return 0; } static int crypto_kw_create(struct crypto_template *tmpl, struct rtattr **tb) { struct skcipher_instance *inst; struct crypto_alg *alg; int err; inst = skcipher_alloc_instance_simple(tmpl, tb); if (IS_ERR(inst)) return PTR_ERR(inst); alg = skcipher_ialg_simple(inst); err = -EINVAL; /* Section 5.1 requirement for KW */ if (alg->cra_blocksize != sizeof(struct crypto_kw_block)) goto out_free_inst; inst->alg.base.cra_blocksize = SEMIBSIZE; inst->alg.base.cra_alignmask = 0; inst->alg.ivsize = SEMIBSIZE; inst->alg.encrypt = crypto_kw_encrypt; inst->alg.decrypt = crypto_kw_decrypt; err = skcipher_register_instance(tmpl, inst); if (err) { out_free_inst: inst->free(inst); } return err; } static struct crypto_template crypto_kw_tmpl = { .name = "kw", .create = crypto_kw_create, .module = THIS_MODULE, }; static int __init crypto_kw_init(void) { return crypto_register_template(&crypto_kw_tmpl); } static void __exit crypto_kw_exit(void) { crypto_unregister_template(&crypto_kw_tmpl); } subsys_initcall(crypto_kw_init); module_exit(crypto_kw_exit); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Stephan Mueller <smueller@chronox.de>"); MODULE_DESCRIPTION("Key Wrapping (RFC3394 / NIST SP800-38F)"); MODULE_ALIAS_CRYPTO("kw"); MODULE_IMPORT_NS("CRYPTO_INTERNAL");
12 11 1 3 1 1 1 2 2 2 2 2 2 2 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/slab.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/spinlock.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/init.h> static LIST_HEAD(ax25_dev_list); DEFINE_SPINLOCK(ax25_dev_lock); ax25_dev *ax25_addr_ax25dev(ax25_address *addr) { ax25_dev *ax25_dev, *res = NULL; spin_lock_bh(&ax25_dev_lock); list_for_each_entry(ax25_dev, &ax25_dev_list, list) if (ax25cmp(addr, (const ax25_address *)ax25_dev->dev->dev_addr) == 0) { res = ax25_dev; ax25_dev_hold(ax25_dev); break; } spin_unlock_bh(&ax25_dev_lock); return res; } /* * This is called when an interface is brought up. These are * reasonable defaults. */ void ax25_dev_device_up(struct net_device *dev) { ax25_dev *ax25_dev; ax25_dev = kzalloc(sizeof(*ax25_dev), GFP_KERNEL); if (!ax25_dev) { printk(KERN_ERR "AX.25: ax25_dev_device_up - out of memory\n"); return; } refcount_set(&ax25_dev->refcount, 1); ax25_dev->dev = dev; netdev_hold(dev, &ax25_dev->dev_tracker, GFP_KERNEL); ax25_dev->forward = NULL; ax25_dev->device_up = true; ax25_dev->values[AX25_VALUES_IPDEFMODE] = AX25_DEF_IPDEFMODE; ax25_dev->values[AX25_VALUES_AXDEFMODE] = AX25_DEF_AXDEFMODE; ax25_dev->values[AX25_VALUES_BACKOFF] = AX25_DEF_BACKOFF; ax25_dev->values[AX25_VALUES_CONMODE] = AX25_DEF_CONMODE; ax25_dev->values[AX25_VALUES_WINDOW] = AX25_DEF_WINDOW; ax25_dev->values[AX25_VALUES_EWINDOW] = AX25_DEF_EWINDOW; ax25_dev->values[AX25_VALUES_T1] = AX25_DEF_T1; ax25_dev->values[AX25_VALUES_T2] = AX25_DEF_T2; ax25_dev->values[AX25_VALUES_T3] = AX25_DEF_T3; ax25_dev->values[AX25_VALUES_IDLE] = AX25_DEF_IDLE; ax25_dev->values[AX25_VALUES_N2] = AX25_DEF_N2; ax25_dev->values[AX25_VALUES_PACLEN] = AX25_DEF_PACLEN; ax25_dev->values[AX25_VALUES_PROTOCOL] = AX25_DEF_PROTOCOL; #ifdef CONFIG_AX25_DAMA_SLAVE ax25_dev->values[AX25_VALUES_DS_TIMEOUT]= AX25_DEF_DS_TIMEOUT; #endif #if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) ax25_ds_setup_timer(ax25_dev); #endif spin_lock_bh(&ax25_dev_lock); list_add(&ax25_dev->list, &ax25_dev_list); dev->ax25_ptr = ax25_dev; spin_unlock_bh(&ax25_dev_lock); ax25_register_dev_sysctl(ax25_dev); } void ax25_dev_device_down(struct net_device *dev) { ax25_dev *s, *ax25_dev; if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) return; ax25_unregister_dev_sysctl(ax25_dev); spin_lock_bh(&ax25_dev_lock); #ifdef CONFIG_AX25_DAMA_SLAVE timer_shutdown_sync(&ax25_dev->dama.slave_timer); #endif /* * Remove any packet forwarding that points to this device. */ list_for_each_entry(s, &ax25_dev_list, list) if (s->forward == dev) s->forward = NULL; list_for_each_entry(s, &ax25_dev_list, list) { if (s == ax25_dev) { list_del(&s->list); break; } } dev->ax25_ptr = NULL; spin_unlock_bh(&ax25_dev_lock); netdev_put(dev, &ax25_dev->dev_tracker); ax25_dev_put(ax25_dev); } int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) { ax25_dev *ax25_dev, *fwd_dev; if ((ax25_dev = ax25_addr_ax25dev(&fwd->port_from)) == NULL) return -EINVAL; switch (cmd) { case SIOCAX25ADDFWD: fwd_dev = ax25_addr_ax25dev(&fwd->port_to); if (!fwd_dev) { ax25_dev_put(ax25_dev); return -EINVAL; } if (ax25_dev->forward) { ax25_dev_put(fwd_dev); ax25_dev_put(ax25_dev); return -EINVAL; } ax25_dev->forward = fwd_dev->dev; ax25_dev_put(fwd_dev); ax25_dev_put(ax25_dev); break; case SIOCAX25DELFWD: if (!ax25_dev->forward) { ax25_dev_put(ax25_dev); return -EINVAL; } ax25_dev->forward = NULL; ax25_dev_put(ax25_dev); break; default: ax25_dev_put(ax25_dev); return -EINVAL; } return 0; } struct net_device *ax25_fwd_dev(struct net_device *dev) { ax25_dev *ax25_dev; if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) return dev; if (ax25_dev->forward == NULL) return dev; return ax25_dev->forward; } /* * Free all memory associated with device structures. */ void __exit ax25_dev_free(void) { ax25_dev *s, *n; spin_lock_bh(&ax25_dev_lock); list_for_each_entry_safe(s, n, &ax25_dev_list, list) { netdev_put(s->dev, &s->dev_tracker); list_del(&s->list); ax25_dev_put(s); } spin_unlock_bh(&ax25_dev_lock); }
3 3 1 1 2 2 1 2 1 2 2 1 1 1 1 1 2 2 1 1 1 3 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 // SPDX-License-Identifier: GPL-2.0-or-later /* * Glue Code for assembler optimized version of 3DES * * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> * * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/algapi.h> #include <crypto/des.h> #include <crypto/internal/skcipher.h> #include <linux/crypto.h> #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> struct des3_ede_x86_ctx { struct des3_ede_ctx enc; struct des3_ede_ctx dec; }; /* regular block cipher functions */ asmlinkage void des3_ede_x86_64_crypt_blk(const u32 *expkey, u8 *dst, const u8 *src); /* 3-way parallel cipher functions */ asmlinkage void des3_ede_x86_64_crypt_blk_3way(const u32 *expkey, u8 *dst, const u8 *src); static inline void des3_ede_enc_blk(struct des3_ede_x86_ctx *ctx, u8 *dst, const u8 *src) { u32 *enc_ctx = ctx->enc.expkey; des3_ede_x86_64_crypt_blk(enc_ctx, dst, src); } static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst, const u8 *src) { u32 *dec_ctx = ctx->dec.expkey; des3_ede_x86_64_crypt_blk(dec_ctx, dst, src); } static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst, const u8 *src) { u32 *dec_ctx = ctx->dec.expkey; des3_ede_x86_64_crypt_blk_3way(dec_ctx, dst, src); } static void des3_ede_x86_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { des3_ede_enc_blk(crypto_tfm_ctx(tfm), dst, src); } static void des3_ede_x86_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { des3_ede_dec_blk(crypto_tfm_ctx(tfm), dst, src); } static int ecb_crypt(struct skcipher_request *req, const u32 *expkey) { const unsigned int bsize = DES3_EDE_BLOCK_SIZE; struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes)) { u8 *wsrc = walk.src.virt.addr; u8 *wdst = walk.dst.virt.addr; /* Process four block batch */ if (nbytes >= bsize * 3) { do { des3_ede_x86_64_crypt_blk_3way(expkey, wdst, wsrc); wsrc += bsize * 3; wdst += bsize * 3; nbytes -= bsize * 3; } while (nbytes >= bsize * 3); if (nbytes < bsize) goto done; } /* Handle leftovers */ do { des3_ede_x86_64_crypt_blk(expkey, wdst, wsrc); wsrc += bsize; wdst += bsize; nbytes -= bsize; } while (nbytes >= bsize); done: err = skcipher_walk_done(&walk, nbytes); } return err; } static int ecb_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); return ecb_crypt(req, ctx->enc.expkey); } static int ecb_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); return ecb_crypt(req, ctx->dec.expkey); } static unsigned int __cbc_encrypt(struct des3_ede_x86_ctx *ctx, struct skcipher_walk *walk) { unsigned int bsize = DES3_EDE_BLOCK_SIZE; unsigned int nbytes = walk->nbytes; u64 *src = (u64 *)walk->src.virt.addr; u64 *dst = (u64 *)walk->dst.virt.addr; u64 *iv = (u64 *)walk->iv; do { *dst = *src ^ *iv; des3_ede_enc_blk(ctx, (u8 *)dst, (u8 *)dst); iv = dst; src += 1; dst += 1; nbytes -= bsize; } while (nbytes >= bsize); *(u64 *)walk->iv = *iv; return nbytes; } static int cbc_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while (walk.nbytes) { nbytes = __cbc_encrypt(ctx, &walk); err = skcipher_walk_done(&walk, nbytes); } return err; } static unsigned int __cbc_decrypt(struct des3_ede_x86_ctx *ctx, struct skcipher_walk *walk) { unsigned int bsize = DES3_EDE_BLOCK_SIZE; unsigned int nbytes = walk->nbytes; u64 *src = (u64 *)walk->src.virt.addr; u64 *dst = (u64 *)walk->dst.virt.addr; u64 ivs[3 - 1]; u64 last_iv; /* Start of the last block. */ src += nbytes / bsize - 1; dst += nbytes / bsize - 1; last_iv = *src; /* Process four block batch */ if (nbytes >= bsize * 3) { do { nbytes -= bsize * 3 - bsize; src -= 3 - 1; dst -= 3 - 1; ivs[0] = src[0]; ivs[1] = src[1]; des3_ede_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src); dst[1] ^= ivs[0]; dst[2] ^= ivs[1]; nbytes -= bsize; if (nbytes < bsize) goto done; *dst ^= *(src - 1); src -= 1; dst -= 1; } while (nbytes >= bsize * 3); } /* Handle leftovers */ for (;;) { des3_ede_dec_blk(ctx, (u8 *)dst, (u8 *)src); nbytes -= bsize; if (nbytes < bsize) break; *dst ^= *(src - 1); src -= 1; dst -= 1; } done: *dst ^= *(u64 *)walk->iv; *(u64 *)walk->iv = last_iv; return nbytes; } static int cbc_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while (walk.nbytes) { nbytes = __cbc_decrypt(ctx, &walk); err = skcipher_walk_done(&walk, nbytes); } return err; } static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { struct des3_ede_x86_ctx *ctx = crypto_tfm_ctx(tfm); u32 i, j, tmp; int err; err = des3_ede_expand_key(&ctx->enc, key, keylen); if (err == -ENOKEY) { if (crypto_tfm_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS) err = -EINVAL; else err = 0; } if (err) { memset(ctx, 0, sizeof(*ctx)); return err; } /* Fix encryption context for this implementation and form decryption * context. */ j = DES3_EDE_EXPKEY_WORDS - 2; for (i = 0; i < DES3_EDE_EXPKEY_WORDS; i += 2, j -= 2) { tmp = ror32(ctx->enc.expkey[i + 1], 4); ctx->enc.expkey[i + 1] = tmp; ctx->dec.expkey[j + 0] = ctx->enc.expkey[i + 0]; ctx->dec.expkey[j + 1] = tmp; } return 0; } static int des3_ede_x86_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return des3_ede_x86_setkey(&tfm->base, key, keylen); } static struct crypto_alg des3_ede_cipher = { .cra_name = "des3_ede", .cra_driver_name = "des3_ede-asm", .cra_priority = 200, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = DES3_EDE_BLOCK_SIZE, .cra_ctxsize = sizeof(struct des3_ede_x86_ctx), .cra_alignmask = 0, .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = DES3_EDE_KEY_SIZE, .cia_max_keysize = DES3_EDE_KEY_SIZE, .cia_setkey = des3_ede_x86_setkey, .cia_encrypt = des3_ede_x86_encrypt, .cia_decrypt = des3_ede_x86_decrypt, } } }; static struct skcipher_alg des3_ede_skciphers[] = { { .base.cra_name = "ecb(des3_ede)", .base.cra_driver_name = "ecb-des3_ede-asm", .base.cra_priority = 300, .base.cra_blocksize = DES3_EDE_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx), .base.cra_module = THIS_MODULE, .min_keysize = DES3_EDE_KEY_SIZE, .max_keysize = DES3_EDE_KEY_SIZE, .setkey = des3_ede_x86_setkey_skcipher, .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { .base.cra_name = "cbc(des3_ede)", .base.cra_driver_name = "cbc-des3_ede-asm", .base.cra_priority = 300, .base.cra_blocksize = DES3_EDE_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx), .base.cra_module = THIS_MODULE, .min_keysize = DES3_EDE_KEY_SIZE, .max_keysize = DES3_EDE_KEY_SIZE, .ivsize = DES3_EDE_BLOCK_SIZE, .setkey = des3_ede_x86_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, } }; static bool is_blacklisted_cpu(void) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return false; if (boot_cpu_data.x86 == 0x0f) { /* * On Pentium 4, des3_ede-x86_64 is slower than generic C * implementation because use of 64bit rotates (which are really * slow on P4). Therefore blacklist P4s. */ return true; } return false; } static int force; module_param(force, int, 0); MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); static int __init des3_ede_x86_init(void) { int err; if (!force && is_blacklisted_cpu()) { pr_info("des3_ede-x86_64: performance on this CPU would be suboptimal: disabling des3_ede-x86_64.\n"); return -ENODEV; } err = crypto_register_alg(&des3_ede_cipher); if (err) return err; err = crypto_register_skciphers(des3_ede_skciphers, ARRAY_SIZE(des3_ede_skciphers)); if (err) crypto_unregister_alg(&des3_ede_cipher); return err; } static void __exit des3_ede_x86_fini(void) { crypto_unregister_alg(&des3_ede_cipher); crypto_unregister_skciphers(des3_ede_skciphers, ARRAY_SIZE(des3_ede_skciphers)); } module_init(des3_ede_x86_init); module_exit(des3_ede_x86_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized"); MODULE_ALIAS_CRYPTO("des3_ede"); MODULE_ALIAS_CRYPTO("des3_ede-asm"); MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>");
1 72 91 87 17 70 209 54 63 97 29 29 52 54 2 2 21 2 19 13 3 10 4 119 119 111 43 110 42 4 38 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 // SPDX-License-Identifier: GPL-2.0-or-later /* * Synchronous Cryptographic Hash operations. * * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/scatterwalk.h> #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/string.h> #include <net/netlink.h> #include "hash.h" int shash_no_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { return -ENOSYS; } EXPORT_SYMBOL_GPL(shash_no_setkey); static void shash_set_needkey(struct crypto_shash *tfm, struct shash_alg *alg) { if (crypto_shash_alg_needs_key(alg)) crypto_shash_set_flags(tfm, CRYPTO_TFM_NEED_KEY); } int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct shash_alg *shash = crypto_shash_alg(tfm); int err; err = shash->setkey(tfm, key, keylen); if (unlikely(err)) { shash_set_needkey(tfm, shash); return err; } crypto_shash_clear_flags(tfm, CRYPTO_TFM_NEED_KEY); return 0; } EXPORT_SYMBOL_GPL(crypto_shash_setkey); int crypto_shash_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return crypto_shash_alg(desc->tfm)->update(desc, data, len); } EXPORT_SYMBOL_GPL(crypto_shash_update); int crypto_shash_final(struct shash_desc *desc, u8 *out) { return crypto_shash_alg(desc->tfm)->final(desc, out); } EXPORT_SYMBOL_GPL(crypto_shash_final); static int shash_default_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct shash_alg *shash = crypto_shash_alg(desc->tfm); return shash->update(desc, data, len) ?: shash->final(desc, out); } int crypto_shash_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return crypto_shash_alg(desc->tfm)->finup(desc, data, len, out); } EXPORT_SYMBOL_GPL(crypto_shash_finup); static int shash_default_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct shash_alg *shash = crypto_shash_alg(desc->tfm); return shash->init(desc) ?: shash->finup(desc, data, len, out); } int crypto_shash_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct crypto_shash *tfm = desc->tfm; if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; return crypto_shash_alg(tfm)->digest(desc, data, len, out); } EXPORT_SYMBOL_GPL(crypto_shash_digest); int crypto_shash_tfm_digest(struct crypto_shash *tfm, const u8 *data, unsigned int len, u8 *out) { SHASH_DESC_ON_STACK(desc, tfm); int err; desc->tfm = tfm; err = crypto_shash_digest(desc, data, len, out); shash_desc_zero(desc); return err; } EXPORT_SYMBOL_GPL(crypto_shash_tfm_digest); int crypto_shash_export(struct shash_desc *desc, void *out) { struct crypto_shash *tfm = desc->tfm; struct shash_alg *shash = crypto_shash_alg(tfm); if (shash->export) return shash->export(desc, out); memcpy(out, shash_desc_ctx(desc), crypto_shash_descsize(tfm)); return 0; } EXPORT_SYMBOL_GPL(crypto_shash_export); int crypto_shash_import(struct shash_desc *desc, const void *in) { struct crypto_shash *tfm = desc->tfm; struct shash_alg *shash = crypto_shash_alg(tfm); if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; if (shash->import) return shash->import(desc, in); memcpy(shash_desc_ctx(desc), in, crypto_shash_descsize(tfm)); return 0; } EXPORT_SYMBOL_GPL(crypto_shash_import); static void crypto_shash_exit_tfm(struct crypto_tfm *tfm) { struct crypto_shash *hash = __crypto_shash_cast(tfm); struct shash_alg *alg = crypto_shash_alg(hash); alg->exit_tfm(hash); } static int crypto_shash_init_tfm(struct crypto_tfm *tfm) { struct crypto_shash *hash = __crypto_shash_cast(tfm); struct shash_alg *alg = crypto_shash_alg(hash); int err; hash->descsize = alg->descsize; shash_set_needkey(hash, alg); if (alg->exit_tfm) tfm->exit = crypto_shash_exit_tfm; if (!alg->init_tfm) return 0; err = alg->init_tfm(hash); if (err) return err; /* ->init_tfm() may have increased the descsize. */ if (WARN_ON_ONCE(hash->descsize > HASH_MAX_DESCSIZE)) { if (alg->exit_tfm) alg->exit_tfm(hash); return -EINVAL; } return 0; } static void crypto_shash_free_instance(struct crypto_instance *inst) { struct shash_instance *shash = shash_instance(inst); shash->free(shash); } static int __maybe_unused crypto_shash_report( struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_hash rhash; struct shash_alg *salg = __crypto_shash_alg(alg); memset(&rhash, 0, sizeof(rhash)); strscpy(rhash.type, "shash", sizeof(rhash.type)); rhash.blocksize = alg->cra_blocksize; rhash.digestsize = salg->digestsize; return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash); } static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg) { struct shash_alg *salg = __crypto_shash_alg(alg); seq_printf(m, "type : shash\n"); seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); seq_printf(m, "digestsize : %u\n", salg->digestsize); } const struct crypto_type crypto_shash_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_shash_init_tfm, .free = crypto_shash_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_shash_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_shash_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_MASK, .type = CRYPTO_ALG_TYPE_SHASH, .tfmsize = offsetof(struct crypto_shash, base), }; int crypto_grab_shash(struct crypto_shash_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_shash_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_shash); struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_shash_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_shash); int crypto_has_shash(const char *alg_name, u32 type, u32 mask) { return crypto_type_has_alg(alg_name, &crypto_shash_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_has_shash); struct crypto_shash *crypto_clone_shash(struct crypto_shash *hash) { struct crypto_tfm *tfm = crypto_shash_tfm(hash); struct shash_alg *alg = crypto_shash_alg(hash); struct crypto_shash *nhash; int err; if (!crypto_shash_alg_has_setkey(alg)) { tfm = crypto_tfm_get(tfm); if (IS_ERR(tfm)) return ERR_CAST(tfm); return hash; } if (!alg->clone_tfm && (alg->init_tfm || alg->base.cra_init)) return ERR_PTR(-ENOSYS); nhash = crypto_clone_tfm(&crypto_shash_type, tfm); if (IS_ERR(nhash)) return nhash; nhash->descsize = hash->descsize; if (alg->clone_tfm) { err = alg->clone_tfm(nhash, hash); if (err) { crypto_free_shash(nhash); return ERR_PTR(err); } } return nhash; } EXPORT_SYMBOL_GPL(crypto_clone_shash); int hash_prepare_alg(struct hash_alg_common *alg) { struct crypto_alg *base = &alg->base; if (alg->digestsize > HASH_MAX_DIGESTSIZE) return -EINVAL; /* alignmask is not useful for hashes, so it is not supported. */ if (base->cra_alignmask) return -EINVAL; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; return 0; } static int shash_prepare_alg(struct shash_alg *alg) { struct crypto_alg *base = &alg->halg.base; int err; if (alg->descsize > HASH_MAX_DESCSIZE) return -EINVAL; if ((alg->export && !alg->import) || (alg->import && !alg->export)) return -EINVAL; err = hash_prepare_alg(&alg->halg); if (err) return err; base->cra_type = &crypto_shash_type; base->cra_flags |= CRYPTO_ALG_TYPE_SHASH; /* * Handle missing optional functions. For each one we can either * install a default here, or we can leave the pointer as NULL and check * the pointer for NULL in crypto_shash_*(), avoiding an indirect call * when the default behavior is desired. For ->finup and ->digest we * install defaults, since for optimal performance algorithms should * implement these anyway. On the other hand, for ->import and * ->export the common case and best performance comes from the simple * memcpy of the shash_desc_ctx, so when those pointers are NULL we * leave them NULL and provide the memcpy with no indirect call. */ if (!alg->finup) alg->finup = shash_default_finup; if (!alg->digest) alg->digest = shash_default_digest; if (!alg->export) alg->halg.statesize = alg->descsize; if (!alg->setkey) alg->setkey = shash_no_setkey; return 0; } int crypto_register_shash(struct shash_alg *alg) { struct crypto_alg *base = &alg->base; int err; err = shash_prepare_alg(alg); if (err) return err; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_shash); void crypto_unregister_shash(struct shash_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_shash); int crypto_register_shashes(struct shash_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_shash(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_shash(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_shashes); void crypto_unregister_shashes(struct shash_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_shash(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_shashes); int shash_register_instance(struct crypto_template *tmpl, struct shash_instance *inst) { int err; if (WARN_ON(!inst->free)) return -EINVAL; err = shash_prepare_alg(&inst->alg); if (err) return err; return crypto_register_instance(tmpl, shash_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(shash_register_instance); void shash_free_singlespawn_instance(struct shash_instance *inst) { crypto_drop_spawn(shash_instance_ctx(inst)); kfree(inst); } EXPORT_SYMBOL_GPL(shash_free_singlespawn_instance); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Synchronous cryptographic hash type");
45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TLB_H #define _ASM_X86_TLB_H #define tlb_flush tlb_flush static inline void tlb_flush(struct mmu_gather *tlb); #include <asm-generic/tlb.h> static inline void tlb_flush(struct mmu_gather *tlb) { unsigned long start = 0UL, end = TLB_FLUSH_ALL; unsigned int stride_shift = tlb_get_unmap_shift(tlb); if (!tlb->fullmm && !tlb->need_flush_all) { start = tlb->start; end = tlb->end; } flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables); } /* * While x86 architecture in general requires an IPI to perform TLB * shootdown, enablement code for several hypervisors overrides * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing * a hypercall. To keep software pagetable walkers safe in this case we * switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the comment * below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h * for more details. */ static inline void __tlb_remove_table(void *table) { free_page_and_swap_cache(table); } static inline void invlpg(unsigned long addr) { asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); } #endif /* _ASM_X86_TLB_H */
4 4 4 4 4 4 4 4 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/dir.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/dir.c * * Copyright (C) 1991, 1992 Linus Torvalds * * ext4 directory handling functions * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 * * Hash Tree Directory indexing (c) 2001 Daniel Phillips * */ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/slab.h> #include <linux/iversion.h> #include <linux/unicode.h> #include "ext4.h" #include "xattr.h" static int ext4_dx_readdir(struct file *, struct dir_context *); /** * is_dx_dir() - check if a directory is using htree indexing * @inode: directory inode * * Check if the given dir-inode refers to an htree-indexed directory * (or a directory which could potentially get converted to use htree * indexing). * * Return 1 if it is a dx dir, 0 if not */ static int is_dx_dir(struct inode *inode) { struct super_block *sb = inode->i_sb; if (ext4_has_feature_dir_index(inode->i_sb) && ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || ((inode->i_size >> sb->s_blocksize_bits) == 1) || ext4_has_inline_data(inode))) return 1; return 0; } static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de) { /* Check if . or .. , or skip if namelen is 0 */ if ((de->name_len > 0) && (de->name_len <= 2) && (de->name[0] == '.') && (de->name[1] == '.' || de->name[1] == '\0')) return true; /* Check if this is a csum entry */ if (de->file_type == EXT4_FT_DIR_CSUM) return true; return false; } /* * Return 0 if the directory entry is OK, and 1 if there is a problem * * Note: this is the opposite of what ext2 and ext3 historically returned... * * bh passed here can be an inode block or a dir data block, depending * on the inode inline data flag. */ int __ext4_check_dir_entry(const char *function, unsigned int line, struct inode *dir, struct file *filp, struct ext4_dir_entry_2 *de, struct buffer_head *bh, char *buf, int size, unsigned int offset) { const char *error_msg = NULL; const int rlen = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); const int next_offset = ((char *) de - buf) + rlen; bool fake = is_fake_dir_entry(de); bool has_csum = ext4_has_metadata_csum(dir->i_sb); if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir))) error_msg = "rec_len is smaller than minimal"; else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < ext4_dir_rec_len(de->name_len, fake ? NULL : dir))) error_msg = "rec_len is too small for name_len"; else if (unlikely(next_offset > size)) error_msg = "directory entry overrun"; else if (unlikely(next_offset > size - ext4_dir_rec_len(1, has_csum ? NULL : dir) && next_offset != size)) error_msg = "directory entry too close to block end"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; else return 0; if (filp) ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " "inode=%u, rec_len=%d, size=%d fake=%d", error_msg, offset, le32_to_cpu(de->inode), rlen, size, fake); else ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " "inode=%u, rec_len=%d, size=%d fake=%d", error_msg, offset, le32_to_cpu(de->inode), rlen, size, fake); return 1; } static int ext4_readdir(struct file *file, struct dir_context *ctx) { unsigned int offset; int i; struct ext4_dir_entry_2 *de; int err; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); struct dir_private_info *info = file->private_data; err = fscrypt_prepare_readdir(inode); if (err) return err; if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); if (err != ERR_BAD_DX_DIR) return err; /* Can we just clear INDEX flag to ignore htree information? */ if (!ext4_has_metadata_csum(sb)) { /* * We don't set the inode dirty flag since it's not * critical that it gets flushed back to the disk. */ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } } if (ext4_has_inline_data(inode)) { int has_inline_data = 1; err = ext4_read_inline_dir(file, ctx, &has_inline_data); if (has_inline_data) return err; } if (IS_ENCRYPTED(inode)) { err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr); if (err < 0) return err; } while (ctx->pos < inode->i_size) { struct ext4_map_blocks map; if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto errout; } cond_resched(); offset = ctx->pos & (sb->s_blocksize - 1); map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); map.m_len = 1; err = ext4_map_blocks(NULL, inode, &map, 0); if (err == 0) { /* m_len should never be zero but let's avoid * an infinite loop if it somehow is */ if (map.m_len == 0) map.m_len = 1; ctx->pos += map.m_len * sb->s_blocksize; continue; } if (err > 0) { pgoff_t index = map.m_pblk >> (PAGE_SHIFT - inode->i_blkbits); if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_mapping, &file->f_ra, file, index, 1); file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; bh = ext4_bread(NULL, inode, map.m_lblk, 0); if (IS_ERR(bh)) { err = PTR_ERR(bh); bh = NULL; goto errout; } } if (!bh) { /* corrupt size? Maybe no more blocks to read */ if (ctx->pos > inode->i_blocks << 9) break; ctx->pos += sb->s_blocksize - offset; continue; } /* Check the checksum */ if (!buffer_verified(bh) && !ext4_dirblock_csum_verify(inode, bh)) { EXT4_ERROR_FILE(file, 0, "directory fails checksum " "at offset %llu", (unsigned long long)ctx->pos); ctx->pos += sb->s_blocksize - offset; brelse(bh); bh = NULL; continue; } set_buffer_verified(bh); /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ if (!inode_eq_iversion(inode, info->cookie)) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext4_dir_entry_2 *) (bh->b_data + i); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at * least that it is non-zero. A * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) < ext4_dir_rec_len(1, inode)) break; i += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } offset = i; ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; info->cookie = inode_query_iversion(inode); } while (ctx->pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); if (ext4_check_dir_entry(inode, file, de, bh, bh->b_data, bh->b_size, offset)) { /* * On error, skip to the next block */ ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; break; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); if (le32_to_cpu(de->inode)) { if (!IS_ENCRYPTED(inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; } else { int save_len = fstr.len; struct fscrypt_str de_name = FSTR_INIT(de->name, de->name_len); u32 hash; u32 minor_hash; if (IS_CASEFOLDED(inode)) { hash = EXT4_DIRENT_HASH(de); minor_hash = EXT4_DIRENT_MINOR_HASH(de); } else { hash = 0; minor_hash = 0; } /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(inode, hash, minor_hash, &de_name, &fstr); de_name = fstr; fstr.len = save_len; if (err) goto errout; if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; } } ctx->pos += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } if ((ctx->pos < inode->i_size) && !dir_relax_shared(inode)) goto done; brelse(bh); bh = NULL; } done: err = 0; errout: fscrypt_fname_free_buffer(&fstr); brelse(bh); return err; } static inline int is_32bit_api(void) { #ifdef CONFIG_COMPAT return in_compat_syscall(); #else return (BITS_PER_LONG == 32); #endif } /* * These functions convert from the major/minor hash to an f_pos * value for dx directories * * Upper layer (for example NFS) should specify FMODE_32BITHASH or * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted * directly on both 32-bit and 64-bit nodes, under such case, neither * FMODE_32BITHASH nor FMODE_64BITHASH is specified. */ static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return major >> 1; else return ((__u64)(major >> 1) << 32) | (__u64)minor; } static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return (pos << 1) & 0xffffffff; else return ((pos >> 32) << 1) & 0xffffffff; } static inline __u32 pos2min_hash(struct file *filp, loff_t pos) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return 0; else return pos & 0xffffffff; } /* * Return 32- or 64-bit end-of-file for dx directories */ static inline loff_t ext4_get_htree_eof(struct file *filp) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return EXT4_HTREE_EOF_32BIT; else return EXT4_HTREE_EOF_64BIT; } /* * ext4_dir_llseek() calls generic_file_llseek_size to handle htree * directories, where the "offset" is in terms of the filename hash * value instead of the byte offset. * * Because we may return a 64-bit hash that is well beyond offset limits, * we need to pass the max hash as the maximum allowable offset in * the htree directory case. * * For non-htree, ext4_llseek already chooses the proper max offset. */ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; struct dir_private_info *info = file->private_data; int dx_dir = is_dx_dir(inode); loff_t ret, htree_max = ext4_get_htree_eof(file); if (likely(dx_dir)) ret = generic_file_llseek_size(file, offset, whence, htree_max, htree_max); else ret = ext4_llseek(file, offset, whence); info->cookie = inode_peek_iversion(inode) - 1; return ret; } /* * This structure holds the nodes of the red-black tree used to store * the directory entry in hash order. */ struct fname { __u32 hash; __u32 minor_hash; struct rb_node rb_hash; struct fname *next; __u32 inode; __u8 name_len; __u8 file_type; char name[] __counted_by(name_len); }; /* * This function implements a non-recursive way of freeing all of the * nodes in the red-black tree. */ static void free_rb_tree_fname(struct rb_root *root) { struct fname *fname, *next; rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) while (fname) { struct fname *old = fname; fname = fname->next; kfree(old); } *root = RB_ROOT; } static void ext4_htree_init_dir_info(struct file *filp, loff_t pos) { struct dir_private_info *p = filp->private_data; if (is_dx_dir(file_inode(filp)) && !p->initialized) { p->curr_hash = pos2maj_hash(filp, pos); p->curr_minor_hash = pos2min_hash(filp, pos); p->initialized = true; } } void ext4_htree_free_dir_info(struct dir_private_info *p) { free_rb_tree_fname(&p->root); kfree(p); } /* * Given a directory entry, enter it into the fname rb tree. * * When filename encryption is enabled, the dirent will hold the * encrypted filename, while the htree will hold decrypted filename. * The decrypted filename is passed in via ent_name. parameter. */ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent, struct fscrypt_str *ent_name) { struct rb_node **p, *parent = NULL; struct fname *fname, *new_fn; struct dir_private_info *info; info = dir_file->private_data; p = &info->root.rb_node; /* Create and allocate the fname structure */ new_fn = kzalloc(struct_size(new_fn, name, ent_name->len + 1), GFP_KERNEL); if (!new_fn) return -ENOMEM; new_fn->hash = hash; new_fn->minor_hash = minor_hash; new_fn->inode = le32_to_cpu(dirent->inode); new_fn->name_len = ent_name->len; new_fn->file_type = dirent->file_type; memcpy(new_fn->name, ent_name->name, ent_name->len); while (*p) { parent = *p; fname = rb_entry(parent, struct fname, rb_hash); /* * If the hash and minor hash match up, then we put * them on a linked list. This rarely happens... */ if ((new_fn->hash == fname->hash) && (new_fn->minor_hash == fname->minor_hash)) { new_fn->next = fname->next; fname->next = new_fn; return 0; } if (new_fn->hash < fname->hash) p = &(*p)->rb_left; else if (new_fn->hash > fname->hash) p = &(*p)->rb_right; else if (new_fn->minor_hash < fname->minor_hash) p = &(*p)->rb_left; else /* if (new_fn->minor_hash > fname->minor_hash) */ p = &(*p)->rb_right; } rb_link_node(&new_fn->rb_hash, parent, p); rb_insert_color(&new_fn->rb_hash, &info->root); return 0; } /* * This is a helper function for ext4_dx_readdir. It calls filldir * for all entries on the fname linked list. (Normally there is only * one entry on the linked list, unless there are 62 bit hash collisions.) */ static int call_filldir(struct file *file, struct dir_context *ctx, struct fname *fname) { struct dir_private_info *info = file->private_data; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; if (!fname) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " "called with null fname?!?", __func__, __LINE__, inode->i_ino, current->comm); return 0; } ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); while (fname) { if (!dir_emit(ctx, fname->name, fname->name_len, fname->inode, get_dtype(sb, fname->file_type))) { info->extra_fname = fname; return 1; } fname = fname->next; } return 0; } static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) { struct dir_private_info *info = file->private_data; struct inode *inode = file_inode(file); struct fname *fname; int ret = 0; ext4_htree_init_dir_info(file, ctx->pos); if (ctx->pos == ext4_get_htree_eof(file)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ if (info->last_pos != ctx->pos) { free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; info->curr_hash = pos2maj_hash(file, ctx->pos); info->curr_minor_hash = pos2min_hash(file, ctx->pos); } /* * If there are any leftover names on the hash collision * chain, return them first. */ if (info->extra_fname) { if (call_filldir(file, ctx, info->extra_fname)) goto finished; info->extra_fname = NULL; goto next_node; } else if (!info->curr_node) info->curr_node = rb_first(&info->root); while (1) { /* * Fill the rbtree if we have no more entries, * or the inode has changed since we last read in the * cached entries. */ if ((!info->curr_node) || !inode_eq_iversion(inode, info->cookie)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); info->cookie = inode_query_iversion(inode); ret = ext4_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); if (ret < 0) goto finished; if (ret == 0) { ctx->pos = ext4_get_htree_eof(file); break; } info->curr_node = rb_first(&info->root); } fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; if (call_filldir(file, ctx, fname)) break; next_node: info->curr_node = rb_next(info->curr_node); if (info->curr_node) { fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; } else { if (info->next_hash == ~0) { ctx->pos = ext4_get_htree_eof(file); break; } info->curr_hash = info->next_hash; info->curr_minor_hash = 0; } } finished: info->last_pos = ctx->pos; return ret < 0 ? ret : 0; } static int ext4_release_dir(struct inode *inode, struct file *filp) { if (filp->private_data) ext4_htree_free_dir_info(filp->private_data); return 0; } int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size) { struct ext4_dir_entry_2 *de; int rlen; unsigned int offset = 0; char *top; de = buf; top = buf + buf_size; while ((char *) de < top) { if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) return -EFSCORRUPTED; rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } if ((char *) de > top) return -EFSCORRUPTED; return 0; } static int ext4_dir_open(struct inode *inode, struct file *file) { struct dir_private_info *info; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; file->private_data = info; return 0; } const struct file_operations ext4_dir_operations = { .open = ext4_dir_open, .llseek = ext4_dir_llseek, .read = generic_read_dir, .iterate_shared = ext4_readdir, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif .fsync = ext4_sync_file, .release = ext4_release_dir, };
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/kernel.h> #include <linux/interrupt.h> #include <linux/fs.h> #include <linux/types.h> #include <linux/sysctl.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/errno.h> #include <linux/fcntl.h> #include <linux/in.h> #include <linux/if_ether.h> #include <linux/slab.h> #include <asm/io.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/ip.h> #include <net/arp.h> #include <net/ax25.h> #include <net/rose.h> static int rose_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { unsigned char *buff = skb_push(skb, ROSE_MIN_LEN + 2); if (daddr) memcpy(buff + 7, daddr, dev->addr_len); *buff++ = ROSE_GFI | ROSE_Q_BIT; *buff++ = 0x00; *buff++ = ROSE_DATA; *buff++ = 0x7F; *buff++ = AX25_P_IP; if (daddr != NULL) return 37; return -37; } static int rose_set_mac_address(struct net_device *dev, void *addr) { struct sockaddr *sa = addr; int err; if (!memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) return 0; if (dev->flags & IFF_UP) { err = rose_add_loopback_node((rose_address *)sa->sa_data); if (err) return err; rose_del_loopback_node((const rose_address *)dev->dev_addr); } dev_addr_set(dev, sa->sa_data); return 0; } static int rose_open(struct net_device *dev) { int err; err = rose_add_loopback_node((const rose_address *)dev->dev_addr); if (err) return err; netif_start_queue(dev); return 0; } static int rose_close(struct net_device *dev) { netif_stop_queue(dev); rose_del_loopback_node((const rose_address *)dev->dev_addr); return 0; } static netdev_tx_t rose_xmit(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; unsigned int len = skb->len; if (!netif_running(dev)) { printk(KERN_ERR "ROSE: rose_xmit - called when iface is down\n"); return NETDEV_TX_BUSY; } if (!rose_route_frame(skb, NULL)) { dev_kfree_skb(skb); stats->tx_errors++; return NETDEV_TX_OK; } stats->tx_packets++; stats->tx_bytes += len; return NETDEV_TX_OK; } static const struct header_ops rose_header_ops = { .create = rose_header, }; static const struct net_device_ops rose_netdev_ops = { .ndo_open = rose_open, .ndo_stop = rose_close, .ndo_start_xmit = rose_xmit, .ndo_set_mac_address = rose_set_mac_address, }; void rose_setup(struct net_device *dev) { dev->mtu = ROSE_MAX_PACKET_SIZE - 2; dev->netdev_ops = &rose_netdev_ops; dev->header_ops = &rose_header_ops; dev->hard_header_len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN; dev->addr_len = ROSE_ADDR_LEN; dev->type = ARPHRD_ROSE; /* New-style flags. */ dev->flags = IFF_NOARP; }
4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 // SPDX-License-Identifier: GPL-2.0-only #include <linux/init.h> #include <linux/scatterlist.h> #include <linux/mempool.h> #include <linux/slab.h> #define SG_MEMPOOL_NR ARRAY_SIZE(sg_pools) #define SG_MEMPOOL_SIZE 2 struct sg_pool { size_t size; char *name; struct kmem_cache *slab; mempool_t *pool; }; #define SP(x) { .size = x, "sgpool-" __stringify(x) } #if (SG_CHUNK_SIZE < 32) #error SG_CHUNK_SIZE is too small (must be 32 or greater) #endif static struct sg_pool sg_pools[] = { SP(8), SP(16), #if (SG_CHUNK_SIZE > 32) SP(32), #if (SG_CHUNK_SIZE > 64) SP(64), #if (SG_CHUNK_SIZE > 128) SP(128), #if (SG_CHUNK_SIZE > 256) #error SG_CHUNK_SIZE is too large (256 MAX) #endif #endif #endif #endif SP(SG_CHUNK_SIZE) }; #undef SP static inline unsigned int sg_pool_index(unsigned short nents) { unsigned int index; BUG_ON(nents > SG_CHUNK_SIZE); if (nents <= 8) index = 0; else index = get_count_order(nents) - 3; return index; } static void sg_pool_free(struct scatterlist *sgl, unsigned int nents) { struct sg_pool *sgp; sgp = sg_pools + sg_pool_index(nents); mempool_free(sgl, sgp->pool); } static struct scatterlist *sg_pool_alloc(unsigned int nents, gfp_t gfp_mask) { struct sg_pool *sgp; sgp = sg_pools + sg_pool_index(nents); return mempool_alloc(sgp->pool, gfp_mask); } /** * sg_free_table_chained - Free a previously mapped sg table * @table: The sg table header to use * @nents_first_chunk: size of the first_chunk SGL passed to * sg_alloc_table_chained * * Description: * Free an sg table previously allocated and setup with * sg_alloc_table_chained(). * * @nents_first_chunk has to be same with that same parameter passed * to sg_alloc_table_chained(). * **/ void sg_free_table_chained(struct sg_table *table, unsigned nents_first_chunk) { if (table->orig_nents <= nents_first_chunk) return; if (nents_first_chunk == 1) nents_first_chunk = 0; __sg_free_table(table, SG_CHUNK_SIZE, nents_first_chunk, sg_pool_free, table->orig_nents); } EXPORT_SYMBOL_GPL(sg_free_table_chained); /** * sg_alloc_table_chained - Allocate and chain SGLs in an sg table * @table: The sg table header to use * @nents: Number of entries in sg list * @first_chunk: first SGL * @nents_first_chunk: number of the SGL of @first_chunk * * Description: * Allocate and chain SGLs in an sg table. If @nents@ is larger than * @nents_first_chunk a chained sg table will be setup. @first_chunk is * ignored if nents_first_chunk <= 1 because user expects the SGL points * non-chain SGL. * **/ int sg_alloc_table_chained(struct sg_table *table, int nents, struct scatterlist *first_chunk, unsigned nents_first_chunk) { int ret; BUG_ON(!nents); if (first_chunk && nents_first_chunk) { if (nents <= nents_first_chunk) { table->nents = table->orig_nents = nents; sg_init_table(table->sgl, nents); return 0; } } /* User supposes that the 1st SGL includes real entry */ if (nents_first_chunk <= 1) { first_chunk = NULL; nents_first_chunk = 0; } ret = __sg_alloc_table(table, nents, SG_CHUNK_SIZE, first_chunk, nents_first_chunk, GFP_ATOMIC, sg_pool_alloc); if (unlikely(ret)) sg_free_table_chained(table, nents_first_chunk); return ret; } EXPORT_SYMBOL_GPL(sg_alloc_table_chained); static __init int sg_pool_init(void) { int i; for (i = 0; i < SG_MEMPOOL_NR; i++) { struct sg_pool *sgp = sg_pools + i; int size = sgp->size * sizeof(struct scatterlist); sgp->slab = kmem_cache_create(sgp->name, size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!sgp->slab) { printk(KERN_ERR "SG_POOL: can't init sg slab %s\n", sgp->name); goto cleanup_sdb; } sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE, sgp->slab); if (!sgp->pool) { printk(KERN_ERR "SG_POOL: can't init sg mempool %s\n", sgp->name); goto cleanup_sdb; } } return 0; cleanup_sdb: for (i = 0; i < SG_MEMPOOL_NR; i++) { struct sg_pool *sgp = sg_pools + i; mempool_destroy(sgp->pool); kmem_cache_destroy(sgp->slab); } return -ENOMEM; } subsys_initcall(sg_pool_init);
2372 2373 2371 2372 2384 2373 2383 2372 2383 2384 2384 2369 2379 2369 2372 2384 2374 2364 2384 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 // SPDX-License-Identifier: GPL-2.0+ /* * Base port operations for 8250/16550-type serial ports * * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. * Split from 8250_core.c, Copyright (C) 2001 Russell King. * * A note about mapbase / membase * * mapbase is the physical address of the IO port. * membase is an 'ioremapped' cookie. */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/irq.h> #include <linux/console.h> #include <linux/gpio/consumer.h> #include <linux/sysrq.h> #include <linux/delay.h> #include <linux/platform_device.h> #include <linux/tty.h> #include <linux/ratelimit.h> #include <linux/tty_flip.h> #include <linux/serial.h> #include <linux/serial_8250.h> #include <linux/nmi.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/pm_runtime.h> #include <linux/ktime.h> #include <asm/io.h> #include <asm/irq.h> #include "8250.h" /* * Debugging. */ #if 0 #define DEBUG_AUTOCONF(fmt...) printk(fmt) #else #define DEBUG_AUTOCONF(fmt...) do { } while (0) #endif /* * Here we define the default xmit fifo size used for each type of UART. */ static const struct serial8250_config uart_config[] = { [PORT_UNKNOWN] = { .name = "unknown", .fifo_size = 1, .tx_loadsz = 1, }, [PORT_8250] = { .name = "8250", .fifo_size = 1, .tx_loadsz = 1, }, [PORT_16450] = { .name = "16450", .fifo_size = 1, .tx_loadsz = 1, }, [PORT_16550] = { .name = "16550", .fifo_size = 1, .tx_loadsz = 1, }, [PORT_16550A] = { .name = "16550A", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO, }, [PORT_CIRRUS] = { .name = "Cirrus", .fifo_size = 1, .tx_loadsz = 1, }, [PORT_16650] = { .name = "ST16650", .fifo_size = 1, .tx_loadsz = 1, .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, }, [PORT_16650V2] = { .name = "ST16650V2", .fifo_size = 32, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | UART_FCR_T_TRIG_00, .rxtrig_bytes = {8, 16, 24, 28}, .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, }, [PORT_16750] = { .name = "TI16750", .fifo_size = 64, .tx_loadsz = 64, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | UART_FCR7_64BYTE, .rxtrig_bytes = {1, 16, 32, 56}, .flags = UART_CAP_FIFO | UART_CAP_SLEEP | UART_CAP_AFE, }, [PORT_STARTECH] = { .name = "Startech", .fifo_size = 1, .tx_loadsz = 1, }, [PORT_16C950] = { .name = "16C950/954", .fifo_size = 128, .tx_loadsz = 128, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01, .rxtrig_bytes = {16, 32, 112, 120}, /* UART_CAP_EFR breaks billionon CF bluetooth card. */ .flags = UART_CAP_FIFO | UART_CAP_SLEEP, }, [PORT_16654] = { .name = "ST16654", .fifo_size = 64, .tx_loadsz = 32, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | UART_FCR_T_TRIG_10, .rxtrig_bytes = {8, 16, 56, 60}, .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, }, [PORT_16850] = { .name = "XR16850", .fifo_size = 128, .tx_loadsz = 128, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, }, [PORT_RSA] = { .name = "RSA", .fifo_size = 2048, .tx_loadsz = 2048, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11, .flags = UART_CAP_FIFO, }, [PORT_NS16550A] = { .name = "NS16550A", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO | UART_NATSEMI, }, [PORT_XSCALE] = { .name = "XScale", .fifo_size = 32, .tx_loadsz = 32, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO | UART_CAP_UUE | UART_CAP_RTOIE, }, [PORT_OCTEON] = { .name = "OCTEON", .fifo_size = 64, .tx_loadsz = 64, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO, }, [PORT_U6_16550A] = { .name = "U6_16550A", .fifo_size = 64, .tx_loadsz = 64, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, [PORT_TEGRA] = { .name = "Tegra", .fifo_size = 32, .tx_loadsz = 8, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | UART_FCR_T_TRIG_01, .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO | UART_CAP_RTOIE, }, [PORT_XR17D15X] = { .name = "XR17D15X", .fifo_size = 64, .tx_loadsz = 64, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR | UART_CAP_SLEEP, }, [PORT_XR17V35X] = { .name = "XR17V35X", .fifo_size = 256, .tx_loadsz = 256, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11 | UART_FCR_T_TRIG_11, .flags = UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR | UART_CAP_SLEEP, }, [PORT_LPC3220] = { .name = "LPC3220", .fifo_size = 64, .tx_loadsz = 32, .fcr = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00 | UART_FCR_T_TRIG_00, .flags = UART_CAP_FIFO, }, [PORT_BRCM_TRUMANAGE] = { .name = "TruManage", .fifo_size = 1, .tx_loadsz = 1024, .flags = UART_CAP_HFIFO, }, [PORT_8250_CIR] = { .name = "CIR port" }, [PORT_ALTR_16550_F32] = { .name = "Altera 16550 FIFO32", .fifo_size = 32, .tx_loadsz = 32, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 8, 16, 30}, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, [PORT_ALTR_16550_F64] = { .name = "Altera 16550 FIFO64", .fifo_size = 64, .tx_loadsz = 64, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 16, 32, 62}, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, [PORT_ALTR_16550_F128] = { .name = "Altera 16550 FIFO128", .fifo_size = 128, .tx_loadsz = 128, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 32, 64, 126}, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, /* * tx_loadsz is set to 63-bytes instead of 64-bytes to implement * workaround of errata A-008006 which states that tx_loadsz should * be configured less than Maximum supported fifo bytes. */ [PORT_16550A_FSL64] = { .name = "16550A_FSL64", .fifo_size = 64, .tx_loadsz = 63, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | UART_FCR7_64BYTE, .flags = UART_CAP_FIFO | UART_CAP_NOTEMT, }, [PORT_RT2880] = { .name = "Palmchip BK-3103", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO, }, [PORT_DA830] = { .name = "TI DA8xx/66AK2x", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, [PORT_MTK_BTIF] = { .name = "MediaTek BTIF", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT, .flags = UART_CAP_FIFO, }, [PORT_NPCM] = { .name = "Nuvoton 16550", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT, .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO, }, [PORT_SUNIX] = { .name = "Sunix", .fifo_size = 128, .tx_loadsz = 128, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 32, 64, 112}, .flags = UART_CAP_FIFO | UART_CAP_SLEEP, }, [PORT_ASPEED_VUART] = { .name = "ASPEED VUART", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00, .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO, }, [PORT_MCHP16550A] = { .name = "MCHP16550A", .fifo_size = 256, .tx_loadsz = 256, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01, .rxtrig_bytes = {2, 66, 130, 194}, .flags = UART_CAP_FIFO, }, [PORT_BCM7271] = { .name = "Broadcom BCM7271 UART", .fifo_size = 32, .tx_loadsz = 32, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01, .rxtrig_bytes = {1, 8, 16, 30}, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, }; /* Uart divisor latch read */ static u32 default_serial_dl_read(struct uart_8250_port *up) { /* Assign these in pieces to truncate any bits above 7. */ unsigned char dll = serial_in(up, UART_DLL); unsigned char dlm = serial_in(up, UART_DLM); return dll | dlm << 8; } /* Uart divisor latch write */ static void default_serial_dl_write(struct uart_8250_port *up, u32 value) { serial_out(up, UART_DLL, value & 0xff); serial_out(up, UART_DLM, value >> 8 & 0xff); } #ifdef CONFIG_HAS_IOPORT static unsigned int hub6_serial_in(struct uart_port *p, int offset) { offset = offset << p->regshift; outb(p->hub6 - 1 + offset, p->iobase); return inb(p->iobase + 1); } static void hub6_serial_out(struct uart_port *p, int offset, int value) { offset = offset << p->regshift; outb(p->hub6 - 1 + offset, p->iobase); outb(value, p->iobase + 1); } #endif /* CONFIG_HAS_IOPORT */ static unsigned int mem_serial_in(struct uart_port *p, int offset) { offset = offset << p->regshift; return readb(p->membase + offset); } static void mem_serial_out(struct uart_port *p, int offset, int value) { offset = offset << p->regshift; writeb(value, p->membase + offset); } static void mem16_serial_out(struct uart_port *p, int offset, int value) { offset = offset << p->regshift; writew(value, p->membase + offset); } static unsigned int mem16_serial_in(struct uart_port *p, int offset) { offset = offset << p->regshift; return readw(p->membase + offset); } static void mem32_serial_out(struct uart_port *p, int offset, int value) { offset = offset << p->regshift; writel(value, p->membase + offset); } static unsigned int mem32_serial_in(struct uart_port *p, int offset) { offset = offset << p->regshift; return readl(p->membase + offset); } static void mem32be_serial_out(struct uart_port *p, int offset, int value) { offset = offset << p->regshift; iowrite32be(value, p->membase + offset); } static unsigned int mem32be_serial_in(struct uart_port *p, int offset) { offset = offset << p->regshift; return ioread32be(p->membase + offset); } #ifdef CONFIG_HAS_IOPORT static unsigned int io_serial_in(struct uart_port *p, int offset) { offset = offset << p->regshift; return inb(p->iobase + offset); } static void io_serial_out(struct uart_port *p, int offset, int value) { offset = offset << p->regshift; outb(value, p->iobase + offset); } #endif static unsigned int no_serial_in(struct uart_port *p, int offset) { return (unsigned int)-1; } static void no_serial_out(struct uart_port *p, int offset, int value) { } static int serial8250_default_handle_irq(struct uart_port *port); static void set_io_from_upio(struct uart_port *p) { struct uart_8250_port *up = up_to_u8250p(p); up->dl_read = default_serial_dl_read; up->dl_write = default_serial_dl_write; switch (p->iotype) { #ifdef CONFIG_HAS_IOPORT case UPIO_HUB6: p->serial_in = hub6_serial_in; p->serial_out = hub6_serial_out; break; #endif case UPIO_MEM: p->serial_in = mem_serial_in; p->serial_out = mem_serial_out; break; case UPIO_MEM16: p->serial_in = mem16_serial_in; p->serial_out = mem16_serial_out; break; case UPIO_MEM32: p->serial_in = mem32_serial_in; p->serial_out = mem32_serial_out; break; case UPIO_MEM32BE: p->serial_in = mem32be_serial_in; p->serial_out = mem32be_serial_out; break; #ifdef CONFIG_HAS_IOPORT case UPIO_PORT: p->serial_in = io_serial_in; p->serial_out = io_serial_out; break; #endif default: WARN(p->iotype != UPIO_PORT || p->iobase, "Unsupported UART type %x\n", p->iotype); p->serial_in = no_serial_in; p->serial_out = no_serial_out; } /* Remember loaded iotype */ up->cur_iotype = p->iotype; p->handle_irq = serial8250_default_handle_irq; } static void serial_port_out_sync(struct uart_port *p, int offset, int value) { switch (p->iotype) { case UPIO_MEM: case UPIO_MEM16: case UPIO_MEM32: case UPIO_MEM32BE: case UPIO_AU: p->serial_out(p, offset, value); p->serial_in(p, UART_LCR); /* safe, no side-effects */ break; default: p->serial_out(p, offset, value); } } /* * FIFO support. */ static void serial8250_clear_fifos(struct uart_8250_port *p) { if (p->capabilities & UART_CAP_FIFO) { serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO); serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); serial_out(p, UART_FCR, 0); } } static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t); static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t); void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p) { serial8250_clear_fifos(p); serial_out(p, UART_FCR, p->fcr); } EXPORT_SYMBOL_GPL(serial8250_clear_and_reinit_fifos); void serial8250_rpm_get(struct uart_8250_port *p) { if (!(p->capabilities & UART_CAP_RPM)) return; pm_runtime_get_sync(p->port.dev); } EXPORT_SYMBOL_GPL(serial8250_rpm_get); void serial8250_rpm_put(struct uart_8250_port *p) { if (!(p->capabilities & UART_CAP_RPM)) return; pm_runtime_mark_last_busy(p->port.dev); pm_runtime_put_autosuspend(p->port.dev); } EXPORT_SYMBOL_GPL(serial8250_rpm_put); /** * serial8250_em485_init() - put uart_8250_port into rs485 emulating * @p: uart_8250_port port instance * * The function is used to start rs485 software emulating on the * &struct uart_8250_port* @p. Namely, RTS is switched before/after * transmission. The function is idempotent, so it is safe to call it * multiple times. * * The caller MUST enable interrupt on empty shift register before * calling serial8250_em485_init(). This interrupt is not a part of * 8250 standard, but implementation defined. * * The function is supposed to be called from .rs485_config callback * or from any other callback protected with p->port.lock spinlock. * * See also serial8250_em485_destroy() * * Return 0 - success, -errno - otherwise */ static int serial8250_em485_init(struct uart_8250_port *p) { /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&p->port.lock); if (p->em485) goto deassert_rts; p->em485 = kmalloc(sizeof(struct uart_8250_em485), GFP_ATOMIC); if (!p->em485) return -ENOMEM; hrtimer_init(&p->em485->stop_tx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer_init(&p->em485->start_tx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); p->em485->stop_tx_timer.function = &serial8250_em485_handle_stop_tx; p->em485->start_tx_timer.function = &serial8250_em485_handle_start_tx; p->em485->port = p; p->em485->active_timer = NULL; p->em485->tx_stopped = true; deassert_rts: if (p->em485->tx_stopped) p->rs485_stop_tx(p); return 0; } /** * serial8250_em485_destroy() - put uart_8250_port into normal state * @p: uart_8250_port port instance * * The function is used to stop rs485 software emulating on the * &struct uart_8250_port* @p. The function is idempotent, so it is safe to * call it multiple times. * * The function is supposed to be called from .rs485_config callback * or from any other callback protected with p->port.lock spinlock. * * See also serial8250_em485_init() */ void serial8250_em485_destroy(struct uart_8250_port *p) { if (!p->em485) return; hrtimer_cancel(&p->em485->start_tx_timer); hrtimer_cancel(&p->em485->stop_tx_timer); kfree(p->em485); p->em485 = NULL; } EXPORT_SYMBOL_GPL(serial8250_em485_destroy); struct serial_rs485 serial8250_em485_supported = { .flags = SER_RS485_ENABLED | SER_RS485_RTS_ON_SEND | SER_RS485_RTS_AFTER_SEND | SER_RS485_TERMINATE_BUS | SER_RS485_RX_DURING_TX, .delay_rts_before_send = 1, .delay_rts_after_send = 1, }; EXPORT_SYMBOL_GPL(serial8250_em485_supported); /** * serial8250_em485_config() - generic ->rs485_config() callback * @port: uart port * @termios: termios structure * @rs485: rs485 settings * * Generic callback usable by 8250 uart drivers to activate rs485 settings * if the uart is incapable of driving RTS as a Transmit Enable signal in * hardware, relying on software emulation instead. */ int serial8250_em485_config(struct uart_port *port, struct ktermios *termios, struct serial_rs485 *rs485) { struct uart_8250_port *up = up_to_u8250p(port); /* * Both serial8250_em485_init() and serial8250_em485_destroy() * are idempotent. */ if (rs485->flags & SER_RS485_ENABLED) return serial8250_em485_init(up); serial8250_em485_destroy(up); return 0; } EXPORT_SYMBOL_GPL(serial8250_em485_config); /* * These two wrappers ensure that enable_runtime_pm_tx() can be called more than * once and disable_runtime_pm_tx() will still disable RPM because the fifo is * empty and the HW can idle again. */ void serial8250_rpm_get_tx(struct uart_8250_port *p) { unsigned char rpm_active; if (!(p->capabilities & UART_CAP_RPM)) return; rpm_active = xchg(&p->rpm_tx_active, 1); if (rpm_active) return; pm_runtime_get_sync(p->port.dev); } EXPORT_SYMBOL_GPL(serial8250_rpm_get_tx); void serial8250_rpm_put_tx(struct uart_8250_port *p) { unsigned char rpm_active; if (!(p->capabilities & UART_CAP_RPM)) return; rpm_active = xchg(&p->rpm_tx_active, 0); if (!rpm_active) return; pm_runtime_mark_last_busy(p->port.dev); pm_runtime_put_autosuspend(p->port.dev); } EXPORT_SYMBOL_GPL(serial8250_rpm_put_tx); /* * IER sleep support. UARTs which have EFRs need the "extended * capability" bit enabled. Note that on XR16C850s, we need to * reset LCR to write to IER. */ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) { unsigned char lcr = 0, efr = 0; serial8250_rpm_get(p); if (p->capabilities & UART_CAP_SLEEP) { /* Synchronize UART_IER access against the console. */ uart_port_lock_irq(&p->port); if (p->capabilities & UART_CAP_EFR) { lcr = serial_in(p, UART_LCR); efr = serial_in(p, UART_EFR); serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(p, UART_EFR, UART_EFR_ECB); serial_out(p, UART_LCR, 0); } serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0); if (p->capabilities & UART_CAP_EFR) { serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(p, UART_EFR, efr); serial_out(p, UART_LCR, lcr); } uart_port_unlock_irq(&p->port); } serial8250_rpm_put(p); } static void serial8250_clear_IER(struct uart_8250_port *up) { if (up->capabilities & UART_CAP_UUE) serial_out(up, UART_IER, UART_IER_UUE); else serial_out(up, UART_IER, 0); } #ifdef CONFIG_SERIAL_8250_RSA /* * Attempts to turn on the RSA FIFO. Returns zero on failure. * We set the port uart clock rate if we succeed. */ static int __enable_rsa(struct uart_8250_port *up) { unsigned char mode; int result; mode = serial_in(up, UART_RSA_MSR); result = mode & UART_RSA_MSR_FIFO; if (!result) { serial_out(up, UART_RSA_MSR, mode | UART_RSA_MSR_FIFO); mode = serial_in(up, UART_RSA_MSR); result = mode & UART_RSA_MSR_FIFO; } if (result) up->port.uartclk = SERIAL_RSA_BAUD_BASE * 16; return result; } static void enable_rsa(struct uart_8250_port *up) { if (up->port.type == PORT_RSA) { if (up->port.uartclk != SERIAL_RSA_BAUD_BASE * 16) { uart_port_lock_irq(&up->port); __enable_rsa(up); uart_port_unlock_irq(&up->port); } if (up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16) serial_out(up, UART_RSA_FRR, 0); } } /* * Attempts to turn off the RSA FIFO. Returns zero on failure. * It is unknown why interrupts were disabled in here. However, * the caller is expected to preserve this behaviour by grabbing * the spinlock before calling this function. */ static void disable_rsa(struct uart_8250_port *up) { unsigned char mode; int result; if (up->port.type == PORT_RSA && up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16) { uart_port_lock_irq(&up->port); mode = serial_in(up, UART_RSA_MSR); result = !(mode & UART_RSA_MSR_FIFO); if (!result) { serial_out(up, UART_RSA_MSR, mode & ~UART_RSA_MSR_FIFO); mode = serial_in(up, UART_RSA_MSR); result = !(mode & UART_RSA_MSR_FIFO); } if (result) up->port.uartclk = SERIAL_RSA_BAUD_BASE_LO * 16; uart_port_unlock_irq(&up->port); } } #endif /* CONFIG_SERIAL_8250_RSA */ /* * This is a quickie test to see how big the FIFO is. * It doesn't work at all the time, more's the pity. */ static int size_fifo(struct uart_8250_port *up) { unsigned char old_fcr, old_mcr, old_lcr; u32 old_dl; int count; old_lcr = serial_in(up, UART_LCR); serial_out(up, UART_LCR, 0); old_fcr = serial_in(up, UART_FCR); old_mcr = serial8250_in_MCR(up); serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); serial8250_out_MCR(up, UART_MCR_LOOP); serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); old_dl = serial_dl_read(up); serial_dl_write(up, 0x0001); serial_out(up, UART_LCR, UART_LCR_WLEN8); for (count = 0; count < 256; count++) serial_out(up, UART_TX, count); mdelay(20);/* FIXME - schedule_timeout */ for (count = 0; (serial_in(up, UART_LSR) & UART_LSR_DR) && (count < 256); count++) serial_in(up, UART_RX); serial_out(up, UART_FCR, old_fcr); serial8250_out_MCR(up, old_mcr); serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); serial_dl_write(up, old_dl); serial_out(up, UART_LCR, old_lcr); return count; } /* * Read UART ID using the divisor method - set DLL and DLM to zero * and the revision will be in DLL and device type in DLM. We * preserve the device state across this. */ static unsigned int autoconfig_read_divisor_id(struct uart_8250_port *p) { unsigned char old_lcr; unsigned int id, old_dl; old_lcr = serial_in(p, UART_LCR); serial_out(p, UART_LCR, UART_LCR_CONF_MODE_A); old_dl = serial_dl_read(p); serial_dl_write(p, 0); id = serial_dl_read(p); serial_dl_write(p, old_dl); serial_out(p, UART_LCR, old_lcr); return id; } /* * This is a helper routine to autodetect StarTech/Exar/Oxsemi UART's. * When this function is called we know it is at least a StarTech * 16650 V2, but it might be one of several StarTech UARTs, or one of * its clones. (We treat the broken original StarTech 16650 V1 as a * 16550, and why not? Startech doesn't seem to even acknowledge its * existence.) * * What evil have men's minds wrought... */ static void autoconfig_has_efr(struct uart_8250_port *up) { unsigned int id1, id2, id3, rev; /* * Everything with an EFR has SLEEP */ up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP; /* * First we check to see if it's an Oxford Semiconductor UART. * * If we have to do this here because some non-National * Semiconductor clone chips lock up if you try writing to the * LSR register (which serial_icr_read does) */ /* * Check for Oxford Semiconductor 16C950. * * EFR [4] must be set else this test fails. * * This shouldn't be necessary, but Mike Hudson (Exoray@isys.ca) * claims that it's needed for 952 dual UART's (which are not * recommended for new designs). */ up->acr = 0; serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(up, UART_EFR, UART_EFR_ECB); serial_out(up, UART_LCR, 0x00); id1 = serial_icr_read(up, UART_ID1); id2 = serial_icr_read(up, UART_ID2); id3 = serial_icr_read(up, UART_ID3); rev = serial_icr_read(up, UART_REV); DEBUG_AUTOCONF("950id=%02x:%02x:%02x:%02x ", id1, id2, id3, rev); if (id1 == 0x16 && id2 == 0xC9 && (id3 == 0x50 || id3 == 0x52 || id3 == 0x54)) { up->port.type = PORT_16C950; /* * Enable work around for the Oxford Semiconductor 952 rev B * chip which causes it to seriously miscalculate baud rates * when DLL is 0. */ if (id3 == 0x52 && rev == 0x01) up->bugs |= UART_BUG_QUOT; return; } /* * We check for a XR16C850 by setting DLL and DLM to 0, and then * reading back DLL and DLM. The chip type depends on the DLM * value read back: * 0x10 - XR16C850 and the DLL contains the chip revision. * 0x12 - XR16C2850. * 0x14 - XR16C854. */ id1 = autoconfig_read_divisor_id(up); DEBUG_AUTOCONF("850id=%04x ", id1); id2 = id1 >> 8; if (id2 == 0x10 || id2 == 0x12 || id2 == 0x14) { up->port.type = PORT_16850; return; } /* * It wasn't an XR16C850. * * We distinguish between the '654 and the '650 by counting * how many bytes are in the FIFO. I'm using this for now, * since that's the technique that was sent to me in the * serial driver update, but I'm not convinced this works. * I've had problems doing this in the past. -TYT */ if (size_fifo(up) == 64) up->port.type = PORT_16654; else up->port.type = PORT_16650V2; } /* * We detected a chip without a FIFO. Only two fall into * this category - the original 8250 and the 16450. The * 16450 has a scratch register (accessible with LCR=0) */ static void autoconfig_8250(struct uart_8250_port *up) { unsigned char scratch, status1, status2; up->port.type = PORT_8250; scratch = serial_in(up, UART_SCR); serial_out(up, UART_SCR, 0xa5); status1 = serial_in(up, UART_SCR); serial_out(up, UART_SCR, 0x5a); status2 = serial_in(up, UART_SCR); serial_out(up, UART_SCR, scratch); if (status1 == 0xa5 && status2 == 0x5a) up->port.type = PORT_16450; } static int broken_efr(struct uart_8250_port *up) { /* * Exar ST16C2550 "A2" devices incorrectly detect as * having an EFR, and report an ID of 0x0201. See * http://linux.derkeiler.com/Mailing-Lists/Kernel/2004-11/4812.html */ if (autoconfig_read_divisor_id(up) == 0x0201 && size_fifo(up) == 16) return 1; return 0; } /* * We know that the chip has FIFOs. Does it have an EFR? The * EFR is located in the same register position as the IIR and * we know the top two bits of the IIR are currently set. The * EFR should contain zero. Try to read the EFR. */ static void autoconfig_16550a(struct uart_8250_port *up) { unsigned char status1, status2; unsigned int iersave; /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&up->port.lock); up->port.type = PORT_16550A; up->capabilities |= UART_CAP_FIFO; if (!IS_ENABLED(CONFIG_SERIAL_8250_16550A_VARIANTS) && !(up->port.flags & UPF_FULL_PROBE)) return; /* * Check for presence of the EFR when DLAB is set. * Only ST16C650V1 UARTs pass this test. */ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); if (serial_in(up, UART_EFR) == 0) { serial_out(up, UART_EFR, 0xA8); if (serial_in(up, UART_EFR) != 0) { DEBUG_AUTOCONF("EFRv1 "); up->port.type = PORT_16650; up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP; } else { serial_out(up, UART_LCR, 0); serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE); status1 = serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED_16750; serial_out(up, UART_FCR, 0); serial_out(up, UART_LCR, 0); if (status1 == UART_IIR_FIFO_ENABLED_16750) up->port.type = PORT_16550A_FSL64; else DEBUG_AUTOCONF("Motorola 8xxx DUART "); } serial_out(up, UART_EFR, 0); return; } /* * Maybe it requires 0xbf to be written to the LCR. * (other ST16C650V2 UARTs, TI16C752A, etc) */ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); if (serial_in(up, UART_EFR) == 0 && !broken_efr(up)) { DEBUG_AUTOCONF("EFRv2 "); autoconfig_has_efr(up); return; } /* * Check for a National Semiconductor SuperIO chip. * Attempt to switch to bank 2, read the value of the LOOP bit * from EXCR1. Switch back to bank 0, change it in MCR. Then * switch back to bank 2, read it from EXCR1 again and check * it's changed. If so, set baud_base in EXCR2 to 921600. -- dwmw2 */ serial_out(up, UART_LCR, 0); status1 = serial8250_in_MCR(up); serial_out(up, UART_LCR, 0xE0); status2 = serial_in(up, 0x02); /* EXCR1 */ if (!((status2 ^ status1) & UART_MCR_LOOP)) { serial_out(up, UART_LCR, 0); serial8250_out_MCR(up, status1 ^ UART_MCR_LOOP); serial_out(up, UART_LCR, 0xE0); status2 = serial_in(up, 0x02); /* EXCR1 */ serial_out(up, UART_LCR, 0); serial8250_out_MCR(up, status1); if ((status2 ^ status1) & UART_MCR_LOOP) { unsigned short quot; serial_out(up, UART_LCR, 0xE0); quot = serial_dl_read(up); quot <<= 3; if (ns16550a_goto_highspeed(up)) serial_dl_write(up, quot); serial_out(up, UART_LCR, 0); up->port.uartclk = 921600*16; up->port.type = PORT_NS16550A; up->capabilities |= UART_NATSEMI; return; } } /* * No EFR. Try to detect a TI16750, which only sets bit 5 of * the IIR when 64 byte FIFO mode is enabled when DLAB is set. * Try setting it with and without DLAB set. Cheap clones * set bit 5 without DLAB set. */ serial_out(up, UART_LCR, 0); serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE); status1 = serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED_16750; serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE); status2 = serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED_16750; serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); serial_out(up, UART_LCR, 0); DEBUG_AUTOCONF("iir1=%d iir2=%d ", status1, status2); if (status1 == UART_IIR_FIFO_ENABLED_16550A && status2 == UART_IIR_FIFO_ENABLED_16750) { up->port.type = PORT_16750; up->capabilities |= UART_CAP_AFE | UART_CAP_SLEEP; return; } /* * Try writing and reading the UART_IER_UUE bit (b6). * If it works, this is probably one of the Xscale platform's * internal UARTs. * We're going to explicitly set the UUE bit to 0 before * trying to write and read a 1 just to make sure it's not * already a 1 and maybe locked there before we even start. */ iersave = serial_in(up, UART_IER); serial_out(up, UART_IER, iersave & ~UART_IER_UUE); if (!(serial_in(up, UART_IER) & UART_IER_UUE)) { /* * OK it's in a known zero state, try writing and reading * without disturbing the current state of the other bits. */ serial_out(up, UART_IER, iersave | UART_IER_UUE); if (serial_in(up, UART_IER) & UART_IER_UUE) { /* * It's an Xscale. * We'll leave the UART_IER_UUE bit set to 1 (enabled). */ DEBUG_AUTOCONF("Xscale "); up->port.type = PORT_XSCALE; up->capabilities |= UART_CAP_UUE | UART_CAP_RTOIE; return; } } else { /* * If we got here we couldn't force the IER_UUE bit to 0. * Log it and continue. */ DEBUG_AUTOCONF("Couldn't force IER_UUE to 0 "); } serial_out(up, UART_IER, iersave); /* * We distinguish between 16550A and U6 16550A by counting * how many bytes are in the FIFO. */ if (up->port.type == PORT_16550A && size_fifo(up) == 64) { up->port.type = PORT_U6_16550A; up->capabilities |= UART_CAP_AFE; } } /* * This routine is called by rs_init() to initialize a specific serial * port. It determines what type of UART chip this serial port is * using: 8250, 16450, 16550, 16550A. The important question is * whether or not this UART is a 16550A or not, since this will * determine whether or not we can use its FIFO features or not. */ static void autoconfig(struct uart_8250_port *up) { unsigned char status1, scratch, scratch2, scratch3; unsigned char save_lcr, save_mcr; struct uart_port *port = &up->port; unsigned long flags; unsigned int old_capabilities; if (!port->iobase && !port->mapbase && !port->membase) return; DEBUG_AUTOCONF("%s: autoconf (0x%04lx, 0x%p): ", port->name, port->iobase, port->membase); /* * We really do need global IRQs disabled here - we're going to * be frobbing the chips IRQ enable register to see if it exists. * * Synchronize UART_IER access against the console. */ uart_port_lock_irqsave(port, &flags); up->capabilities = 0; up->bugs = 0; if (!(port->flags & UPF_BUGGY_UART)) { /* * Do a simple existence test first; if we fail this, * there's no point trying anything else. * * 0x80 is used as a nonsense port to prevent against * false positives due to ISA bus float. The * assumption is that 0x80 is a non-existent port; * which should be safe since include/asm/io.h also * makes this assumption. * * Note: this is safe as long as MCR bit 4 is clear * and the device is in "PC" mode. */ scratch = serial_in(up, UART_IER); serial_out(up, UART_IER, 0); #if defined(__i386__) && defined(CONFIG_HAS_IOPORT) outb(0xff, 0x080); #endif /* * Mask out IER[7:4] bits for test as some UARTs (e.g. TL * 16C754B) allow only to modify them if an EFR bit is set. */ scratch2 = serial_in(up, UART_IER) & UART_IER_ALL_INTR; serial_out(up, UART_IER, UART_IER_ALL_INTR); #if defined(__i386__) && defined(CONFIG_HAS_IOPORT) outb(0, 0x080); #endif scratch3 = serial_in(up, UART_IER) & UART_IER_ALL_INTR; serial_out(up, UART_IER, scratch); if (scratch2 != 0 || scratch3 != UART_IER_ALL_INTR) { /* * We failed; there's nothing here */ uart_port_unlock_irqrestore(port, flags); DEBUG_AUTOCONF("IER test failed (%02x, %02x) ", scratch2, scratch3); goto out; } } save_mcr = serial8250_in_MCR(up); save_lcr = serial_in(up, UART_LCR); /* * Check to see if a UART is really there. Certain broken * internal modems based on the Rockwell chipset fail this * test, because they apparently don't implement the loopback * test mode. So this test is skipped on the COM 1 through * COM 4 ports. This *should* be safe, since no board * manufacturer would be stupid enough to design a board * that conflicts with COM 1-4 --- we hope! */ if (!(port->flags & UPF_SKIP_TEST)) { serial8250_out_MCR(up, UART_MCR_LOOP | UART_MCR_OUT2 | UART_MCR_RTS); status1 = serial_in(up, UART_MSR) & UART_MSR_STATUS_BITS; serial8250_out_MCR(up, save_mcr); if (status1 != (UART_MSR_DCD | UART_MSR_CTS)) { uart_port_unlock_irqrestore(port, flags); DEBUG_AUTOCONF("LOOP test failed (%02x) ", status1); goto out; } } /* * We're pretty sure there's a port here. Lets find out what * type of port it is. The IIR top two bits allows us to find * out if it's 8250 or 16450, 16550, 16550A or later. This * determines what we test for next. * * We also initialise the EFR (if any) to zero for later. The * EFR occupies the same register location as the FCR and IIR. */ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(up, UART_EFR, 0); serial_out(up, UART_LCR, 0); serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); switch (serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED) { case UART_IIR_FIFO_ENABLED_8250: autoconfig_8250(up); break; case UART_IIR_FIFO_ENABLED_16550: port->type = PORT_16550; break; case UART_IIR_FIFO_ENABLED_16550A: autoconfig_16550a(up); break; default: port->type = PORT_UNKNOWN; break; } #ifdef CONFIG_SERIAL_8250_RSA /* * Only probe for RSA ports if we got the region. */ if (port->type == PORT_16550A && up->probe & UART_PROBE_RSA && __enable_rsa(up)) port->type = PORT_RSA; #endif serial_out(up, UART_LCR, save_lcr); port->fifosize = uart_config[up->port.type].fifo_size; old_capabilities = up->capabilities; up->capabilities = uart_config[port->type].flags; up->tx_loadsz = uart_config[port->type].tx_loadsz; if (port->type == PORT_UNKNOWN) goto out_unlock; /* * Reset the UART. */ #ifdef CONFIG_SERIAL_8250_RSA if (port->type == PORT_RSA) serial_out(up, UART_RSA_FRR, 0); #endif serial8250_out_MCR(up, save_mcr); serial8250_clear_fifos(up); serial_in(up, UART_RX); serial8250_clear_IER(up); out_unlock: uart_port_unlock_irqrestore(port, flags); /* * Check if the device is a Fintek F81216A */ if (port->type == PORT_16550A && port->iotype == UPIO_PORT) fintek_8250_probe(up); if (up->capabilities != old_capabilities) { dev_warn(port->dev, "detected caps %08x should be %08x\n", old_capabilities, up->capabilities); } out: DEBUG_AUTOCONF("iir=%d ", scratch); DEBUG_AUTOCONF("type=%s\n", uart_config[port->type].name); } static void autoconfig_irq(struct uart_8250_port *up) { struct uart_port *port = &up->port; unsigned char save_mcr, save_ier; unsigned char save_ICP = 0; unsigned int ICP = 0; unsigned long irqs; int irq; if (port->flags & UPF_FOURPORT) { ICP = (port->iobase & 0xfe0) | 0x1f; save_ICP = inb_p(ICP); outb_p(0x80, ICP); inb_p(ICP); } /* forget possible initially masked and pending IRQ */ probe_irq_off(probe_irq_on()); save_mcr = serial8250_in_MCR(up); /* Synchronize UART_IER access against the console. */ uart_port_lock_irq(port); save_ier = serial_in(up, UART_IER); uart_port_unlock_irq(port); serial8250_out_MCR(up, UART_MCR_OUT1 | UART_MCR_OUT2); irqs = probe_irq_on(); serial8250_out_MCR(up, 0); udelay(10); if (port->flags & UPF_FOURPORT) { serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS); } else { serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS | UART_MCR_OUT2); } /* Synchronize UART_IER access against the console. */ uart_port_lock_irq(port); serial_out(up, UART_IER, UART_IER_ALL_INTR); uart_port_unlock_irq(port); serial_in(up, UART_LSR); serial_in(up, UART_RX); serial_in(up, UART_IIR); serial_in(up, UART_MSR); serial_out(up, UART_TX, 0xFF); udelay(20); irq = probe_irq_off(irqs); serial8250_out_MCR(up, save_mcr); /* Synchronize UART_IER access against the console. */ uart_port_lock_irq(port); serial_out(up, UART_IER, save_ier); uart_port_unlock_irq(port); if (port->flags & UPF_FOURPORT) outb_p(save_ICP, ICP); port->irq = (irq > 0) ? irq : 0; } static void serial8250_stop_rx(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&port->lock); serial8250_rpm_get(up); up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); up->port.read_status_mask &= ~UART_LSR_DR; serial_port_out(port, UART_IER, up->ier); serial8250_rpm_put(up); } /** * serial8250_em485_stop_tx() - generic ->rs485_stop_tx() callback * @p: uart 8250 port * * Generic callback usable by 8250 uart drivers to stop rs485 transmission. */ void serial8250_em485_stop_tx(struct uart_8250_port *p) { unsigned char mcr = serial8250_in_MCR(p); /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&p->port.lock); if (p->port.rs485.flags & SER_RS485_RTS_AFTER_SEND) mcr |= UART_MCR_RTS; else mcr &= ~UART_MCR_RTS; serial8250_out_MCR(p, mcr); /* * Empty the RX FIFO, we are not interested in anything * received during the half-duplex transmission. * Enable previously disabled RX interrupts. */ if (!(p->port.rs485.flags & SER_RS485_RX_DURING_TX)) { serial8250_clear_and_reinit_fifos(p); p->ier |= UART_IER_RLSI | UART_IER_RDI; serial_port_out(&p->port, UART_IER, p->ier); } } EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t) { struct uart_8250_em485 *em485 = container_of(t, struct uart_8250_em485, stop_tx_timer); struct uart_8250_port *p = em485->port; unsigned long flags; serial8250_rpm_get(p); uart_port_lock_irqsave(&p->port, &flags); if (em485->active_timer == &em485->stop_tx_timer) { p->rs485_stop_tx(p); em485->active_timer = NULL; em485->tx_stopped = true; } uart_port_unlock_irqrestore(&p->port, flags); serial8250_rpm_put(p); return HRTIMER_NORESTART; } static void start_hrtimer_ms(struct hrtimer *hrt, unsigned long msec) { hrtimer_start(hrt, ms_to_ktime(msec), HRTIMER_MODE_REL); } static void __stop_tx_rs485(struct uart_8250_port *p, u64 stop_delay) { struct uart_8250_em485 *em485 = p->em485; /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&p->port.lock); stop_delay += (u64)p->port.rs485.delay_rts_after_send * NSEC_PER_MSEC; /* * rs485_stop_tx() is going to set RTS according to config * AND flush RX FIFO if required. */ if (stop_delay > 0) { em485->active_timer = &em485->stop_tx_timer; hrtimer_start(&em485->stop_tx_timer, ns_to_ktime(stop_delay), HRTIMER_MODE_REL); } else { p->rs485_stop_tx(p); em485->active_timer = NULL; em485->tx_stopped = true; } } static inline void __stop_tx(struct uart_8250_port *p) { struct uart_8250_em485 *em485 = p->em485; if (em485) { u16 lsr = serial_lsr_in(p); u64 stop_delay = 0; if (!(lsr & UART_LSR_THRE)) return; /* * To provide required timing and allow FIFO transfer, * __stop_tx_rs485() must be called only when both FIFO and * shift register are empty. The device driver should either * enable interrupt on TEMT or set UART_CAP_NOTEMT that will * enlarge stop_tx_timer by the tx time of one frame to cover * for emptying of the shift register. */ if (!(lsr & UART_LSR_TEMT)) { if (!(p->capabilities & UART_CAP_NOTEMT)) return; /* * RTS might get deasserted too early with the normal * frame timing formula. It seems to suggest THRE might * get asserted already during tx of the stop bit * rather than after it is fully sent. * Roughly estimate 1 extra bit here with / 7. */ stop_delay = p->port.frame_time + DIV_ROUND_UP(p->port.frame_time, 7); } __stop_tx_rs485(p, stop_delay); } if (serial8250_clear_THRI(p)) serial8250_rpm_put_tx(p); } static void serial8250_stop_tx(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); serial8250_rpm_get(up); __stop_tx(up); /* * We really want to stop the transmitter from sending. */ if (port->type == PORT_16C950) { up->acr |= UART_ACR_TXDIS; serial_icr_write(up, UART_ACR, up->acr); } serial8250_rpm_put(up); } static inline void __start_tx(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); if (up->dma && !up->dma->tx_dma(up)) return; if (serial8250_set_THRI(up)) { if (up->bugs & UART_BUG_TXEN) { u16 lsr = serial_lsr_in(up); if (lsr & UART_LSR_THRE) serial8250_tx_chars(up); } } /* * Re-enable the transmitter if we disabled it. */ if (port->type == PORT_16C950 && up->acr & UART_ACR_TXDIS) { up->acr &= ~UART_ACR_TXDIS; serial_icr_write(up, UART_ACR, up->acr); } } /** * serial8250_em485_start_tx() - generic ->rs485_start_tx() callback * @up: uart 8250 port * * Generic callback usable by 8250 uart drivers to start rs485 transmission. * Assumes that setting the RTS bit in the MCR register means RTS is high. * (Some chips use inverse semantics.) Further assumes that reception is * stoppable by disabling the UART_IER_RDI interrupt. (Some chips set the * UART_LSR_DR bit even when UART_IER_RDI is disabled, foiling this approach.) */ void serial8250_em485_start_tx(struct uart_8250_port *up) { unsigned char mcr = serial8250_in_MCR(up); if (!(up->port.rs485.flags & SER_RS485_RX_DURING_TX)) serial8250_stop_rx(&up->port); if (up->port.rs485.flags & SER_RS485_RTS_ON_SEND) mcr |= UART_MCR_RTS; else mcr &= ~UART_MCR_RTS; serial8250_out_MCR(up, mcr); } EXPORT_SYMBOL_GPL(serial8250_em485_start_tx); /* Returns false, if start_tx_timer was setup to defer TX start */ static bool start_tx_rs485(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); struct uart_8250_em485 *em485 = up->em485; /* * While serial8250_em485_handle_stop_tx() is a noop if * em485->active_timer != &em485->stop_tx_timer, it might happen that * the timer is still armed and triggers only after the current bunch of * chars is send and em485->active_timer == &em485->stop_tx_timer again. * So cancel the timer. There is still a theoretical race condition if * the timer is already running and only comes around to check for * em485->active_timer when &em485->stop_tx_timer is armed again. */ if (em485->active_timer == &em485->stop_tx_timer) hrtimer_try_to_cancel(&em485->stop_tx_timer); em485->active_timer = NULL; if (em485->tx_stopped) { em485->tx_stopped = false; up->rs485_start_tx(up); if (up->port.rs485.delay_rts_before_send > 0) { em485->active_timer = &em485->start_tx_timer; start_hrtimer_ms(&em485->start_tx_timer, up->port.rs485.delay_rts_before_send); return false; } } return true; } static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t) { struct uart_8250_em485 *em485 = container_of(t, struct uart_8250_em485, start_tx_timer); struct uart_8250_port *p = em485->port; unsigned long flags; uart_port_lock_irqsave(&p->port, &flags); if (em485->active_timer == &em485->start_tx_timer) { __start_tx(&p->port); em485->active_timer = NULL; } uart_port_unlock_irqrestore(&p->port, flags); return HRTIMER_NORESTART; } static void serial8250_start_tx(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); struct uart_8250_em485 *em485 = up->em485; /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&port->lock); if (!port->x_char && kfifo_is_empty(&port->state->port.xmit_fifo)) return; serial8250_rpm_get_tx(up); if (em485) { if ((em485->active_timer == &em485->start_tx_timer) || !start_tx_rs485(port)) return; } __start_tx(port); } static void serial8250_throttle(struct uart_port *port) { port->throttle(port); } static void serial8250_unthrottle(struct uart_port *port) { port->unthrottle(port); } static void serial8250_disable_ms(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&port->lock); /* no MSR capabilities */ if (up->bugs & UART_BUG_NOMSR) return; mctrl_gpio_disable_ms(up->gpios); up->ier &= ~UART_IER_MSI; serial_port_out(port, UART_IER, up->ier); } static void serial8250_enable_ms(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&port->lock); /* no MSR capabilities */ if (up->bugs & UART_BUG_NOMSR) return; mctrl_gpio_enable_ms(up->gpios); up->ier |= UART_IER_MSI; serial8250_rpm_get(up); serial_port_out(port, UART_IER, up->ier); serial8250_rpm_put(up); } void serial8250_read_char(struct uart_8250_port *up, u16 lsr) { struct uart_port *port = &up->port; u8 ch, flag = TTY_NORMAL; if (likely(lsr & UART_LSR_DR)) ch = serial_in(up, UART_RX); else /* * Intel 82571 has a Serial Over Lan device that will * set UART_LSR_BI without setting UART_LSR_DR when * it receives a break. To avoid reading from the * receive buffer without UART_LSR_DR bit set, we * just force the read character to be 0 */ ch = 0; port->icount.rx++; lsr |= up->lsr_saved_flags; up->lsr_saved_flags = 0; if (unlikely(lsr & UART_LSR_BRK_ERROR_BITS)) { if (lsr & UART_LSR_BI) { lsr &= ~(UART_LSR_FE | UART_LSR_PE); port->icount.brk++; /* * We do the SysRQ and SAK checking * here because otherwise the break * may get masked by ignore_status_mask * or read_status_mask. */ if (uart_handle_break(port)) return; } else if (lsr & UART_LSR_PE) port->icount.parity++; else if (lsr & UART_LSR_FE) port->icount.frame++; if (lsr & UART_LSR_OE) port->icount.overrun++; /* * Mask off conditions which should be ignored. */ lsr &= port->read_status_mask; if (lsr & UART_LSR_BI) { dev_dbg(port->dev, "handling break\n"); flag = TTY_BREAK; } else if (lsr & UART_LSR_PE) flag = TTY_PARITY; else if (lsr & UART_LSR_FE) flag = TTY_FRAME; } if (uart_prepare_sysrq_char(port, ch)) return; uart_insert_char(port, lsr, UART_LSR_OE, ch, flag); } EXPORT_SYMBOL_GPL(serial8250_read_char); /* * serial8250_rx_chars - Read characters. The first LSR value must be passed in. * * Returns LSR bits. The caller should rely only on non-Rx related LSR bits * (such as THRE) because the LSR value might come from an already consumed * character. */ u16 serial8250_rx_chars(struct uart_8250_port *up, u16 lsr) { struct uart_port *port = &up->port; int max_count = 256; do { serial8250_read_char(up, lsr); if (--max_count == 0) break; lsr = serial_in(up, UART_LSR); } while (lsr & (UART_LSR_DR | UART_LSR_BI)); tty_flip_buffer_push(&port->state->port); return lsr; } EXPORT_SYMBOL_GPL(serial8250_rx_chars); void serial8250_tx_chars(struct uart_8250_port *up) { struct uart_port *port = &up->port; struct tty_port *tport = &port->state->port; int count; if (port->x_char) { uart_xchar_out(port, UART_TX); return; } if (uart_tx_stopped(port)) { serial8250_stop_tx(port); return; } if (kfifo_is_empty(&tport->xmit_fifo)) { __stop_tx(up); return; } count = up->tx_loadsz; do { unsigned char c; if (!uart_fifo_get(port, &c)) break; serial_out(up, UART_TX, c); if (up->bugs & UART_BUG_TXRACE) { /* * The Aspeed BMC virtual UARTs have a bug where data * may get stuck in the BMC's Tx FIFO from bursts of * writes on the APB interface. * * Delay back-to-back writes by a read cycle to avoid * stalling the VUART. Read a register that won't have * side-effects and discard the result. */ serial_in(up, UART_SCR); } if ((up->capabilities & UART_CAP_HFIFO) && !uart_lsr_tx_empty(serial_in(up, UART_LSR))) break; /* The BCM2835 MINI UART THRE bit is really a not-full bit. */ if ((up->capabilities & UART_CAP_MINI) && !(serial_in(up, UART_LSR) & UART_LSR_THRE)) break; } while (--count > 0); if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS) uart_write_wakeup(port); /* * With RPM enabled, we have to wait until the FIFO is empty before the * HW can go idle. So we get here once again with empty FIFO and disable * the interrupt and RPM in __stop_tx() */ if (kfifo_is_empty(&tport->xmit_fifo) && !(up->capabilities & UART_CAP_RPM)) __stop_tx(up); } EXPORT_SYMBOL_GPL(serial8250_tx_chars); /* Caller holds uart port lock */ unsigned int serial8250_modem_status(struct uart_8250_port *up) { struct uart_port *port = &up->port; unsigned int status = serial_in(up, UART_MSR); status |= up->msr_saved_flags; up->msr_saved_flags = 0; if (status & UART_MSR_ANY_DELTA && up->ier & UART_IER_MSI && port->state != NULL) { if (status & UART_MSR_TERI) port->icount.rng++; if (status & UART_MSR_DDSR) port->icount.dsr++; if (status & UART_MSR_DDCD) uart_handle_dcd_change(port, status & UART_MSR_DCD); if (status & UART_MSR_DCTS) uart_handle_cts_change(port, status & UART_MSR_CTS); wake_up_interruptible(&port->state->port.delta_msr_wait); } return status; } EXPORT_SYMBOL_GPL(serial8250_modem_status); static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir) { switch (iir & 0x3f) { case UART_IIR_THRI: /* * Postpone DMA or not decision to IIR_RDI or IIR_RX_TIMEOUT * because it's impossible to do an informed decision about * that with IIR_THRI. * * This also fixes one known DMA Rx corruption issue where * DR is asserted but DMA Rx only gets a corrupted zero byte * (too early DR?). */ return false; case UART_IIR_RDI: if (!up->dma->rx_running) break; fallthrough; case UART_IIR_RLSI: case UART_IIR_RX_TIMEOUT: serial8250_rx_dma_flush(up); return true; } return up->dma->rx_dma(up); } /* * This handles the interrupt from one port. */ int serial8250_handle_irq(struct uart_port *port, unsigned int iir) { struct uart_8250_port *up = up_to_u8250p(port); struct tty_port *tport = &port->state->port; bool skip_rx = false; unsigned long flags; u16 status; if (iir & UART_IIR_NO_INT) return 0; uart_port_lock_irqsave(port, &flags); status = serial_lsr_in(up); /* * If port is stopped and there are no error conditions in the * FIFO, then don't drain the FIFO, as this may lead to TTY buffer * overflow. Not servicing, RX FIFO would trigger auto HW flow * control when FIFO occupancy reaches preset threshold, thus * halting RX. This only works when auto HW flow control is * available. */ if (!(status & (UART_LSR_FIFOE | UART_LSR_BRK_ERROR_BITS)) && (port->status & (UPSTAT_AUTOCTS | UPSTAT_AUTORTS)) && !(port->read_status_mask & UART_LSR_DR)) skip_rx = true; if (status & (UART_LSR_DR | UART_LSR_BI) && !skip_rx) { struct irq_data *d; d = irq_get_irq_data(port->irq); if (d && irqd_is_wakeup_set(d)) pm_wakeup_event(tport->tty->dev, 0); if (!up->dma || handle_rx_dma(up, iir)) status = serial8250_rx_chars(up, status); } serial8250_modem_status(up); if ((status & UART_LSR_THRE) && (up->ier & UART_IER_THRI)) { if (!up->dma || up->dma->tx_err) serial8250_tx_chars(up); else if (!up->dma->tx_running) __stop_tx(up); } uart_unlock_and_check_sysrq_irqrestore(port, flags); return 1; } EXPORT_SYMBOL_GPL(serial8250_handle_irq); static int serial8250_default_handle_irq(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); unsigned int iir; int ret; serial8250_rpm_get(up); iir = serial_port_in(port, UART_IIR); ret = serial8250_handle_irq(port, iir); serial8250_rpm_put(up); return ret; } /* * Newer 16550 compatible parts such as the SC16C650 & Altera 16550 Soft IP * have a programmable TX threshold that triggers the THRE interrupt in * the IIR register. In this case, the THRE interrupt indicates the FIFO * has space available. Load it up with tx_loadsz bytes. */ static int serial8250_tx_threshold_handle_irq(struct uart_port *port) { unsigned long flags; unsigned int iir = serial_port_in(port, UART_IIR); /* TX Threshold IRQ triggered so load up FIFO */ if ((iir & UART_IIR_ID) == UART_IIR_THRI) { struct uart_8250_port *up = up_to_u8250p(port); uart_port_lock_irqsave(port, &flags); serial8250_tx_chars(up); uart_port_unlock_irqrestore(port, flags); } iir = serial_port_in(port, UART_IIR); return serial8250_handle_irq(port, iir); } static unsigned int serial8250_tx_empty(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); unsigned int result = 0; unsigned long flags; serial8250_rpm_get(up); uart_port_lock_irqsave(port, &flags); if (!serial8250_tx_dma_running(up) && uart_lsr_tx_empty(serial_lsr_in(up))) result = TIOCSER_TEMT; uart_port_unlock_irqrestore(port, flags); serial8250_rpm_put(up); return result; } unsigned int serial8250_do_get_mctrl(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); unsigned int status; unsigned int val; serial8250_rpm_get(up); status = serial8250_modem_status(up); serial8250_rpm_put(up); val = serial8250_MSR_to_TIOCM(status); if (up->gpios) return mctrl_gpio_get(up->gpios, &val); return val; } EXPORT_SYMBOL_GPL(serial8250_do_get_mctrl); static unsigned int serial8250_get_mctrl(struct uart_port *port) { if (port->get_mctrl) return port->get_mctrl(port); return serial8250_do_get_mctrl(port); } void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl) { struct uart_8250_port *up = up_to_u8250p(port); unsigned char mcr; mcr = serial8250_TIOCM_to_MCR(mctrl); mcr |= up->mcr; serial8250_out_MCR(up, mcr); } EXPORT_SYMBOL_GPL(serial8250_do_set_mctrl); static void serial8250_set_mctrl(struct uart_port *port, unsigned int mctrl) { if (port->rs485.flags & SER_RS485_ENABLED) return; if (port->set_mctrl) port->set_mctrl(port, mctrl); else serial8250_do_set_mctrl(port, mctrl); } static void serial8250_break_ctl(struct uart_port *port, int break_state) { struct uart_8250_port *up = up_to_u8250p(port); unsigned long flags; serial8250_rpm_get(up); uart_port_lock_irqsave(port, &flags); if (break_state == -1) up->lcr |= UART_LCR_SBC; else up->lcr &= ~UART_LCR_SBC; serial_port_out(port, UART_LCR, up->lcr); uart_port_unlock_irqrestore(port, flags); serial8250_rpm_put(up); } static void wait_for_lsr(struct uart_8250_port *up, int bits) { unsigned int status, tmout = 10000; /* Wait up to 10ms for the character(s) to be sent. */ for (;;) { status = serial_lsr_in(up); if ((status & bits) == bits) break; if (--tmout == 0) break; udelay(1); touch_nmi_watchdog(); } } /* * Wait for transmitter & holding register to empty */ static void wait_for_xmitr(struct uart_8250_port *up, int bits) { unsigned int tmout; wait_for_lsr(up, bits); /* Wait up to 1s for flow control if necessary */ if (up->port.flags & UPF_CONS_FLOW) { for (tmout = 1000000; tmout; tmout--) { unsigned int msr = serial_in(up, UART_MSR); up->msr_saved_flags |= msr & MSR_SAVE_FLAGS; if (msr & UART_MSR_CTS) break; udelay(1); touch_nmi_watchdog(); } } } #ifdef CONFIG_CONSOLE_POLL /* * Console polling routines for writing and reading from the uart while * in an interrupt or debug context. */ static int serial8250_get_poll_char(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); int status; u16 lsr; serial8250_rpm_get(up); lsr = serial_port_in(port, UART_LSR); if (!(lsr & UART_LSR_DR)) { status = NO_POLL_CHAR; goto out; } status = serial_port_in(port, UART_RX); out: serial8250_rpm_put(up); return status; } static void serial8250_put_poll_char(struct uart_port *port, unsigned char c) { unsigned int ier; struct uart_8250_port *up = up_to_u8250p(port); /* * Normally the port is locked to synchronize UART_IER access * against the console. However, this function is only used by * KDB/KGDB, where it may not be possible to acquire the port * lock because all other CPUs are quiesced. The quiescence * should allow safe lockless usage here. */ serial8250_rpm_get(up); /* * First save the IER then disable the interrupts */ ier = serial_port_in(port, UART_IER); serial8250_clear_IER(up); wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); /* * Send the character out. */ serial_port_out(port, UART_TX, c); /* * Finally, wait for transmitter to become empty * and restore the IER */ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); serial_port_out(port, UART_IER, ier); serial8250_rpm_put(up); } #endif /* CONFIG_CONSOLE_POLL */ int serial8250_do_startup(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); unsigned long flags; unsigned char iir; int retval; u16 lsr; if (!port->fifosize) port->fifosize = uart_config[port->type].fifo_size; if (!up->tx_loadsz) up->tx_loadsz = uart_config[port->type].tx_loadsz; if (!up->capabilities) up->capabilities = uart_config[port->type].flags; up->mcr = 0; if (port->iotype != up->cur_iotype) set_io_from_upio(port); serial8250_rpm_get(up); if (port->type == PORT_16C950) { /* * Wake up and initialize UART * * Synchronize UART_IER access against the console. */ uart_port_lock_irqsave(port, &flags); up->acr = 0; serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); serial_port_out(port, UART_EFR, UART_EFR_ECB); serial_port_out(port, UART_IER, 0); serial_port_out(port, UART_LCR, 0); serial_icr_write(up, UART_CSR, 0); /* Reset the UART */ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); serial_port_out(port, UART_EFR, UART_EFR_ECB); serial_port_out(port, UART_LCR, 0); uart_port_unlock_irqrestore(port, flags); } if (port->type == PORT_DA830) { /* * Reset the port * * Synchronize UART_IER access against the console. */ uart_port_lock_irqsave(port, &flags); serial_port_out(port, UART_IER, 0); serial_port_out(port, UART_DA830_PWREMU_MGMT, 0); uart_port_unlock_irqrestore(port, flags); mdelay(10); /* Enable Tx, Rx and free run mode */ serial_port_out(port, UART_DA830_PWREMU_MGMT, UART_DA830_PWREMU_MGMT_UTRST | UART_DA830_PWREMU_MGMT_URRST | UART_DA830_PWREMU_MGMT_FREE); } #ifdef CONFIG_SERIAL_8250_RSA /* * If this is an RSA port, see if we can kick it up to the * higher speed clock. */ enable_rsa(up); #endif /* * Clear the FIFO buffers and disable them. * (they will be reenabled in set_termios()) */ serial8250_clear_fifos(up); /* * Clear the interrupt registers. */ serial_port_in(port, UART_LSR); serial_port_in(port, UART_RX); serial_port_in(port, UART_IIR); serial_port_in(port, UART_MSR); /* * At this point, there's no way the LSR could still be 0xff; * if it is, then bail out, because there's likely no UART * here. */ if (!(port->flags & UPF_BUGGY_UART) && (serial_port_in(port, UART_LSR) == 0xff)) { dev_info_ratelimited(port->dev, "LSR safety check engaged!\n"); retval = -ENODEV; goto out; } /* * For a XR16C850, we need to set the trigger levels */ if (port->type == PORT_16850) { unsigned char fctr; serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); fctr = serial_in(up, UART_FCTR) & ~(UART_FCTR_RX|UART_FCTR_TX); serial_port_out(port, UART_FCTR, fctr | UART_FCTR_TRGD | UART_FCTR_RX); serial_port_out(port, UART_TRG, UART_TRG_96); serial_port_out(port, UART_FCTR, fctr | UART_FCTR_TRGD | UART_FCTR_TX); serial_port_out(port, UART_TRG, UART_TRG_96); serial_port_out(port, UART_LCR, 0); } /* * For the Altera 16550 variants, set TX threshold trigger level. */ if (((port->type == PORT_ALTR_16550_F32) || (port->type == PORT_ALTR_16550_F64) || (port->type == PORT_ALTR_16550_F128)) && (port->fifosize > 1)) { /* Bounds checking of TX threshold (valid 0 to fifosize-2) */ if ((up->tx_loadsz < 2) || (up->tx_loadsz > port->fifosize)) { dev_err(port->dev, "TX FIFO Threshold errors, skipping\n"); } else { serial_port_out(port, UART_ALTR_AFR, UART_ALTR_EN_TXFIFO_LW); serial_port_out(port, UART_ALTR_TX_LOW, port->fifosize - up->tx_loadsz); port->handle_irq = serial8250_tx_threshold_handle_irq; } } /* Check if we need to have shared IRQs */ if (port->irq && (up->port.flags & UPF_SHARE_IRQ)) up->port.irqflags |= IRQF_SHARED; retval = up->ops->setup_irq(up); if (retval) goto out; if (port->irq && !(up->port.flags & UPF_NO_THRE_TEST)) { unsigned char iir1; if (port->irqflags & IRQF_SHARED) disable_irq_nosync(port->irq); /* * Test for UARTs that do not reassert THRE when the * transmitter is idle and the interrupt has already * been cleared. Real 16550s should always reassert * this interrupt whenever the transmitter is idle and * the interrupt is enabled. Delays are necessary to * allow register changes to become visible. * * Synchronize UART_IER access against the console. */ uart_port_lock_irqsave(port, &flags); wait_for_xmitr(up, UART_LSR_THRE); serial_port_out_sync(port, UART_IER, UART_IER_THRI); udelay(1); /* allow THRE to set */ iir1 = serial_port_in(port, UART_IIR); serial_port_out(port, UART_IER, 0); serial_port_out_sync(port, UART_IER, UART_IER_THRI); udelay(1); /* allow a working UART time to re-assert THRE */ iir = serial_port_in(port, UART_IIR); serial_port_out(port, UART_IER, 0); uart_port_unlock_irqrestore(port, flags); if (port->irqflags & IRQF_SHARED) enable_irq(port->irq); /* * If the interrupt is not reasserted, or we otherwise * don't trust the iir, setup a timer to kick the UART * on a regular basis. */ if ((!(iir1 & UART_IIR_NO_INT) && (iir & UART_IIR_NO_INT)) || up->port.flags & UPF_BUG_THRE) { up->bugs |= UART_BUG_THRE; } } up->ops->setup_timer(up); /* * Now, initialize the UART */ serial_port_out(port, UART_LCR, UART_LCR_WLEN8); uart_port_lock_irqsave(port, &flags); if (up->port.flags & UPF_FOURPORT) { if (!up->port.irq) up->port.mctrl |= TIOCM_OUT1; } else /* * Most PC uarts need OUT2 raised to enable interrupts. */ if (port->irq) up->port.mctrl |= TIOCM_OUT2; serial8250_set_mctrl(port, port->mctrl); /* * Serial over Lan (SoL) hack: * Intel 8257x Gigabit ethernet chips have a 16550 emulation, to be * used for Serial Over Lan. Those chips take a longer time than a * normal serial device to signalize that a transmission data was * queued. Due to that, the above test generally fails. One solution * would be to delay the reading of iir. However, this is not * reliable, since the timeout is variable. So, let's just don't * test if we receive TX irq. This way, we'll never enable * UART_BUG_TXEN. */ if (up->port.quirks & UPQ_NO_TXEN_TEST) goto dont_test_tx_en; /* * Do a quick test to see if we receive an interrupt when we enable * the TX irq. */ serial_port_out(port, UART_IER, UART_IER_THRI); lsr = serial_port_in(port, UART_LSR); iir = serial_port_in(port, UART_IIR); serial_port_out(port, UART_IER, 0); if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) { if (!(up->bugs & UART_BUG_TXEN)) { up->bugs |= UART_BUG_TXEN; dev_dbg(port->dev, "enabling bad tx status workarounds\n"); } } else { up->bugs &= ~UART_BUG_TXEN; } dont_test_tx_en: uart_port_unlock_irqrestore(port, flags); /* * Clear the interrupt registers again for luck, and clear the * saved flags to avoid getting false values from polling * routines or the previous session. */ serial_port_in(port, UART_LSR); serial_port_in(port, UART_RX); serial_port_in(port, UART_IIR); serial_port_in(port, UART_MSR); up->lsr_saved_flags = 0; up->msr_saved_flags = 0; /* * Request DMA channels for both RX and TX. */ if (up->dma) { const char *msg = NULL; if (uart_console(port)) msg = "forbid DMA for kernel console"; else if (serial8250_request_dma(up)) msg = "failed to request DMA"; if (msg) { dev_warn_ratelimited(port->dev, "%s\n", msg); up->dma = NULL; } } /* * Set the IER shadow for rx interrupts but defer actual interrupt * enable until after the FIFOs are enabled; otherwise, an already- * active sender can swamp the interrupt handler with "too much work". */ up->ier = UART_IER_RLSI | UART_IER_RDI; if (port->flags & UPF_FOURPORT) { unsigned int icp; /* * Enable interrupts on the AST Fourport board */ icp = (port->iobase & 0xfe0) | 0x01f; outb_p(0x80, icp); inb_p(icp); } retval = 0; out: serial8250_rpm_put(up); return retval; } EXPORT_SYMBOL_GPL(serial8250_do_startup); static int serial8250_startup(struct uart_port *port) { if (port->startup) return port->startup(port); return serial8250_do_startup(port); } void serial8250_do_shutdown(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); unsigned long flags; serial8250_rpm_get(up); /* * Disable interrupts from this port * * Synchronize UART_IER access against the console. */ uart_port_lock_irqsave(port, &flags); up->ier = 0; serial_port_out(port, UART_IER, 0); uart_port_unlock_irqrestore(port, flags); synchronize_irq(port->irq); if (up->dma) serial8250_release_dma(up); uart_port_lock_irqsave(port, &flags); if (port->flags & UPF_FOURPORT) { /* reset interrupts on the AST Fourport board */ inb((port->iobase & 0xfe0) | 0x1f); port->mctrl |= TIOCM_OUT1; } else port->mctrl &= ~TIOCM_OUT2; serial8250_set_mctrl(port, port->mctrl); uart_port_unlock_irqrestore(port, flags); /* * Disable break condition and FIFOs */ serial_port_out(port, UART_LCR, serial_port_in(port, UART_LCR) & ~UART_LCR_SBC); serial8250_clear_fifos(up); #ifdef CONFIG_SERIAL_8250_RSA /* * Reset the RSA board back to 115kbps compat mode. */ disable_rsa(up); #endif /* * Read data port to reset things, and then unlink from * the IRQ chain. */ serial_port_in(port, UART_RX); serial8250_rpm_put(up); up->ops->release_irq(up); } EXPORT_SYMBOL_GPL(serial8250_do_shutdown); static void serial8250_shutdown(struct uart_port *port) { if (port->shutdown) port->shutdown(port); else serial8250_do_shutdown(port); } static unsigned int serial8250_do_get_divisor(struct uart_port *port, unsigned int baud, unsigned int *frac) { upf_t magic_multiplier = port->flags & UPF_MAGIC_MULTIPLIER; struct uart_8250_port *up = up_to_u8250p(port); unsigned int quot; /* * Handle magic divisors for baud rates above baud_base on SMSC * Super I/O chips. We clamp custom rates from clk/6 and clk/12 * up to clk/4 (0x8001) and clk/8 (0x8002) respectively. These * magic divisors actually reprogram the baud rate generator's * reference clock derived from chips's 14.318MHz clock input. * * Documentation claims that with these magic divisors the base * frequencies of 7.3728MHz and 3.6864MHz are used respectively * for the extra baud rates of 460800bps and 230400bps rather * than the usual base frequency of 1.8462MHz. However empirical * evidence contradicts that. * * Instead bit 7 of the DLM register (bit 15 of the divisor) is * effectively used as a clock prescaler selection bit for the * base frequency of 7.3728MHz, always used. If set to 0, then * the base frequency is divided by 4 for use by the Baud Rate * Generator, for the usual arrangement where the value of 1 of * the divisor produces the baud rate of 115200bps. Conversely, * if set to 1 and high-speed operation has been enabled with the * Serial Port Mode Register in the Device Configuration Space, * then the base frequency is supplied directly to the Baud Rate * Generator, so for the divisor values of 0x8001, 0x8002, 0x8003, * 0x8004, etc. the respective baud rates produced are 460800bps, * 230400bps, 153600bps, 115200bps, etc. * * In all cases only low 15 bits of the divisor are used to divide * the baud base and therefore 32767 is the maximum divisor value * possible, even though documentation says that the programmable * Baud Rate Generator is capable of dividing the internal PLL * clock by any divisor from 1 to 65535. */ if (magic_multiplier && baud >= port->uartclk / 6) quot = 0x8001; else if (magic_multiplier && baud >= port->uartclk / 12) quot = 0x8002; else quot = uart_get_divisor(port, baud); /* * Oxford Semi 952 rev B workaround */ if (up->bugs & UART_BUG_QUOT && (quot & 0xff) == 0) quot++; return quot; } static unsigned int serial8250_get_divisor(struct uart_port *port, unsigned int baud, unsigned int *frac) { if (port->get_divisor) return port->get_divisor(port, baud, frac); return serial8250_do_get_divisor(port, baud, frac); } static unsigned char serial8250_compute_lcr(struct uart_8250_port *up, tcflag_t c_cflag) { unsigned char cval; cval = UART_LCR_WLEN(tty_get_char_size(c_cflag)); if (c_cflag & CSTOPB) cval |= UART_LCR_STOP; if (c_cflag & PARENB) cval |= UART_LCR_PARITY; if (!(c_cflag & PARODD)) cval |= UART_LCR_EPAR; if (c_cflag & CMSPAR) cval |= UART_LCR_SPAR; return cval; } void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud, unsigned int quot) { struct uart_8250_port *up = up_to_u8250p(port); /* Workaround to enable 115200 baud on OMAP1510 internal ports */ if (is_omap1510_8250(up)) { if (baud == 115200) { quot = 1; serial_port_out(port, UART_OMAP_OSC_12M_SEL, 1); } else serial_port_out(port, UART_OMAP_OSC_12M_SEL, 0); } /* * For NatSemi, switch to bank 2 not bank 1, to avoid resetting EXCR2, * otherwise just set DLAB */ if (up->capabilities & UART_NATSEMI) serial_port_out(port, UART_LCR, 0xe0); else serial_port_out(port, UART_LCR, up->lcr | UART_LCR_DLAB); serial_dl_write(up, quot); } EXPORT_SYMBOL_GPL(serial8250_do_set_divisor); static void serial8250_set_divisor(struct uart_port *port, unsigned int baud, unsigned int quot, unsigned int quot_frac) { if (port->set_divisor) port->set_divisor(port, baud, quot, quot_frac); else serial8250_do_set_divisor(port, baud, quot); } static unsigned int serial8250_get_baud_rate(struct uart_port *port, struct ktermios *termios, const struct ktermios *old) { unsigned int tolerance = port->uartclk / 100; unsigned int min; unsigned int max; /* * Handle magic divisors for baud rates above baud_base on SMSC * Super I/O chips. Enable custom rates of clk/4 and clk/8, but * disable divisor values beyond 32767, which are unavailable. */ if (port->flags & UPF_MAGIC_MULTIPLIER) { min = port->uartclk / 16 / UART_DIV_MAX >> 1; max = (port->uartclk + tolerance) / 4; } else { min = port->uartclk / 16 / UART_DIV_MAX; max = (port->uartclk + tolerance) / 16; } /* * Ask the core to calculate the divisor for us. * Allow 1% tolerance at the upper limit so uart clks marginally * slower than nominal still match standard baud rates without * causing transmission errors. */ return uart_get_baud_rate(port, termios, old, min, max); } /* * Note in order to avoid the tty port mutex deadlock don't use the next method * within the uart port callbacks. Primarily it's supposed to be utilized to * handle a sudden reference clock rate change. */ void serial8250_update_uartclk(struct uart_port *port, unsigned int uartclk) { struct tty_port *tport = &port->state->port; struct tty_struct *tty; tty = tty_port_tty_get(tport); if (!tty) { mutex_lock(&tport->mutex); port->uartclk = uartclk; mutex_unlock(&tport->mutex); return; } down_write(&tty->termios_rwsem); mutex_lock(&tport->mutex); if (port->uartclk == uartclk) goto out_unlock; port->uartclk = uartclk; if (!tty_port_initialized(tport)) goto out_unlock; serial8250_do_set_termios(port, &tty->termios, NULL); out_unlock: mutex_unlock(&tport->mutex); up_write(&tty->termios_rwsem); tty_kref_put(tty); } EXPORT_SYMBOL_GPL(serial8250_update_uartclk); void serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, const struct ktermios *old) { struct uart_8250_port *up = up_to_u8250p(port); unsigned char cval; unsigned long flags; unsigned int baud, quot, frac = 0; if (up->capabilities & UART_CAP_MINI) { termios->c_cflag &= ~(CSTOPB | PARENB | PARODD | CMSPAR); if ((termios->c_cflag & CSIZE) == CS5 || (termios->c_cflag & CSIZE) == CS6) termios->c_cflag = (termios->c_cflag & ~CSIZE) | CS7; } cval = serial8250_compute_lcr(up, termios->c_cflag); baud = serial8250_get_baud_rate(port, termios, old); quot = serial8250_get_divisor(port, baud, &frac); /* * Ok, we're now changing the port state. Do it with * interrupts disabled. * * Synchronize UART_IER access against the console. */ serial8250_rpm_get(up); uart_port_lock_irqsave(port, &flags); up->lcr = cval; /* Save computed LCR */ if (up->capabilities & UART_CAP_FIFO && port->fifosize > 1) { if (baud < 2400 && !up->dma) { up->fcr &= ~UART_FCR_TRIGGER_MASK; up->fcr |= UART_FCR_TRIGGER_1; } } /* * MCR-based auto flow control. When AFE is enabled, RTS will be * deasserted when the receive FIFO contains more characters than * the trigger, or the MCR RTS bit is cleared. */ if (up->capabilities & UART_CAP_AFE) { up->mcr &= ~UART_MCR_AFE; if (termios->c_cflag & CRTSCTS) up->mcr |= UART_MCR_AFE; } /* * Update the per-port timeout. */ uart_update_timeout(port, termios->c_cflag, baud); port->read_status_mask = UART_LSR_OE | UART_LSR_THRE | UART_LSR_DR; if (termios->c_iflag & INPCK) port->read_status_mask |= UART_LSR_FE | UART_LSR_PE; if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK)) port->read_status_mask |= UART_LSR_BI; /* * Characters to ignore */ port->ignore_status_mask = 0; if (termios->c_iflag & IGNPAR) port->ignore_status_mask |= UART_LSR_PE | UART_LSR_FE; if (termios->c_iflag & IGNBRK) { port->ignore_status_mask |= UART_LSR_BI; /* * If we're ignoring parity and break indicators, * ignore overruns too (for real raw support). */ if (termios->c_iflag & IGNPAR) port->ignore_status_mask |= UART_LSR_OE; } /* * ignore all characters if CREAD is not set */ if ((termios->c_cflag & CREAD) == 0) port->ignore_status_mask |= UART_LSR_DR; /* * CTS flow control flag and modem status interrupts */ up->ier &= ~UART_IER_MSI; if (!(up->bugs & UART_BUG_NOMSR) && UART_ENABLE_MS(&up->port, termios->c_cflag)) up->ier |= UART_IER_MSI; if (up->capabilities & UART_CAP_UUE) up->ier |= UART_IER_UUE; if (up->capabilities & UART_CAP_RTOIE) up->ier |= UART_IER_RTOIE; serial_port_out(port, UART_IER, up->ier); if (up->capabilities & UART_CAP_EFR) { unsigned char efr = 0; /* * TI16C752/Startech hardware flow control. FIXME: * - TI16C752 requires control thresholds to be set. * - UART_MCR_RTS is ineffective if auto-RTS mode is enabled. */ if (termios->c_cflag & CRTSCTS) efr |= UART_EFR_CTS; serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); if (port->flags & UPF_EXAR_EFR) serial_port_out(port, UART_XR_EFR, efr); else serial_port_out(port, UART_EFR, efr); } serial8250_set_divisor(port, baud, quot, frac); /* * LCR DLAB must be set to enable 64-byte FIFO mode. If the FCR * is written without DLAB set, this mode will be disabled. */ if (port->type == PORT_16750) serial_port_out(port, UART_FCR, up->fcr); serial_port_out(port, UART_LCR, up->lcr); /* reset DLAB */ if (port->type != PORT_16750) { /* emulated UARTs (Lucent Venus 167x) need two steps */ if (up->fcr & UART_FCR_ENABLE_FIFO) serial_port_out(port, UART_FCR, UART_FCR_ENABLE_FIFO); serial_port_out(port, UART_FCR, up->fcr); /* set fcr */ } serial8250_set_mctrl(port, port->mctrl); uart_port_unlock_irqrestore(port, flags); serial8250_rpm_put(up); /* Don't rewrite B0 */ if (tty_termios_baud_rate(termios)) tty_termios_encode_baud_rate(termios, baud, baud); } EXPORT_SYMBOL(serial8250_do_set_termios); static void serial8250_set_termios(struct uart_port *port, struct ktermios *termios, const struct ktermios *old) { if (port->set_termios) port->set_termios(port, termios, old); else serial8250_do_set_termios(port, termios, old); } void serial8250_do_set_ldisc(struct uart_port *port, struct ktermios *termios) { if (termios->c_line == N_PPS) { port->flags |= UPF_HARDPPS_CD; uart_port_lock_irq(port); serial8250_enable_ms(port); uart_port_unlock_irq(port); } else { port->flags &= ~UPF_HARDPPS_CD; if (!UART_ENABLE_MS(port, termios->c_cflag)) { uart_port_lock_irq(port); serial8250_disable_ms(port); uart_port_unlock_irq(port); } } } EXPORT_SYMBOL_GPL(serial8250_do_set_ldisc); static void serial8250_set_ldisc(struct uart_port *port, struct ktermios *termios) { if (port->set_ldisc) port->set_ldisc(port, termios); else serial8250_do_set_ldisc(port, termios); } void serial8250_do_pm(struct uart_port *port, unsigned int state, unsigned int oldstate) { struct uart_8250_port *p = up_to_u8250p(port); serial8250_set_sleep(p, state != 0); } EXPORT_SYMBOL(serial8250_do_pm); static void serial8250_pm(struct uart_port *port, unsigned int state, unsigned int oldstate) { if (port->pm) port->pm(port, state, oldstate); else serial8250_do_pm(port, state, oldstate); } static unsigned int serial8250_port_size(struct uart_8250_port *pt) { if (pt->port.mapsize) return pt->port.mapsize; if (is_omap1_8250(pt)) return 0x16 << pt->port.regshift; return 8 << pt->port.regshift; } /* * Resource handling. */ static int serial8250_request_std_resource(struct uart_8250_port *up) { unsigned int size = serial8250_port_size(up); struct uart_port *port = &up->port; int ret = 0; switch (port->iotype) { case UPIO_AU: case UPIO_TSI: case UPIO_MEM32: case UPIO_MEM32BE: case UPIO_MEM16: case UPIO_MEM: if (!port->mapbase) { ret = -EINVAL; break; } if (!request_mem_region(port->mapbase, size, "serial")) { ret = -EBUSY; break; } if (port->flags & UPF_IOREMAP) { port->membase = ioremap(port->mapbase, size); if (!port->membase) { release_mem_region(port->mapbase, size); ret = -ENOMEM; } } break; case UPIO_HUB6: case UPIO_PORT: if (!request_region(port->iobase, size, "serial")) ret = -EBUSY; break; } return ret; } static void serial8250_release_std_resource(struct uart_8250_port *up) { unsigned int size = serial8250_port_size(up); struct uart_port *port = &up->port; switch (port->iotype) { case UPIO_AU: case UPIO_TSI: case UPIO_MEM32: case UPIO_MEM32BE: case UPIO_MEM16: case UPIO_MEM: if (!port->mapbase) break; if (port->flags & UPF_IOREMAP) { iounmap(port->membase); port->membase = NULL; } release_mem_region(port->mapbase, size); break; case UPIO_HUB6: case UPIO_PORT: release_region(port->iobase, size); break; } } static void serial8250_release_port(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); serial8250_release_std_resource(up); } static int serial8250_request_port(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); return serial8250_request_std_resource(up); } static int fcr_get_rxtrig_bytes(struct uart_8250_port *up) { const struct serial8250_config *conf_type = &uart_config[up->port.type]; unsigned char bytes; bytes = conf_type->rxtrig_bytes[UART_FCR_R_TRIG_BITS(up->fcr)]; return bytes ? bytes : -EOPNOTSUPP; } static int bytes_to_fcr_rxtrig(struct uart_8250_port *up, unsigned char bytes) { const struct serial8250_config *conf_type = &uart_config[up->port.type]; int i; if (!conf_type->rxtrig_bytes[UART_FCR_R_TRIG_BITS(UART_FCR_R_TRIG_00)]) return -EOPNOTSUPP; for (i = 1; i < UART_FCR_R_TRIG_MAX_STATE; i++) { if (bytes < conf_type->rxtrig_bytes[i]) /* Use the nearest lower value */ return (--i) << UART_FCR_R_TRIG_SHIFT; } return UART_FCR_R_TRIG_11; } static int do_get_rxtrig(struct tty_port *port) { struct uart_state *state = container_of(port, struct uart_state, port); struct uart_port *uport = state->uart_port; struct uart_8250_port *up = up_to_u8250p(uport); if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1) return -EINVAL; return fcr_get_rxtrig_bytes(up); } static int do_serial8250_get_rxtrig(struct tty_port *port) { int rxtrig_bytes; mutex_lock(&port->mutex); rxtrig_bytes = do_get_rxtrig(port); mutex_unlock(&port->mutex); return rxtrig_bytes; } static ssize_t rx_trig_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tty_port *port = dev_get_drvdata(dev); int rxtrig_bytes; rxtrig_bytes = do_serial8250_get_rxtrig(port); if (rxtrig_bytes < 0) return rxtrig_bytes; return sysfs_emit(buf, "%d\n", rxtrig_bytes); } static int do_set_rxtrig(struct tty_port *port, unsigned char bytes) { struct uart_state *state = container_of(port, struct uart_state, port); struct uart_port *uport = state->uart_port; struct uart_8250_port *up = up_to_u8250p(uport); int rxtrig; if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1) return -EINVAL; rxtrig = bytes_to_fcr_rxtrig(up, bytes); if (rxtrig < 0) return rxtrig; serial8250_clear_fifos(up); up->fcr &= ~UART_FCR_TRIGGER_MASK; up->fcr |= (unsigned char)rxtrig; serial_out(up, UART_FCR, up->fcr); return 0; } static int do_serial8250_set_rxtrig(struct tty_port *port, unsigned char bytes) { int ret; mutex_lock(&port->mutex); ret = do_set_rxtrig(port, bytes); mutex_unlock(&port->mutex); return ret; } static ssize_t rx_trig_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct tty_port *port = dev_get_drvdata(dev); unsigned char bytes; int ret; if (!count) return -EINVAL; ret = kstrtou8(buf, 10, &bytes); if (ret < 0) return ret; ret = do_serial8250_set_rxtrig(port, bytes); if (ret < 0) return ret; return count; } static DEVICE_ATTR_RW(rx_trig_bytes); static struct attribute *serial8250_dev_attrs[] = { &dev_attr_rx_trig_bytes.attr, NULL }; static struct attribute_group serial8250_dev_attr_group = { .attrs = serial8250_dev_attrs, }; static void register_dev_spec_attr_grp(struct uart_8250_port *up) { const struct serial8250_config *conf_type = &uart_config[up->port.type]; if (conf_type->rxtrig_bytes[0]) up->port.attr_group = &serial8250_dev_attr_group; } static void serial8250_config_port(struct uart_port *port, int flags) { struct uart_8250_port *up = up_to_u8250p(port); int ret; /* * Find the region that we can probe for. This in turn * tells us whether we can probe for the type of port. */ ret = serial8250_request_std_resource(up); if (ret < 0) return; if (port->iotype != up->cur_iotype) set_io_from_upio(port); if (flags & UART_CONFIG_TYPE) autoconfig(up); /* HW bugs may trigger IRQ while IIR == NO_INT */ if (port->type == PORT_TEGRA) up->bugs |= UART_BUG_NOMSR; if (port->type != PORT_UNKNOWN && flags & UART_CONFIG_IRQ) autoconfig_irq(up); if (port->type == PORT_UNKNOWN) serial8250_release_std_resource(up); register_dev_spec_attr_grp(up); up->fcr = uart_config[up->port.type].fcr; } static int serial8250_verify_port(struct uart_port *port, struct serial_struct *ser) { if (ser->irq >= irq_get_nr_irqs() || ser->irq < 0 || ser->baud_base < 9600 || ser->type < PORT_UNKNOWN || ser->type >= ARRAY_SIZE(uart_config) || ser->type == PORT_CIRRUS || ser->type == PORT_STARTECH) return -EINVAL; return 0; } static const char *serial8250_type(struct uart_port *port) { int type = port->type; if (type >= ARRAY_SIZE(uart_config)) type = 0; return uart_config[type].name; } static const struct uart_ops serial8250_pops = { .tx_empty = serial8250_tx_empty, .set_mctrl = serial8250_set_mctrl, .get_mctrl = serial8250_get_mctrl, .stop_tx = serial8250_stop_tx, .start_tx = serial8250_start_tx, .throttle = serial8250_throttle, .unthrottle = serial8250_unthrottle, .stop_rx = serial8250_stop_rx, .enable_ms = serial8250_enable_ms, .break_ctl = serial8250_break_ctl, .startup = serial8250_startup, .shutdown = serial8250_shutdown, .set_termios = serial8250_set_termios, .set_ldisc = serial8250_set_ldisc, .pm = serial8250_pm, .type = serial8250_type, .release_port = serial8250_release_port, .request_port = serial8250_request_port, .config_port = serial8250_config_port, .verify_port = serial8250_verify_port, #ifdef CONFIG_CONSOLE_POLL .poll_get_char = serial8250_get_poll_char, .poll_put_char = serial8250_put_poll_char, #endif }; void serial8250_init_port(struct uart_8250_port *up) { struct uart_port *port = &up->port; spin_lock_init(&port->lock); port->ctrl_id = 0; port->pm = NULL; port->ops = &serial8250_pops; port->has_sysrq = IS_ENABLED(CONFIG_SERIAL_8250_CONSOLE); up->cur_iotype = 0xFF; } EXPORT_SYMBOL_GPL(serial8250_init_port); void serial8250_set_defaults(struct uart_8250_port *up) { struct uart_port *port = &up->port; if (up->port.flags & UPF_FIXED_TYPE) { unsigned int type = up->port.type; if (!up->port.fifosize) up->port.fifosize = uart_config[type].fifo_size; if (!up->tx_loadsz) up->tx_loadsz = uart_config[type].tx_loadsz; if (!up->capabilities) up->capabilities = uart_config[type].flags; } set_io_from_upio(port); /* default dma handlers */ if (up->dma) { if (!up->dma->tx_dma) up->dma->tx_dma = serial8250_tx_dma; if (!up->dma->rx_dma) up->dma->rx_dma = serial8250_rx_dma; } } EXPORT_SYMBOL_GPL(serial8250_set_defaults); #ifdef CONFIG_SERIAL_8250_CONSOLE static void serial8250_console_putchar(struct uart_port *port, unsigned char ch) { struct uart_8250_port *up = up_to_u8250p(port); wait_for_xmitr(up, UART_LSR_THRE); serial_port_out(port, UART_TX, ch); } /* * Restore serial console when h/w power-off detected */ static void serial8250_console_restore(struct uart_8250_port *up) { struct uart_port *port = &up->port; struct ktermios termios; unsigned int baud, quot, frac = 0; termios.c_cflag = port->cons->cflag; termios.c_ispeed = port->cons->ispeed; termios.c_ospeed = port->cons->ospeed; if (port->state->port.tty && termios.c_cflag == 0) { termios.c_cflag = port->state->port.tty->termios.c_cflag; termios.c_ispeed = port->state->port.tty->termios.c_ispeed; termios.c_ospeed = port->state->port.tty->termios.c_ospeed; } baud = serial8250_get_baud_rate(port, &termios, NULL); quot = serial8250_get_divisor(port, baud, &frac); serial8250_set_divisor(port, baud, quot, frac); serial_port_out(port, UART_LCR, up->lcr); serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); } /* * Print a string to the serial port using the device FIFO * * It sends fifosize bytes and then waits for the fifo * to get empty. */ static void serial8250_console_fifo_write(struct uart_8250_port *up, const char *s, unsigned int count) { int i; const char *end = s + count; unsigned int fifosize = up->tx_loadsz; bool cr_sent = false; while (s != end) { wait_for_lsr(up, UART_LSR_THRE); for (i = 0; i < fifosize && s != end; ++i) { if (*s == '\n' && !cr_sent) { serial_out(up, UART_TX, '\r'); cr_sent = true; } else { serial_out(up, UART_TX, *s++); cr_sent = false; } } } } /* * Print a string to the serial port trying not to disturb * any possible real use of the port... * * The console_lock must be held when we get here. * * Doing runtime PM is really a bad idea for the kernel console. * Thus, we assume the function is called when device is powered up. */ void serial8250_console_write(struct uart_8250_port *up, const char *s, unsigned int count) { struct uart_8250_em485 *em485 = up->em485; struct uart_port *port = &up->port; unsigned long flags; unsigned int ier, use_fifo; int locked = 1; touch_nmi_watchdog(); if (oops_in_progress) locked = uart_port_trylock_irqsave(port, &flags); else uart_port_lock_irqsave(port, &flags); /* * First save the IER then disable the interrupts */ ier = serial_port_in(port, UART_IER); serial8250_clear_IER(up); /* check scratch reg to see if port powered off during system sleep */ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { serial8250_console_restore(up); up->canary = 0; } if (em485) { if (em485->tx_stopped) up->rs485_start_tx(up); mdelay(port->rs485.delay_rts_before_send); } use_fifo = (up->capabilities & UART_CAP_FIFO) && /* * BCM283x requires to check the fifo * after each byte. */ !(up->capabilities & UART_CAP_MINI) && /* * tx_loadsz contains the transmit fifo size */ up->tx_loadsz > 1 && (up->fcr & UART_FCR_ENABLE_FIFO) && port->state && test_bit(TTY_PORT_INITIALIZED, &port->state->port.iflags) && /* * After we put a data in the fifo, the controller will send * it regardless of the CTS state. Therefore, only use fifo * if we don't use control flow. */ !(up->port.flags & UPF_CONS_FLOW); if (likely(use_fifo)) serial8250_console_fifo_write(up, s, count); else uart_console_write(port, s, count, serial8250_console_putchar); /* * Finally, wait for transmitter to become empty * and restore the IER */ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); if (em485) { mdelay(port->rs485.delay_rts_after_send); if (em485->tx_stopped) up->rs485_stop_tx(up); } serial_port_out(port, UART_IER, ier); /* * The receive handling will happen properly because the * receive ready bit will still be set; it is not cleared * on read. However, modem control will not, we must * call it if we have saved something in the saved flags * while processing with interrupts off. */ if (up->msr_saved_flags) serial8250_modem_status(up); if (locked) uart_port_unlock_irqrestore(port, flags); } static unsigned int probe_baud(struct uart_port *port) { unsigned char lcr, dll, dlm; unsigned int quot; lcr = serial_port_in(port, UART_LCR); serial_port_out(port, UART_LCR, lcr | UART_LCR_DLAB); dll = serial_port_in(port, UART_DLL); dlm = serial_port_in(port, UART_DLM); serial_port_out(port, UART_LCR, lcr); quot = (dlm << 8) | dll; return (port->uartclk / 16) / quot; } int serial8250_console_setup(struct uart_port *port, char *options, bool probe) { int baud = 9600; int bits = 8; int parity = 'n'; int flow = 'n'; int ret; if (!port->iobase && !port->membase) return -ENODEV; if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); else if (probe) baud = probe_baud(port); ret = uart_set_options(port, port->cons, baud, parity, bits, flow); if (ret) return ret; if (port->dev) pm_runtime_get_sync(port->dev); return 0; } int serial8250_console_exit(struct uart_port *port) { if (port->dev) pm_runtime_put_sync(port->dev); return 0; } #endif /* CONFIG_SERIAL_8250_CONSOLE */ MODULE_DESCRIPTION("Base port operations for 8250/16550-type serial ports"); MODULE_LICENSE("GPL");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ #ifndef RXE_HDR_H #define RXE_HDR_H /* extracted information about a packet carried in an sk_buff struct fits in * the skbuff cb array. Must be at most 48 bytes. stored in control block of * sk_buff for received packets. */ struct rxe_pkt_info { struct rxe_dev *rxe; /* device that owns packet */ struct rxe_qp *qp; /* qp that owns packet */ struct rxe_send_wqe *wqe; /* send wqe */ u8 *hdr; /* points to bth */ u32 mask; /* useful info about pkt */ u32 psn; /* bth psn of packet */ u16 pkey_index; /* partition of pkt */ u16 paylen; /* length of bth - icrc */ u8 port_num; /* port pkt received on */ u8 opcode; /* bth opcode of packet */ }; /* Macros should be used only for received skb */ static inline struct rxe_pkt_info *SKB_TO_PKT(struct sk_buff *skb) { BUILD_BUG_ON(sizeof(struct rxe_pkt_info) > sizeof(skb->cb)); return (void *)skb->cb; } static inline struct sk_buff *PKT_TO_SKB(struct rxe_pkt_info *pkt) { return container_of((void *)pkt, struct sk_buff, cb); } /* * IBA header types and methods * * Some of these are for reference and completeness only since * rxe does not currently support RD transport * most of this could be moved into IB core. ib_pack.h has * part of this but is incomplete * * Header specific routines to insert/extract values to/from headers * the routines that are named __hhh_(set_)fff() take a pointer to a * hhh header and get(set) the fff field. The routines named * hhh_(set_)fff take a packet info struct and find the * header and field based on the opcode in the packet. * Conversion to/from network byte order from cpu order is also done. */ #define RXE_ICRC_SIZE (4) #define RXE_MAX_HDR_LENGTH (80) /****************************************************************************** * Base Transport Header ******************************************************************************/ struct rxe_bth { u8 opcode; u8 flags; __be16 pkey; __be32 qpn; __be32 apsn; }; #define BTH_TVER (0) #define BTH_DEF_PKEY (0xffff) #define BTH_SE_MASK (0x80) #define BTH_MIG_MASK (0x40) #define BTH_PAD_MASK (0x30) #define BTH_TVER_MASK (0x0f) #define BTH_FECN_MASK (0x80000000) #define BTH_BECN_MASK (0x40000000) #define BTH_RESV6A_MASK (0x3f000000) #define BTH_QPN_MASK (0x00ffffff) #define BTH_ACK_MASK (0x80000000) #define BTH_RESV7_MASK (0x7f000000) #define BTH_PSN_MASK (0x00ffffff) static inline u8 __bth_opcode(void *arg) { struct rxe_bth *bth = arg; return bth->opcode; } static inline void __bth_set_opcode(void *arg, u8 opcode) { struct rxe_bth *bth = arg; bth->opcode = opcode; } static inline u8 __bth_se(void *arg) { struct rxe_bth *bth = arg; return 0 != (BTH_SE_MASK & bth->flags); } static inline void __bth_set_se(void *arg, int se) { struct rxe_bth *bth = arg; if (se) bth->flags |= BTH_SE_MASK; else bth->flags &= ~BTH_SE_MASK; } static inline u8 __bth_mig(void *arg) { struct rxe_bth *bth = arg; return 0 != (BTH_MIG_MASK & bth->flags); } static inline void __bth_set_mig(void *arg, u8 mig) { struct rxe_bth *bth = arg; if (mig) bth->flags |= BTH_MIG_MASK; else bth->flags &= ~BTH_MIG_MASK; } static inline u8 __bth_pad(void *arg) { struct rxe_bth *bth = arg; return (BTH_PAD_MASK & bth->flags) >> 4; } static inline void __bth_set_pad(void *arg, u8 pad) { struct rxe_bth *bth = arg; bth->flags = (BTH_PAD_MASK & (pad << 4)) | (~BTH_PAD_MASK & bth->flags); } static inline u8 __bth_tver(void *arg) { struct rxe_bth *bth = arg; return BTH_TVER_MASK & bth->flags; } static inline void __bth_set_tver(void *arg, u8 tver) { struct rxe_bth *bth = arg; bth->flags = (BTH_TVER_MASK & tver) | (~BTH_TVER_MASK & bth->flags); } static inline u16 __bth_pkey(void *arg) { struct rxe_bth *bth = arg; return be16_to_cpu(bth->pkey); } static inline void __bth_set_pkey(void *arg, u16 pkey) { struct rxe_bth *bth = arg; bth->pkey = cpu_to_be16(pkey); } static inline u32 __bth_qpn(void *arg) { struct rxe_bth *bth = arg; return BTH_QPN_MASK & be32_to_cpu(bth->qpn); } static inline void __bth_set_qpn(void *arg, u32 qpn) { struct rxe_bth *bth = arg; u32 resvqpn = be32_to_cpu(bth->qpn); bth->qpn = cpu_to_be32((BTH_QPN_MASK & qpn) | (~BTH_QPN_MASK & resvqpn)); } static inline int __bth_fecn(void *arg) { struct rxe_bth *bth = arg; return 0 != (cpu_to_be32(BTH_FECN_MASK) & bth->qpn); } static inline void __bth_set_fecn(void *arg, int fecn) { struct rxe_bth *bth = arg; if (fecn) bth->qpn |= cpu_to_be32(BTH_FECN_MASK); else bth->qpn &= ~cpu_to_be32(BTH_FECN_MASK); } static inline int __bth_becn(void *arg) { struct rxe_bth *bth = arg; return 0 != (cpu_to_be32(BTH_BECN_MASK) & bth->qpn); } static inline void __bth_set_becn(void *arg, int becn) { struct rxe_bth *bth = arg; if (becn) bth->qpn |= cpu_to_be32(BTH_BECN_MASK); else bth->qpn &= ~cpu_to_be32(BTH_BECN_MASK); } static inline u8 __bth_resv6a(void *arg) { struct rxe_bth *bth = arg; return (BTH_RESV6A_MASK & be32_to_cpu(bth->qpn)) >> 24; } static inline void __bth_set_resv6a(void *arg) { struct rxe_bth *bth = arg; bth->qpn &= cpu_to_be32(~BTH_RESV6A_MASK); } static inline int __bth_ack(void *arg) { struct rxe_bth *bth = arg; return 0 != (cpu_to_be32(BTH_ACK_MASK) & bth->apsn); } static inline void __bth_set_ack(void *arg, int ack) { struct rxe_bth *bth = arg; if (ack) bth->apsn |= cpu_to_be32(BTH_ACK_MASK); else bth->apsn &= ~cpu_to_be32(BTH_ACK_MASK); } static inline void __bth_set_resv7(void *arg) { struct rxe_bth *bth = arg; bth->apsn &= ~cpu_to_be32(BTH_RESV7_MASK); } static inline u32 __bth_psn(void *arg) { struct rxe_bth *bth = arg; return BTH_PSN_MASK & be32_to_cpu(bth->apsn); } static inline void __bth_set_psn(void *arg, u32 psn) { struct rxe_bth *bth = arg; u32 apsn = be32_to_cpu(bth->apsn); bth->apsn = cpu_to_be32((BTH_PSN_MASK & psn) | (~BTH_PSN_MASK & apsn)); } static inline u8 bth_opcode(struct rxe_pkt_info *pkt) { return __bth_opcode(pkt->hdr); } static inline void bth_set_opcode(struct rxe_pkt_info *pkt, u8 opcode) { __bth_set_opcode(pkt->hdr, opcode); } static inline u8 bth_se(struct rxe_pkt_info *pkt) { return __bth_se(pkt->hdr); } static inline void bth_set_se(struct rxe_pkt_info *pkt, int se) { __bth_set_se(pkt->hdr, se); } static inline u8 bth_mig(struct rxe_pkt_info *pkt) { return __bth_mig(pkt->hdr); } static inline void bth_set_mig(struct rxe_pkt_info *pkt, u8 mig) { __bth_set_mig(pkt->hdr, mig); } static inline u8 bth_pad(struct rxe_pkt_info *pkt) { return __bth_pad(pkt->hdr); } static inline void bth_set_pad(struct rxe_pkt_info *pkt, u8 pad) { __bth_set_pad(pkt->hdr, pad); } static inline u8 bth_tver(struct rxe_pkt_info *pkt) { return __bth_tver(pkt->hdr); } static inline void bth_set_tver(struct rxe_pkt_info *pkt, u8 tver) { __bth_set_tver(pkt->hdr, tver); } static inline u16 bth_pkey(struct rxe_pkt_info *pkt) { return __bth_pkey(pkt->hdr); } static inline void bth_set_pkey(struct rxe_pkt_info *pkt, u16 pkey) { __bth_set_pkey(pkt->hdr, pkey); } static inline u32 bth_qpn(struct rxe_pkt_info *pkt) { return __bth_qpn(pkt->hdr); } static inline void bth_set_qpn(struct rxe_pkt_info *pkt, u32 qpn) { __bth_set_qpn(pkt->hdr, qpn); } static inline int bth_fecn(struct rxe_pkt_info *pkt) { return __bth_fecn(pkt->hdr); } static inline void bth_set_fecn(struct rxe_pkt_info *pkt, int fecn) { __bth_set_fecn(pkt->hdr, fecn); } static inline int bth_becn(struct rxe_pkt_info *pkt) { return __bth_becn(pkt->hdr); } static inline void bth_set_becn(struct rxe_pkt_info *pkt, int becn) { __bth_set_becn(pkt->hdr, becn); } static inline u8 bth_resv6a(struct rxe_pkt_info *pkt) { return __bth_resv6a(pkt->hdr); } static inline void bth_set_resv6a(struct rxe_pkt_info *pkt) { __bth_set_resv6a(pkt->hdr); } static inline int bth_ack(struct rxe_pkt_info *pkt) { return __bth_ack(pkt->hdr); } static inline void bth_set_ack(struct rxe_pkt_info *pkt, int ack) { __bth_set_ack(pkt->hdr, ack); } static inline void bth_set_resv7(struct rxe_pkt_info *pkt) { __bth_set_resv7(pkt->hdr); } static inline u32 bth_psn(struct rxe_pkt_info *pkt) { return __bth_psn(pkt->hdr); } static inline void bth_set_psn(struct rxe_pkt_info *pkt, u32 psn) { __bth_set_psn(pkt->hdr, psn); } static inline void bth_init(struct rxe_pkt_info *pkt, u8 opcode, int se, int mig, int pad, u16 pkey, u32 qpn, int ack_req, u32 psn) { struct rxe_bth *bth = (struct rxe_bth *)(pkt->hdr); bth->opcode = opcode; bth->flags = (pad << 4) & BTH_PAD_MASK; if (se) bth->flags |= BTH_SE_MASK; if (mig) bth->flags |= BTH_MIG_MASK; bth->pkey = cpu_to_be16(pkey); bth->qpn = cpu_to_be32(qpn & BTH_QPN_MASK); psn &= BTH_PSN_MASK; if (ack_req) psn |= BTH_ACK_MASK; bth->apsn = cpu_to_be32(psn); } /****************************************************************************** * Reliable Datagram Extended Transport Header ******************************************************************************/ struct rxe_rdeth { __be32 een; }; #define RDETH_EEN_MASK (0x00ffffff) static inline u8 __rdeth_een(void *arg) { struct rxe_rdeth *rdeth = arg; return RDETH_EEN_MASK & be32_to_cpu(rdeth->een); } static inline void __rdeth_set_een(void *arg, u32 een) { struct rxe_rdeth *rdeth = arg; rdeth->een = cpu_to_be32(RDETH_EEN_MASK & een); } static inline u8 rdeth_een(struct rxe_pkt_info *pkt) { return __rdeth_een(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_RDETH]); } static inline void rdeth_set_een(struct rxe_pkt_info *pkt, u32 een) { __rdeth_set_een(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_RDETH], een); } /****************************************************************************** * Datagram Extended Transport Header ******************************************************************************/ struct rxe_deth { __be32 qkey; __be32 sqp; }; #define GSI_QKEY (0x80010000) #define DETH_SQP_MASK (0x00ffffff) static inline u32 __deth_qkey(void *arg) { struct rxe_deth *deth = arg; return be32_to_cpu(deth->qkey); } static inline void __deth_set_qkey(void *arg, u32 qkey) { struct rxe_deth *deth = arg; deth->qkey = cpu_to_be32(qkey); } static inline u32 __deth_sqp(void *arg) { struct rxe_deth *deth = arg; return DETH_SQP_MASK & be32_to_cpu(deth->sqp); } static inline void __deth_set_sqp(void *arg, u32 sqp) { struct rxe_deth *deth = arg; deth->sqp = cpu_to_be32(DETH_SQP_MASK & sqp); } static inline u32 deth_qkey(struct rxe_pkt_info *pkt) { return __deth_qkey(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_DETH]); } static inline void deth_set_qkey(struct rxe_pkt_info *pkt, u32 qkey) { __deth_set_qkey(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_DETH], qkey); } static inline u32 deth_sqp(struct rxe_pkt_info *pkt) { return __deth_sqp(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_DETH]); } static inline void deth_set_sqp(struct rxe_pkt_info *pkt, u32 sqp) { __deth_set_sqp(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_DETH], sqp); } /****************************************************************************** * RDMA Extended Transport Header ******************************************************************************/ struct rxe_reth { __be64 va; __be32 rkey; __be32 len; }; static inline u64 __reth_va(void *arg) { struct rxe_reth *reth = arg; return be64_to_cpu(reth->va); } static inline void __reth_set_va(void *arg, u64 va) { struct rxe_reth *reth = arg; reth->va = cpu_to_be64(va); } static inline u32 __reth_rkey(void *arg) { struct rxe_reth *reth = arg; return be32_to_cpu(reth->rkey); } static inline void __reth_set_rkey(void *arg, u32 rkey) { struct rxe_reth *reth = arg; reth->rkey = cpu_to_be32(rkey); } static inline u32 __reth_len(void *arg) { struct rxe_reth *reth = arg; return be32_to_cpu(reth->len); } static inline void __reth_set_len(void *arg, u32 len) { struct rxe_reth *reth = arg; reth->len = cpu_to_be32(len); } static inline u64 reth_va(struct rxe_pkt_info *pkt) { return __reth_va(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_RETH]); } static inline void reth_set_va(struct rxe_pkt_info *pkt, u64 va) { __reth_set_va(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_RETH], va); } static inline u32 reth_rkey(struct rxe_pkt_info *pkt) { return __reth_rkey(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_RETH]); } static inline void reth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) { __reth_set_rkey(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_RETH], rkey); } static inline u32 reth_len(struct rxe_pkt_info *pkt) { return __reth_len(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_RETH]); } static inline void reth_set_len(struct rxe_pkt_info *pkt, u32 len) { __reth_set_len(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_RETH], len); } /****************************************************************************** * FLUSH Extended Transport Header ******************************************************************************/ struct rxe_feth { __be32 bits; }; #define FETH_PLT_MASK (0x0000000f) /* bits 3-0 */ #define FETH_SEL_MASK (0x00000030) /* bits 5-4 */ #define FETH_SEL_SHIFT (4U) static inline u32 __feth_plt(void *arg) { struct rxe_feth *feth = arg; return be32_to_cpu(feth->bits) & FETH_PLT_MASK; } static inline u32 __feth_sel(void *arg) { struct rxe_feth *feth = arg; return (be32_to_cpu(feth->bits) & FETH_SEL_MASK) >> FETH_SEL_SHIFT; } static inline u32 feth_plt(struct rxe_pkt_info *pkt) { return __feth_plt(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_FETH]); } static inline u32 feth_sel(struct rxe_pkt_info *pkt) { return __feth_sel(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_FETH]); } static inline void feth_init(struct rxe_pkt_info *pkt, u8 type, u8 level) { struct rxe_feth *feth = (struct rxe_feth *) (pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_FETH]); u32 bits = ((level << FETH_SEL_SHIFT) & FETH_SEL_MASK) | (type & FETH_PLT_MASK); feth->bits = cpu_to_be32(bits); } /****************************************************************************** * Atomic Extended Transport Header ******************************************************************************/ struct rxe_atmeth { __be64 va; __be32 rkey; __be64 swap_add; __be64 comp; } __packed; static inline u64 __atmeth_va(void *arg) { struct rxe_atmeth *atmeth = arg; return be64_to_cpu(atmeth->va); } static inline void __atmeth_set_va(void *arg, u64 va) { struct rxe_atmeth *atmeth = arg; atmeth->va = cpu_to_be64(va); } static inline u32 __atmeth_rkey(void *arg) { struct rxe_atmeth *atmeth = arg; return be32_to_cpu(atmeth->rkey); } static inline void __atmeth_set_rkey(void *arg, u32 rkey) { struct rxe_atmeth *atmeth = arg; atmeth->rkey = cpu_to_be32(rkey); } static inline u64 __atmeth_swap_add(void *arg) { struct rxe_atmeth *atmeth = arg; return be64_to_cpu(atmeth->swap_add); } static inline void __atmeth_set_swap_add(void *arg, u64 swap_add) { struct rxe_atmeth *atmeth = arg; atmeth->swap_add = cpu_to_be64(swap_add); } static inline u64 __atmeth_comp(void *arg) { struct rxe_atmeth *atmeth = arg; return be64_to_cpu(atmeth->comp); } static inline void __atmeth_set_comp(void *arg, u64 comp) { struct rxe_atmeth *atmeth = arg; atmeth->comp = cpu_to_be64(comp); } static inline u64 atmeth_va(struct rxe_pkt_info *pkt) { return __atmeth_va(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); } static inline void atmeth_set_va(struct rxe_pkt_info *pkt, u64 va) { __atmeth_set_va(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], va); } static inline u32 atmeth_rkey(struct rxe_pkt_info *pkt) { return __atmeth_rkey(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); } static inline void atmeth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) { __atmeth_set_rkey(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], rkey); } static inline u64 atmeth_swap_add(struct rxe_pkt_info *pkt) { return __atmeth_swap_add(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); } static inline void atmeth_set_swap_add(struct rxe_pkt_info *pkt, u64 swap_add) { __atmeth_set_swap_add(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], swap_add); } static inline u64 atmeth_comp(struct rxe_pkt_info *pkt) { return __atmeth_comp(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); } static inline void atmeth_set_comp(struct rxe_pkt_info *pkt, u64 comp) { __atmeth_set_comp(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], comp); } /****************************************************************************** * Ack Extended Transport Header ******************************************************************************/ struct rxe_aeth { __be32 smsn; }; #define AETH_SYN_MASK (0xff000000) #define AETH_MSN_MASK (0x00ffffff) enum aeth_syndrome { AETH_TYPE_MASK = 0xe0, AETH_ACK = 0x00, AETH_RNR_NAK = 0x20, AETH_RSVD = 0x40, AETH_NAK = 0x60, AETH_ACK_UNLIMITED = 0x1f, AETH_NAK_PSN_SEQ_ERROR = 0x60, AETH_NAK_INVALID_REQ = 0x61, AETH_NAK_REM_ACC_ERR = 0x62, AETH_NAK_REM_OP_ERR = 0x63, }; static inline u8 __aeth_syn(void *arg) { struct rxe_aeth *aeth = arg; return (AETH_SYN_MASK & be32_to_cpu(aeth->smsn)) >> 24; } static inline void __aeth_set_syn(void *arg, u8 syn) { struct rxe_aeth *aeth = arg; u32 smsn = be32_to_cpu(aeth->smsn); aeth->smsn = cpu_to_be32((AETH_SYN_MASK & (syn << 24)) | (~AETH_SYN_MASK & smsn)); } static inline u32 __aeth_msn(void *arg) { struct rxe_aeth *aeth = arg; return AETH_MSN_MASK & be32_to_cpu(aeth->smsn); } static inline void __aeth_set_msn(void *arg, u32 msn) { struct rxe_aeth *aeth = arg; u32 smsn = be32_to_cpu(aeth->smsn); aeth->smsn = cpu_to_be32((AETH_MSN_MASK & msn) | (~AETH_MSN_MASK & smsn)); } static inline u8 aeth_syn(struct rxe_pkt_info *pkt) { return __aeth_syn(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_AETH]); } static inline void aeth_set_syn(struct rxe_pkt_info *pkt, u8 syn) { __aeth_set_syn(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_AETH], syn); } static inline u32 aeth_msn(struct rxe_pkt_info *pkt) { return __aeth_msn(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_AETH]); } static inline void aeth_set_msn(struct rxe_pkt_info *pkt, u32 msn) { __aeth_set_msn(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_AETH], msn); } /****************************************************************************** * Atomic Ack Extended Transport Header ******************************************************************************/ struct rxe_atmack { __be64 orig; }; static inline u64 __atmack_orig(void *arg) { struct rxe_atmack *atmack = arg; return be64_to_cpu(atmack->orig); } static inline void __atmack_set_orig(void *arg, u64 orig) { struct rxe_atmack *atmack = arg; atmack->orig = cpu_to_be64(orig); } static inline u64 atmack_orig(struct rxe_pkt_info *pkt) { return __atmack_orig(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMACK]); } static inline void atmack_set_orig(struct rxe_pkt_info *pkt, u64 orig) { __atmack_set_orig(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMACK], orig); } /****************************************************************************** * Immediate Extended Transport Header ******************************************************************************/ struct rxe_immdt { __be32 imm; }; static inline __be32 __immdt_imm(void *arg) { struct rxe_immdt *immdt = arg; return immdt->imm; } static inline void __immdt_set_imm(void *arg, __be32 imm) { struct rxe_immdt *immdt = arg; immdt->imm = imm; } static inline __be32 immdt_imm(struct rxe_pkt_info *pkt) { return __immdt_imm(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_IMMDT]); } static inline void immdt_set_imm(struct rxe_pkt_info *pkt, __be32 imm) { __immdt_set_imm(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_IMMDT], imm); } /****************************************************************************** * Invalidate Extended Transport Header ******************************************************************************/ struct rxe_ieth { __be32 rkey; }; static inline u32 __ieth_rkey(void *arg) { struct rxe_ieth *ieth = arg; return be32_to_cpu(ieth->rkey); } static inline void __ieth_set_rkey(void *arg, u32 rkey) { struct rxe_ieth *ieth = arg; ieth->rkey = cpu_to_be32(rkey); } static inline u32 ieth_rkey(struct rxe_pkt_info *pkt) { return __ieth_rkey(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_IETH]); } static inline void ieth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) { __ieth_set_rkey(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_IETH], rkey); } enum rxe_hdr_length { RXE_BTH_BYTES = sizeof(struct rxe_bth), RXE_DETH_BYTES = sizeof(struct rxe_deth), RXE_IMMDT_BYTES = sizeof(struct rxe_immdt), RXE_RETH_BYTES = sizeof(struct rxe_reth), RXE_AETH_BYTES = sizeof(struct rxe_aeth), RXE_ATMACK_BYTES = sizeof(struct rxe_atmack), RXE_ATMETH_BYTES = sizeof(struct rxe_atmeth), RXE_IETH_BYTES = sizeof(struct rxe_ieth), RXE_RDETH_BYTES = sizeof(struct rxe_rdeth), RXE_FETH_BYTES = sizeof(struct rxe_feth), }; static inline size_t header_size(struct rxe_pkt_info *pkt) { return rxe_opcode[pkt->opcode].length; } static inline void *payload_addr(struct rxe_pkt_info *pkt) { return pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD]; } static inline size_t payload_size(struct rxe_pkt_info *pkt) { return pkt->paylen - rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD] - bth_pad(pkt) - RXE_ICRC_SIZE; } #endif /* RXE_HDR_H */
3 3 1 2 2 3 54 884 876 13 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 // SPDX-License-Identifier: GPL-2.0-only #include <linux/etherdevice.h> #include <linux/if_macvlan.h> #include <linux/if_tap.h> #include <linux/if_vlan.h> #include <linux/interrupt.h> #include <linux/nsproxy.h> #include <linux/compat.h> #include <linux/if_tun.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/sched/signal.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/cdev.h> #include <linux/idr.h> #include <linux/fs.h> #include <linux/uio.h> #include <net/net_namespace.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <linux/virtio_net.h> #include <linux/skb_array.h> struct macvtap_dev { struct macvlan_dev vlan; struct tap_dev tap; }; /* * Variables for dealing with macvtaps device numbers. */ static dev_t macvtap_major; static const void *macvtap_net_namespace(const struct device *d) { const struct net_device *dev = to_net_dev(d->parent); return dev_net(dev); } static struct class macvtap_class = { .name = "macvtap", .ns_type = &net_ns_type_operations, .namespace = macvtap_net_namespace, }; static struct cdev macvtap_cdev; #define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \ NETIF_F_TSO6) static void macvtap_count_tx_dropped(struct tap_dev *tap) { struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap); struct macvlan_dev *vlan = &vlantap->vlan; this_cpu_inc(vlan->pcpu_stats->tx_dropped); } static void macvtap_count_rx_dropped(struct tap_dev *tap) { struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap); struct macvlan_dev *vlan = &vlantap->vlan; macvlan_count_rx(vlan, 0, 0, 0); } static void macvtap_update_features(struct tap_dev *tap, netdev_features_t features) { struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap); struct macvlan_dev *vlan = &vlantap->vlan; vlan->set_features = features; netdev_update_features(vlan->dev); } static int macvtap_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct macvtap_dev *vlantap = netdev_priv(dev); int err; INIT_LIST_HEAD(&vlantap->tap.queue_list); /* Since macvlan supports all offloads by default, make * tap support all offloads also. */ vlantap->tap.tap_features = TUN_OFFLOADS; /* Register callbacks for rx/tx drops accounting and updating * net_device features */ vlantap->tap.count_tx_dropped = macvtap_count_tx_dropped; vlantap->tap.count_rx_dropped = macvtap_count_rx_dropped; vlantap->tap.update_features = macvtap_update_features; err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap); if (err) return err; /* Don't put anything that may fail after macvlan_common_newlink * because we can't undo what it does. */ err = macvlan_common_newlink(src_net, dev, tb, data, extack); if (err) { netdev_rx_handler_unregister(dev); return err; } vlantap->tap.dev = vlantap->vlan.dev; return 0; } static void macvtap_dellink(struct net_device *dev, struct list_head *head) { struct macvtap_dev *vlantap = netdev_priv(dev); netdev_rx_handler_unregister(dev); tap_del_queues(&vlantap->tap); macvlan_dellink(dev, head); } static void macvtap_setup(struct net_device *dev) { macvlan_common_setup(dev); dev->tx_queue_len = TUN_READQ_SIZE; } static struct net *macvtap_link_net(const struct net_device *dev) { return dev_net(macvlan_dev_real_dev(dev)); } static struct rtnl_link_ops macvtap_link_ops __read_mostly = { .kind = "macvtap", .setup = macvtap_setup, .newlink = macvtap_newlink, .dellink = macvtap_dellink, .get_link_net = macvtap_link_net, .priv_size = sizeof(struct macvtap_dev), }; static int macvtap_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct macvtap_dev *vlantap; struct device *classdev; dev_t devt; int err; char tap_name[IFNAMSIZ]; if (dev->rtnl_link_ops != &macvtap_link_ops) return NOTIFY_DONE; snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex); vlantap = netdev_priv(dev); switch (event) { case NETDEV_REGISTER: /* Create the device node here after the network device has * been registered but before register_netdevice has * finished running. */ err = tap_get_minor(macvtap_major, &vlantap->tap); if (err) return notifier_from_errno(err); devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor); classdev = device_create(&macvtap_class, &dev->dev, devt, dev, "%s", tap_name); if (IS_ERR(classdev)) { tap_free_minor(macvtap_major, &vlantap->tap); return notifier_from_errno(PTR_ERR(classdev)); } err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj, tap_name); if (err) return notifier_from_errno(err); break; case NETDEV_UNREGISTER: /* vlan->minor == 0 if NETDEV_REGISTER above failed */ if (vlantap->tap.minor == 0) break; sysfs_remove_link(&dev->dev.kobj, tap_name); devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor); device_destroy(&macvtap_class, devt); tap_free_minor(macvtap_major, &vlantap->tap); break; case NETDEV_CHANGE_TX_QUEUE_LEN: if (tap_queue_resize(&vlantap->tap)) return NOTIFY_BAD; break; } return NOTIFY_DONE; } static struct notifier_block macvtap_notifier_block __read_mostly = { .notifier_call = macvtap_device_event, }; static int __init macvtap_init(void) { int err; err = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap", THIS_MODULE); if (err) goto out1; err = class_register(&macvtap_class); if (err) goto out2; err = register_netdevice_notifier(&macvtap_notifier_block); if (err) goto out3; err = macvlan_link_register(&macvtap_link_ops); if (err) goto out4; return 0; out4: unregister_netdevice_notifier(&macvtap_notifier_block); out3: class_unregister(&macvtap_class); out2: tap_destroy_cdev(macvtap_major, &macvtap_cdev); out1: return err; } module_init(macvtap_init); static void __exit macvtap_exit(void) { rtnl_link_unregister(&macvtap_link_ops); unregister_netdevice_notifier(&macvtap_notifier_block); class_unregister(&macvtap_class); tap_destroy_cdev(macvtap_major, &macvtap_cdev); } module_exit(macvtap_exit); MODULE_ALIAS_RTNL_LINK("macvtap"); MODULE_DESCRIPTION("MAC-VLAN based tap driver"); MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>"); MODULE_LICENSE("GPL");
13 13 12 20 20 2 75 5 1 65 1 2 1 51 1 15 1 46 3 1 19 22 64 1 5 3 2 1 5 56 55 9 5 52 54 1 7 3 47 1 55 1 1 1 37 37 36 34 36 36 36 36 36 36 21 33 46 47 17 1 16 1 4 10 10 2 2 1 2 1 1 4 4 4 4 23 14 37 37 37 37 17 17 16 17 17 17 17 16 17 17 17 17 11 6 13 4 4 4 17 17 17 4 5 25 25 2 19 21 3 18 18 4 3 1 13 1 1 7 10 4 4 4 13 17 17 44 40 26 2 25 23 1 90 85 3 12 12 12 5 7 7 83 44 5 39 5 79 5 6 1 1 3 2 1 1 1 1 2 1 1 67 67 79 79 1 78 77 76 74 69 48 1 6 1 5 4 4 1 46 8 2 45 45 1 2 1 20 37 6 84 4 80 79 2 76 71 7 69 69 61 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 // SPDX-License-Identifier: GPL-2.0-or-later /* * TCP over IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on: * linux/net/ipv4/tcp.c * linux/net/ipv4/tcp_input.c * linux/net/ipv4/tcp_output.c * * Fixes: * Hideaki YOSHIFUJI : sin6_scope_id support * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind * a single port at the same time. * YOSHIFUJI Hideaki @USAGI: convert /proc/net/tcp6 to seq_file. */ #include <linux/bottom_half.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/jiffies.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/init.h> #include <linux/jhash.h> #include <linux/ipsec.h> #include <linux/times.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/random.h> #include <linux/indirect_call_wrapper.h> #include <net/tcp.h> #include <net/ndisc.h> #include <net/inet6_hashtables.h> #include <net/inet6_connection_sock.h> #include <net/ipv6.h> #include <net/transp_v6.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/ip6_checksum.h> #include <net/inet_ecn.h> #include <net/protocol.h> #include <net/xfrm.h> #include <net/snmp.h> #include <net/dsfield.h> #include <net/timewait_sock.h> #include <net/inet_common.h> #include <net/secure_seq.h> #include <net/hotdata.h> #include <net/busy_poll.h> #include <net/rstreason.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <crypto/hash.h> #include <linux/scatterlist.h> #include <trace/events/tcp.h> static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, enum sk_rst_reason reason); static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); static const struct inet_connection_sock_af_ops ipv6_mapped; const struct inet_connection_sock_af_ops ipv6_specific; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; #endif /* Helper returning the inet6 address from a given tcp socket. * It can be used in TCP stack instead of inet6_sk(sk). * This avoids a dereference and allow compiler optimizations. * It is a specialized version of inet6_sk_generic(). */ #define tcp_inet6_sk(sk) (&container_of_const(tcp_sk(sk), \ struct tcp6_sock, tcp)->inet6) static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); if (dst && dst_hold_safe(dst)) { rcu_assign_pointer(sk->sk_rx_dst, dst); sk->sk_rx_dst_ifindex = skb->skb_iif; sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); } } static u32 tcp_v6_init_seq(const struct sk_buff *skb) { return secure_tcpv6_seq(ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32, tcp_hdr(skb)->dest, tcp_hdr(skb)->source); } static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb) { return secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32); } static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { /* This check is replicated from tcp_v6_connect() and intended to * prevent BPF program called below from accessing bytes that are out * of the bound specified by user in addr_len. */ if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; sock_owned_by_me(sk); return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, &addr_len); } static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_connection_sock *icsk = inet_csk(sk); struct in6_addr *saddr = NULL, *final_p, final; struct inet_timewait_death_row *tcp_death_row; struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct ipv6_txoptions *opt; struct dst_entry *dst; struct flowi6 fl6; int addr_type; int err; if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; memset(&fl6, 0, sizeof(fl6)); if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; IP6_ECN_flow_init(fl6.flowlabel); if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; fl6_sock_release(flowlabel); } } /* * connect() to INADDR_ANY means loopback (BSD'ism). */ if (ipv6_addr_any(&usin->sin6_addr)) { if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), &usin->sin6_addr); else usin->sin6_addr = in6addr_loopback; } addr_type = ipv6_addr_type(&usin->sin6_addr); if (addr_type & IPV6_ADDR_MULTICAST) return -ENETUNREACH; if (addr_type&IPV6_ADDR_LINKLOCAL) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { /* If interface is set while binding, indices * must coincide. */ if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) return -EINVAL; sk->sk_bound_dev_if = usin->sin6_scope_id; } /* Connect to link-local address requires an interface */ if (!sk->sk_bound_dev_if) return -EINVAL; } if (tp->rx_opt.ts_recent_stamp && !ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) { tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; WRITE_ONCE(tp->write_seq, 0); } sk->sk_v6_daddr = usin->sin6_addr; np->flow_label = fl6.flowlabel; /* * TCP over IPv4 */ if (addr_type & IPV6_ADDR_MAPPED) { u32 exthdrlen = icsk->icsk_ext_hdr_len; struct sockaddr_in sin; if (ipv6_only_sock(sk)) return -ENETUNREACH; sin.sin_family = AF_INET; sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv6_mapped); if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, true); sk->sk_backlog_rcv = tcp_v4_do_rcv; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); if (err) { icsk->icsk_ext_hdr_len = exthdrlen; /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv6_specific); if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, false); sk->sk_backlog_rcv = tcp_v6_do_rcv; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tp->af_specific = &tcp_sock_ipv6_specific; #endif goto failure; } np->saddr = sk->sk_v6_rcv_saddr; return err; } if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) saddr = &sk->sk_v6_rcv_saddr; fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = sk->sk_v6_daddr; fl6.saddr = saddr ? *saddr : np->saddr; fl6.flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; fl6.flowi6_uid = sk->sk_uid; opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto failure; } tp->tcp_usec_ts = dst_tcp_usec_ts(dst); tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!saddr) { saddr = &fl6.saddr; err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); if (err) goto failure; } /* set the source address */ np->saddr = *saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; ip6_dst_store(sk, dst, NULL, NULL); icsk->icsk_ext_hdr_len = 0; if (opt) icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); inet->inet_dport = usin->sin6_port; tcp_set_state(sk, TCP_SYN_SENT); err = inet6_hash_connect(tcp_death_row, sk); if (err) goto late_failure; sk_set_txhash(sk); if (likely(!tp->repair)) { if (!tp->write_seq) WRITE_ONCE(tp->write_seq, secure_tcpv6_seq(np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32, inet->inet_sport, inet->inet_dport)); tp->tsoffset = secure_tcpv6_ts_off(net, np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32); } if (tcp_fastopen_defer_connect(sk, &err)) return err; if (err) goto late_failure; err = tcp_connect(sk); if (err) goto late_failure; return 0; late_failure: tcp_set_state(sk, TCP_CLOSE); inet_bhash2_reset_saddr(sk); failure: inet->inet_dport = 0; sk->sk_route_caps = 0; return err; } static void tcp_v6_mtu_reduced(struct sock *sk) { struct dst_entry *dst; u32 mtu; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) return; mtu = READ_ONCE(tcp_sk(sk)->mtu_info); /* Drop requests trying to increase our current mss. * Check done in __ip6_rt_update_pmtu() is too late. */ if (tcp_mtu_to_mss(sk, mtu) >= tcp_sk(sk)->mss_cache) return; dst = inet6_csk_update_pmtu(sk, mtu); if (!dst) return; if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { tcp_sync_mss(sk, dst_mtu(dst)); tcp_simple_retransmit(sk); } } static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); struct net *net = dev_net(skb->dev); struct request_sock *fastopen; struct ipv6_pinfo *np; struct tcp_sock *tp; __u32 seq, snd_una; struct sock *sk; bool fatal; int err; sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &hdr->daddr, th->dest, &hdr->saddr, ntohs(th->source), skb->dev->ifindex, inet6_sdif(skb)); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return -ENOENT; } if (sk->sk_state == TCP_TIME_WAIT) { /* To increase the counter of ignored icmps for TCP-AO */ tcp_ao_ignore_icmp(sk, AF_INET6, type, code); inet_twsk_put(inet_twsk(sk)); return 0; } seq = ntohl(th->seq); fatal = icmpv6_err_convert(type, code, &err); if (sk->sk_state == TCP_NEW_SYN_RECV) { tcp_req_err(sk, seq, fatal); return 0; } if (tcp_ao_ignore_icmp(sk, AF_INET6, type, code)) { sock_put(sk); return 0; } bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) goto out; if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto out; } } tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = rcu_dereference(tp->fastopen_rsk); snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } np = tcp_inet6_sk(sk); if (type == NDISC_REDIRECT) { if (!sock_owned_by_user(sk)) { struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); if (dst) dst->ops->redirect(dst, sk, skb); } goto out; } if (type == ICMPV6_PKT_TOOBIG) { u32 mtu = ntohl(info); /* We are not interested in TCP_LISTEN and open_requests * (SYN-ACKs send out by Linux are always <576bytes so * they should go through unfragmented). */ if (sk->sk_state == TCP_LISTEN) goto out; if (!ip6_sk_accept_pmtu(sk)) goto out; if (mtu < IPV6_MIN_MTU) goto out; WRITE_ONCE(tp->mtu_info, mtu); if (!sock_owned_by_user(sk)) tcp_v6_mtu_reduced(sk); else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); goto out; } /* Might be for an request_sock */ switch (sk->sk_state) { case TCP_SYN_SENT: case TCP_SYN_RECV: /* Only in fast or simultaneous open. If a fast open socket is * already accepted it is treated as a connected one below. */ if (fastopen && !fastopen->sk) break; ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th); if (!sock_owned_by_user(sk)) tcp_done_with_error(sk, err); else WRITE_ONCE(sk->sk_err_soft, err); goto out; case TCP_LISTEN: break; default: /* check if this ICMP message allows revert of backoff. * (see RFC 6069) */ if (!fastopen && type == ICMPV6_DEST_UNREACH && code == ICMPV6_NOROUTE) tcp_ld_RTO_revert(sk, seq); } if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) { WRITE_ONCE(sk->sk_err, err); sk_error_report(sk); } else { WRITE_ONCE(sk->sk_err_soft, err); } out: bh_unlock_sock(sk); sock_put(sk); return 0; } static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct ipv6_txoptions *opt; struct flowi6 *fl6 = &fl->u.ip6; struct sk_buff *skb; int err = -ENOMEM; u8 tclass; /* First, grab a route. */ if (!dst && (dst = inet6_csk_route_req(sk, fl6, req, IPPROTO_TCP)) == NULL) goto done; skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); fl6->daddr = ireq->ir_v6_rmt_addr; if (inet6_test_bit(REPFLOW, sk) && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); tclass = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ? (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | (np->tclass & INET_ECN_MASK) : np->tclass; if (!INET_ECN_is_capable(tclass) && tcp_bpf_ca_needs_ecn((struct sock *)req)) tclass |= INET_ECN_ECT_0; rcu_read_lock(); opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); err = ip6_xmit(sk, skb, fl6, skb->mark ? : READ_ONCE(sk->sk_mark), opt, tclass, READ_ONCE(sk->sk_priority)); rcu_read_unlock(); err = net_xmit_eval(err); } done: return err; } static void tcp_v6_reqsk_destructor(struct request_sock *req) { kfree(inet_rsk(req)->ipv6_opt); consume_skb(inet_rsk(req)->pktopts); } #ifdef CONFIG_TCP_MD5SIG static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, const struct in6_addr *addr, int l3index) { return tcp_md5_do_lookup(sk, l3index, (union tcp_md5_addr *)addr, AF_INET6); } static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { int l3index; l3index = l3mdev_master_ifindex_by_index(sock_net(sk), addr_sk->sk_bound_dev_if); return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr, l3index); } static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct tcp_md5sig cmd; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; union tcp_ao_addr *addr; int l3index = 0; u8 prefixlen; bool l3flag; u8 flags; if (optlen < sizeof(cmd)) return -EINVAL; if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) return -EFAULT; if (sin6->sin6_family != AF_INET6) return -EINVAL; flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; if (optname == TCP_MD5SIG_EXT && cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { prefixlen = cmd.tcpm_prefixlen; if (prefixlen > 128 || (ipv6_addr_v4mapped(&sin6->sin6_addr) && prefixlen > 32)) return -EINVAL; } else { prefixlen = ipv6_addr_v4mapped(&sin6->sin6_addr) ? 32 : 128; } if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); if (dev && netif_is_l3_master(dev)) l3index = dev->ifindex; rcu_read_unlock(); /* ok to reference set/not set outside of rcu; * right now device MUST be an L3 master */ if (!dev || !l3index) return -EINVAL; } if (!cmd.tcpm_keylen) { if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], AF_INET, prefixlen, l3index, flags); return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr, AF_INET6, prefixlen, l3index, flags); } if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; if (ipv6_addr_v4mapped(&sin6->sin6_addr)) { addr = (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3]; /* Don't allow keys for peers that have a matching TCP-AO key. * See the comment in tcp_ao_add_cmd() */ if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false)) return -EKEYREJECTED; return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen); } addr = (union tcp_md5_addr *)&sin6->sin6_addr; /* Don't allow keys for peers that have a matching TCP-AO key. * See the comment in tcp_ao_add_cmd() */ if (tcp_ao_required(sk, addr, AF_INET6, l3flag ? l3index : -1, false)) return -EKEYREJECTED; return tcp_md5_do_add(sk, addr, AF_INET6, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen); } static int tcp_v6_md5_hash_headers(struct tcp_sigpool *hp, const struct in6_addr *daddr, const struct in6_addr *saddr, const struct tcphdr *th, int nbytes) { struct tcp6_pseudohdr *bp; struct scatterlist sg; struct tcphdr *_th; bp = hp->scratch; /* 1. TCP pseudo-header (RFC2460) */ bp->saddr = *saddr; bp->daddr = *daddr; bp->protocol = cpu_to_be32(IPPROTO_TCP); bp->len = cpu_to_be32(nbytes); _th = (struct tcphdr *)(bp + 1); memcpy(_th, th, sizeof(*th)); _th->check = 0; sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); ahash_request_set_crypt(hp->req, &sg, NULL, sizeof(*bp) + sizeof(*th)); return crypto_ahash_update(hp->req); } static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, const struct in6_addr *daddr, struct in6_addr *saddr, const struct tcphdr *th) { struct tcp_sigpool hp; if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) goto clear_hash_nostart; if (crypto_ahash_init(hp.req)) goto clear_hash; if (tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(&hp, key)) goto clear_hash; ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); if (crypto_ahash_final(hp.req)) goto clear_hash; tcp_sigpool_end(&hp); return 0; clear_hash: tcp_sigpool_end(&hp); clear_hash_nostart: memset(md5_hash, 0, 16); return 1; } static int tcp_v6_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); const struct in6_addr *saddr, *daddr; struct tcp_sigpool hp; if (sk) { /* valid for establish/request sockets */ saddr = &sk->sk_v6_rcv_saddr; daddr = &sk->sk_v6_daddr; } else { const struct ipv6hdr *ip6h = ipv6_hdr(skb); saddr = &ip6h->saddr; daddr = &ip6h->daddr; } if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) goto clear_hash_nostart; if (crypto_ahash_init(hp.req)) goto clear_hash; if (tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) goto clear_hash; if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(&hp, key)) goto clear_hash; ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); if (crypto_ahash_final(hp.req)) goto clear_hash; tcp_sigpool_end(&hp); return 0; clear_hash: tcp_sigpool_end(&hp); clear_hash_nostart: memset(md5_hash, 0, 16); return 1; } #endif static void tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb, u32 tw_isn) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = tcp_inet6_sk(sk_listener); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; /* So that link locals have meaning */ if ((!sk_listener->sk_bound_dev_if || l3_slave) && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); if (!tw_isn && (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || inet6_test_bit(REPFLOW, sk_listener))) { refcount_inc(&skb->users); ireq->pktopts = skb; } } static struct dst_entry *tcp_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req, u32 tw_isn) { tcp_v6_init_req(req, sk, skb, tw_isn); if (security_inet_conn_request(sk, skb, req)) return NULL; return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP); } struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .family = AF_INET6, .obj_size = sizeof(struct tcp6_request_sock), .rtx_syn_ack = tcp_rtx_synack, .send_ack = tcp_v6_reqsk_send_ack, .destructor = tcp_v6_reqsk_destructor, .send_reset = tcp_v6_send_reset, .syn_ack_timeout = tcp_syn_ack_timeout, }; const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr), #ifdef CONFIG_TCP_MD5SIG .req_md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v6_ao_lookup_rsk, .ao_calc_key = tcp_v6_ao_calc_key_rsk, .ao_synack_hash = tcp_v6_ao_synack_hash, #endif #ifdef CONFIG_SYN_COOKIES .cookie_init_seq = cookie_v6_init_sequence, #endif .route_req = tcp_v6_route_req, .init_seq = tcp_v6_init_seq, .init_ts_off = tcp_v6_init_ts_off, .send_synack = tcp_v6_send_synack, }; static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, int rst, u8 tclass, __be32 label, u32 priority, u32 txhash, struct tcp_key *key) { const struct tcphdr *th = tcp_hdr(skb); struct tcphdr *t1; struct sk_buff *buff; struct flowi6 fl6; struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); struct sock *ctl_sk = net->ipv6.tcp_sk; unsigned int tot_len = sizeof(struct tcphdr); __be32 mrst = 0, *topt; struct dst_entry *dst; __u32 mark = 0; if (tsecr) tot_len += TCPOLEN_TSTAMP_ALIGNED; if (tcp_key_is_md5(key)) tot_len += TCPOLEN_MD5SIG_ALIGNED; if (tcp_key_is_ao(key)) tot_len += tcp_ao_len_aligned(key->ao_key); #ifdef CONFIG_MPTCP if (rst && !tcp_key_is_md5(key)) { mrst = mptcp_reset_option(skb); if (mrst) tot_len += sizeof(__be32); } #endif buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (!buff) return; skb_reserve(buff, MAX_TCP_HEADER); t1 = skb_push(buff, tot_len); skb_reset_transport_header(buff); /* Swap the send and the receive. */ memset(t1, 0, sizeof(*t1)); t1->dest = th->source; t1->source = th->dest; t1->doff = tot_len / 4; t1->seq = htonl(seq); t1->ack_seq = htonl(ack); t1->ack = !rst || !th->ack; t1->rst = rst; t1->window = htons(win); topt = (__be32 *)(t1 + 1); if (tsecr) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); *topt++ = htonl(tsval); *topt++ = htonl(tsecr); } if (mrst) *topt++ = mrst; #ifdef CONFIG_TCP_MD5SIG if (tcp_key_is_md5(key)) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); tcp_v6_md5_hash_hdr((__u8 *)topt, key->md5_key, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, t1); } #endif #ifdef CONFIG_TCP_AO if (tcp_key_is_ao(key)) { *topt++ = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key->ao_key) << 16) | (key->ao_key->sndid << 8) | (key->rcv_next)); tcp_ao_hash_hdr(AF_INET6, (char *)topt, key->ao_key, key->traffic_key, (union tcp_ao_addr *)&ipv6_hdr(skb)->saddr, (union tcp_ao_addr *)&ipv6_hdr(skb)->daddr, t1, key->sne); } #endif memset(&fl6, 0, sizeof(fl6)); fl6.daddr = ipv6_hdr(skb)->saddr; fl6.saddr = ipv6_hdr(skb)->daddr; fl6.flowlabel = label; buff->ip_summed = CHECKSUM_PARTIAL; __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr); fl6.flowi6_proto = IPPROTO_TCP; if (rt6_need_strict(&fl6.daddr) && !oif) fl6.flowi6_oif = tcp_v6_iif(skb); else { if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) oif = skb->skb_iif; fl6.flowi6_oif = oif; } if (sk) { /* unconstify the socket only to attach it to buff with care. */ skb_set_owner_edemux(buff, (struct sock *)sk); if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; else mark = READ_ONCE(sk->sk_mark); skb_set_delivery_time(buff, tcp_transmit_time(sk), SKB_CLOCK_MONOTONIC); } if (txhash) { /* autoflowlabel/skb_get_hash_flowi6 rely on buff->hash */ skb_set_hash(buff, txhash, PKT_HASH_TYPE_L4); } fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark; fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); /* Pass a socket to ip6_dst_lookup either it is for RST * Underlying function will use this to retrieve the network * namespace */ if (sk && sk->sk_state != TCP_TIME_WAIT) dst = ip6_dst_lookup_flow(net, sk, &fl6, NULL); /*sk's xfrm_policy can be referred*/ else dst = ip6_dst_lookup_flow(net, ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass & ~INET_ECN_MASK, priority); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); return; } kfree_skb(buff); } static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, enum sk_rst_reason reason) { const struct tcphdr *th = tcp_hdr(skb); struct ipv6hdr *ipv6h = ipv6_hdr(skb); const __u8 *md5_hash_location = NULL; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) bool allocated_traffic_key = false; #endif const struct tcp_ao_hdr *aoh; struct tcp_key key = {}; u32 seq = 0, ack_seq = 0; __be32 label = 0; u32 priority = 0; struct net *net; u32 txhash = 0; int oif = 0; #ifdef CONFIG_TCP_MD5SIG unsigned char newhash[16]; int genhash; struct sock *sk1 = NULL; #endif if (th->rst) return; /* If sk not NULL, it means we did a successful lookup and incoming * route had to be correct. prequeue might have dropped our dst. */ if (!sk && !ipv6_unicast_destination(skb)) return; net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(th, &md5_hash_location, &aoh)) return; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_lock(); #endif #ifdef CONFIG_TCP_MD5SIG if (sk && sk_fullsock(sk)) { int l3index; /* sdif set, means packet ingressed via a device * in an L3 domain and inet_iif is set to it. */ l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index); if (key.md5_key) key.type = TCP_KEY_MD5; } else if (md5_hash_location) { int dif = tcp_v6_iif_l3_slave(skb); int sdif = tcp_v6_sdif(skb); int l3index; /* * active side is lost. Try to find listening socket through * source port, and then find md5 key through listening socket. * we are not loose security here: * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->source), dif, sdif); if (!sk1) goto out; /* sdif set, means packet ingressed via a device * in an L3 domain and dif is set to it. */ l3index = tcp_v6_sdif(skb) ? dif : 0; key.md5_key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr, l3index); if (!key.md5_key) goto out; key.type = TCP_KEY_MD5; genhash = tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb); if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) goto out; } #endif if (th->ack) seq = ntohl(th->ack_seq); else ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - (th->doff << 2); #ifdef CONFIG_TCP_AO if (aoh) { int l3index; l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, seq, &key.ao_key, &key.traffic_key, &allocated_traffic_key, &key.rcv_next, &key.sne)) goto out; key.type = TCP_KEY_AO; } #endif if (sk) { oif = sk->sk_bound_dev_if; if (sk_fullsock(sk)) { if (inet6_test_bit(REPFLOW, sk)) label = ip6_flowlabel(ipv6h); priority = READ_ONCE(sk->sk_priority); txhash = sk->sk_txhash; } if (sk->sk_state == TCP_TIME_WAIT) { label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); priority = inet_twsk(sk)->tw_priority; txhash = inet_twsk(sk)->tw_txhash; } } else { if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET) label = ip6_flowlabel(ipv6h); } trace_tcp_send_reset(sk, skb, reason); tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1, ipv6_get_dsfield(ipv6h), label, priority, txhash, &key); #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) out: if (allocated_traffic_key) kfree(key.traffic_key); rcu_read_unlock(); #endif } static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_key *key, u8 tclass, __be32 label, u32 priority, u32 txhash) { tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, 0, tclass, label, priority, txhash, key); } static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); struct tcp_key key = {}; #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; if (static_branch_unlikely(&tcp_ao_needed.key)) { /* FIXME: the segment to-be-acked is not verified yet */ ao_info = rcu_dereference(tcptw->ao_info); if (ao_info) { const struct tcp_ao_hdr *aoh; /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) goto out; if (aoh) key.ao_key = tcp_ao_established_key(sk, ao_info, aoh->rnext_keyid, -1); } } if (key.ao_key) { struct tcp_ao_key *rnext_key; key.traffic_key = snd_other_key(key.ao_key); /* rcv_next switches to our rcv_next */ rnext_key = READ_ONCE(ao_info->rnext_key); key.rcv_next = rnext_key->rcvid; key.sne = READ_ONCE(ao_info->snd_sne); key.type = TCP_KEY_AO; #else if (0) { #endif #ifdef CONFIG_TCP_MD5SIG } else if (static_branch_unlikely(&tcp_md5_needed.key)) { key.md5_key = tcp_twsk_md5_key(tcptw); if (key.md5_key) key.type = TCP_KEY_MD5; #endif } tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt), tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_tw_tsval(tcptw), READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if, &key, tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority, tw->tw_txhash); #ifdef CONFIG_TCP_AO out: #endif inet_twsk_put(tw); } static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct tcp_key key = {}; #ifdef CONFIG_TCP_AO if (static_branch_unlikely(&tcp_ao_needed.key) && tcp_rsk_used_ao(req)) { const struct in6_addr *addr = &ipv6_hdr(skb)->saddr; const struct tcp_ao_hdr *aoh; int l3index; l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) return; if (!aoh) return; key.ao_key = tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr, AF_INET6, aoh->rnext_keyid, -1); if (unlikely(!key.ao_key)) { /* Send ACK with any matching MKT for the peer */ key.ao_key = tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr, AF_INET6, -1, -1); /* Matching key disappeared (user removed the key?) * let the handshake timeout. */ if (!key.ao_key) { net_info_ratelimited("TCP-AO key for (%pI6, %d)->(%pI6, %d) suddenly disappeared, won't ACK new connection\n", addr, ntohs(tcp_hdr(skb)->source), &ipv6_hdr(skb)->daddr, ntohs(tcp_hdr(skb)->dest)); return; } } key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); if (!key.traffic_key) return; key.type = TCP_KEY_AO; key.rcv_next = aoh->keyid; tcp_v6_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); #else if (0) { #endif #ifdef CONFIG_TCP_MD5SIG } else if (static_branch_unlikely(&tcp_md5_needed.key)) { int l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index); if (key.md5_key) key.type = TCP_KEY_MD5; #endif } /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), READ_ONCE(req->ts_recent), sk->sk_bound_dev_if, &key, ipv6_get_dsfield(ipv6_hdr(skb)), 0, READ_ONCE(sk->sk_priority), READ_ONCE(tcp_rsk(req)->txhash)); if (tcp_key_is_ao(&key)) kfree(key.traffic_key); } static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) { #ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); if (!th->syn) sk = cookie_v6_check(sk, skb); #endif return sk; } u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, struct tcphdr *th, u32 *cookie) { u16 mss = 0; #ifdef CONFIG_SYN_COOKIES mss = tcp_get_syncookie_mss(&tcp6_request_sock_ops, &tcp_request_sock_ipv6_ops, sk, th); if (mss) { *cookie = __cookie_v6_init_sequence(iph, th, &mss); tcp_synq_overflow(sk); } #endif return mss; } static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_conn_request(sk, skb); if (!ipv6_unicast_destination(skb)) goto drop; if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); return 0; } return tcp_conn_request(&tcp6_request_sock_ops, &tcp_request_sock_ipv6_ops, sk, skb); drop: tcp_listendrop(sk); return 0; /* don't send reset */ } static void tcp_v6_restore_cb(struct sk_buff *skb) { /* We need to move header back to the beginning if xfrm6_policy_check() * and tcp_v6_fill_cb() are going to be called again. * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there. */ memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6, sizeof(struct inet6_skb_parm)); } static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req) { struct inet_request_sock *ireq; struct ipv6_pinfo *newnp; const struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct ipv6_txoptions *opt; struct inet_sock *newinet; bool found_dup_sk = false; struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; int l3index; #endif struct flowi6 fl6; if (skb->protocol == htons(ETH_P_IP)) { /* * v6 mapped */ newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst, req_unhash, own_req); if (!newsk) return NULL; inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); newnp = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); newnp->saddr = newsk->sk_v6_rcv_saddr; inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; if (sk_is_mptcp(newsk)) mptcpv6_handle_mapped(newsk, true); newsk->sk_backlog_rcv = tcp_v4_do_rcv; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) newtp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = inet_iif(skb); newnp->mcast_hops = ip_hdr(skb)->ttl; newnp->rcv_flowinfo = 0; if (inet6_test_bit(REPFLOW, sk)) newnp->flow_label = 0; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks count * here, tcp_create_openreq_child now does this for us, see the comment in * that function for the gory details. -acme */ /* It is tricky place. Until this moment IPv4 tcp worked with IPv6 icsk.icsk_af_ops. Sync it now. */ tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); return newsk; } ireq = inet_rsk(req); if (sk_acceptq_is_full(sk)) goto out_overflow; if (!dst) { dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP); if (!dst) goto out; } newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto out_nonewsk; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks * count here, tcp_create_openreq_child now does this for us, see the * comment in that function for the gory details. -acme */ newsk->sk_gso_type = SKB_GSO_TCPV6; inet6_sk_rx_dst_set(newsk, skb); inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); newnp = tcp_inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); ip6_dst_store(newsk, dst, NULL, NULL); newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; newnp->saddr = ireq->ir_v6_loc_addr; newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; newsk->sk_bound_dev_if = ireq->ir_iif; /* Now IPv6 options... First: no IPv4 options. */ newinet->inet_opt = NULL; newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; /* Clone RX bits */ newnp->rxopt.all = np->rxopt.all; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = tcp_v6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); if (inet6_test_bit(REPFLOW, sk)) newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); /* Set ToS of the new socket based upon the value of incoming SYN. * ECT bits are set later in tcp_init_transfer(). */ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; /* Clone native IPv6 options from listening socket (if any) Yes, keeping reference count would be much more clever, but we make one more one thing there: reattach optmem to newsk. */ opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); RCU_INIT_POINTER(newnp->opt, opt); } inet_csk(newsk)->icsk_ext_hdr_len = 0; if (opt) inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + opt->opt_flen; tcp_ca_openreq_child(newsk, dst); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); tcp_initialize_rcv_mss(newsk); newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; newinet->inet_rcv_saddr = LOOPBACK4_IPV6; #ifdef CONFIG_TCP_MD5SIG l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); if (!tcp_rsk_used_ao(req)) { /* Copy over the MD5 key from the original socket */ key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index); if (key) { const union tcp_md5_addr *addr; addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr; if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) { inet_csk_prepare_forced_close(newsk); tcp_done(newsk); goto out; } } } #endif #ifdef CONFIG_TCP_AO /* Copy over tcp_ao_info if any */ if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET6)) goto out; /* OOM */ #endif if (__inet_inherit_port(sk, newsk) < 0) { inet_csk_prepare_forced_close(newsk); tcp_done(newsk); goto out; } *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), &found_dup_sk); if (*own_req) { tcp_move_syn(newtp, req); /* Clone pktoptions received with SYN, if we own the req */ if (ireq->pktopts) { newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); consume_skb(ireq->pktopts); ireq->pktopts = NULL; if (newnp->pktoptions) tcp_v6_restore_cb(newnp->pktoptions); } } else { if (!req_unhash && found_dup_sk) { /* This code path should only be executed in the * syncookie case only */ bh_unlock_sock(newsk); sock_put(newsk); newsk = NULL; } } return newsk; out_overflow: __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); out_nonewsk: dst_release(dst); out: tcp_listendrop(sk); return NULL; } INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. * This is because we cannot sleep with the original spinlock * held. */ INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) { struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct sk_buff *opt_skb = NULL; enum skb_drop_reason reason; struct tcp_sock *tp; /* Imagine: socket is IPv6. IPv4 packet arrives, goes to IPv4 receive handler and backlogged. From backlog it always goes here. Kerboom... Fortunately, tcp_rcv_established and rcv_established handle them correctly, but it is not case with tcp_v6_hnd_req and tcp_v6_send_reset(). --ANK */ if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); /* * socket locking is here for SMP purposes as backlog rcv * is currently called with bh processing disabled. */ /* Do Stevens' IPV6_PKTOPTIONS. Yes, guys, it is the only place in our code, where we may make it not affecting IPv4. The rest of code is protocol independent, and I do not like idea to uglify IPv4. Actually, all the idea behind IPV6_PKTOPTIONS looks not very well thought. For now we latch options, received in the last packet, enqueued by tcp. Feel free to propose better solution. --ANK (980728) */ if (np->rxopt.all && sk->sk_state != TCP_LISTEN) opt_skb = skb_clone_and_charge_r(skb, sk); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst; dst = rcu_dereference_protected(sk->sk_rx_dst, lockdep_sock_is_held(sk)); sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); if (dst) { if (sk->sk_rx_dst_ifindex != skb->skb_iif || INDIRECT_CALL_1(dst->ops->check, ip6_dst_check, dst, sk->sk_rx_dst_cookie) == NULL) { RCU_INIT_POINTER(sk->sk_rx_dst, NULL); dst_release(dst); } } tcp_rcv_established(sk, skb); if (opt_skb) goto ipv6_pktoptions; return 0; } if (tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { struct sock *nsk = tcp_v6_cookie_check(sk, skb); if (nsk != sk) { if (nsk) { reason = tcp_child_process(sk, nsk, skb); if (reason) goto reset; } return 0; } } else sock_rps_save_rxhash(sk, skb); reason = tcp_rcv_state_process(sk, skb); if (reason) goto reset; if (opt_skb) goto ipv6_pktoptions; return 0; reset: tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(reason)); discard: if (opt_skb) __kfree_skb(opt_skb); sk_skb_reason_drop(sk, skb, reason); return 0; csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; ipv6_pktoptions: /* Do you ask, what is it? 1. skb was enqueued by tcp. 2. skb is added to tail of read queue, rather than out of order. 3. socket is not in passive state. 4. Finally, it really contains options, which user wants to receive. */ tp = tcp_sk(sk); if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) WRITE_ONCE(np->mcast_oif, tcp_v6_iif(opt_skb)); if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit); if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); if (inet6_test_bit(REPFLOW, sk)) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { tcp_v6_restore_cb(opt_skb); opt_skb = xchg(&np->pktoptions, opt_skb); } else { __kfree_skb(opt_skb); opt_skb = xchg(&np->pktoptions, NULL); } } consume_skb(opt_skb); return 0; } static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, const struct tcphdr *th) { /* This is tricky: we move IP6CB at its correct location into * TCP_SKB_CB(). It must be done after xfrm6_policy_check(), because * _decode_session6() uses IP6CB(). * barrier() makes sure compiler won't play aliasing games. */ memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb), sizeof(struct inet6_skb_parm)); barrier(); TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = skb->tstamp || skb_hwtstamps(skb)->hwtstamp; } INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { enum skb_drop_reason drop_reason; int sdif = inet6_sdif(skb); int dif = inet6_iif(skb); const struct tcphdr *th; const struct ipv6hdr *hdr; struct sock *sk = NULL; bool refcounted; int ret; u32 isn; struct net *net = dev_net(skb->dev); drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) goto discard_it; /* * Count it even if it's bad. */ __TCP_INC_STATS(net, TCP_MIB_INSEGS); if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; th = (const struct tcphdr *)skb->data; if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; goto bad_packet; } if (!pskb_may_pull(skb, th->doff*4)) goto discard_it; if (skb_checksum_init(skb, IPPROTO_TCP, ip6_compute_pseudo)) goto csum_error; th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); lookup: sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th), th->source, th->dest, inet6_iif(skb), sdif, &refcounted); if (!sk) goto no_tcp_socket; if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); bool req_stolen = false; struct sock *nsk; sk = req->rsk_listener; if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) drop_reason = SKB_DROP_REASON_XFRM_POLICY; else drop_reason = tcp_inbound_hash(sk, req, skb, &hdr->saddr, &hdr->daddr, AF_INET6, dif, sdif); if (drop_reason) { sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; } if (tcp_checksum_complete(skb)) { reqsk_put(req); goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); if (!nsk) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } sk = nsk; /* reuseport_migrate_sock() has already held one sk_refcnt * before returning. */ } else { sock_hold(sk); } refcounted = true; nsk = NULL; if (!tcp_filter(sk, skb)) { th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); nsk = tcp_check_req(sk, skb, req, false, &req_stolen); } else { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } if (!nsk) { reqsk_put(req); if (req_stolen) { /* Another cpu got exclusive access to req * and created a full blown socket. * Try to feed this packet to this socket * instead of discarding it. */ tcp_v6_restore_cb(skb); sock_put(sk); goto lookup; } goto discard_and_relse; } nf_reset_ct(skb); if (nsk == sk) { reqsk_put(req); tcp_v6_restore_cb(skb); } else { drop_reason = tcp_child_process(sk, nsk, skb); if (drop_reason) { enum sk_rst_reason rst_reason; rst_reason = sk_rst_convert_drop_reason(drop_reason); tcp_v6_send_reset(nsk, skb, rst_reason); goto discard_and_relse; } sock_put(sk); return 0; } } process: if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ if (unlikely(hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount))) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); drop_reason = SKB_DROP_REASON_TCP_MINTTL; goto discard_and_relse; } } if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto discard_and_relse; } drop_reason = tcp_inbound_hash(sk, NULL, skb, &hdr->saddr, &hdr->daddr, AF_INET6, dif, sdif); if (drop_reason) goto discard_and_relse; nf_reset_ct(skb); if (tcp_filter(sk, skb)) { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto discard_and_relse; } th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); skb->dev = NULL; if (sk->sk_state == TCP_LISTEN) { ret = tcp_v6_do_rcv(sk, skb); goto put_and_return; } sk_incoming_cpu_update(sk); bh_lock_sock_nested(sk); tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { ret = tcp_v6_do_rcv(sk, skb); } else { if (tcp_add_backlog(sk, skb, &drop_reason)) goto discard_and_relse; } bh_unlock_sock(sk); put_and_return: if (refcounted) sock_put(sk); return ret ? -1 : 0; no_tcp_socket: drop_reason = SKB_DROP_REASON_NO_SOCKET; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; tcp_v6_fill_cb(skb, hdr, th); if (tcp_checksum_complete(skb)) { csum_error: drop_reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { tcp_v6_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); } discard_it: SKB_DR_OR(drop_reason, NOT_SPECIFIED); sk_skb_reason_drop(sk, skb, drop_reason); return 0; discard_and_relse: sk_drops_add(sk, skb); if (refcounted) sock_put(sk); goto discard_it; do_time_wait: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; inet_twsk_put(inet_twsk(sk)); goto discard_it; } tcp_v6_fill_cb(skb, hdr, th); if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; } switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { case TCP_TW_SYN: { struct sock *sk2; sk2 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), tcp_v6_iif_l3_slave(skb), sdif); if (sk2) { struct inet_timewait_sock *tw = inet_twsk(sk); inet_twsk_deschedule_put(tw); sk = sk2; tcp_v6_restore_cb(skb); refcounted = false; __this_cpu_write(tcp_tw_isn, isn); goto process; } } /* to ACK */ fallthrough; case TCP_TW_ACK: tcp_v6_timewait_ack(sk, skb); break; case TCP_TW_RST: tcp_v6_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; case TCP_TW_SUCCESS: ; } goto discard_it; } void tcp_v6_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct ipv6hdr *hdr; const struct tcphdr *th; struct sock *sk; if (skb->pkt_type != PACKET_HOST) return; if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) return; hdr = ipv6_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) return; /* Note : We use inet6_iif() here, not tcp_v6_iif() */ sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), inet6_iif(skb), inet6_sdif(skb)); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; if (sk_fullsock(sk)) { struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, sk->sk_rx_dst_cookie); if (dst && sk->sk_rx_dst_ifindex == skb->skb_iif) skb_dst_set_noref(skb, dst); } } } static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), .twsk_destructor = tcp_twsk_destructor, }; INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) { __tcp_v6_send_check(skb, &sk->sk_v6_rcv_saddr, &sk->sk_v6_daddr); } const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, .sk_rx_dst_set = inet6_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct ipv6hdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), .mtu_reduced = tcp_v6_mtu_reduced, }; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { #ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, .md5_parse = tcp_v6_parse_md5_keys, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v6_ao_lookup, .calc_ao_hash = tcp_v6_ao_hash_skb, .ao_parse = tcp_v6_parse_ao, .ao_calc_key_sk = tcp_v6_ao_calc_key_sk, #endif }; #endif /* * TCP over IPv4 via INET6 API */ static const struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct iphdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), .mtu_reduced = tcp_v4_mtu_reduced, }; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { #ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, .md5_parse = tcp_v6_parse_md5_keys, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v6_ao_lookup, .calc_ao_hash = tcp_v4_ao_hash_skb, .ao_parse = tcp_v6_parse_ao, .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, #endif }; #endif /* NOTE: A lot of things set to zero explicitly by call to * sk_alloc() so need not be done here. */ static int tcp_v6_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_init_sock(sk); icsk->icsk_af_ops = &ipv6_specific; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; #endif return 0; } #ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. */ static void get_openreq6(struct seq_file *seq, const struct request_sock *req, int i) { long ttd = req->rsk_timer.expires - jiffies; const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr; const struct in6_addr *dest = &inet_rsk(req)->ir_v6_rmt_addr; if (ttd < 0) ttd = 0; seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %d %d %pK\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], inet_rsk(req)->ir_num, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], ntohs(inet_rsk(req)->ir_rmt_port), TCP_SYN_RECV, 0, 0, /* could print option size, but that is af dependent. */ 1, /* timers active (only the expire timer) */ jiffies_to_clock_t(ttd), req->num_timeout, from_kuid_munged(seq_user_ns(seq), sock_i_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); } static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) { const struct in6_addr *dest, *src; __u16 destp, srcp; int timer_active; unsigned long timer_expires; const struct inet_sock *inet = inet_sk(sp); const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; u8 icsk_pending; int rx_queue; int state; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; destp = ntohs(inet->inet_dport); srcp = ntohs(inet->inet_sport); icsk_pending = smp_load_acquire(&icsk->icsk_pending); if (icsk_pending == ICSK_TIME_RETRANS || icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout; } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; timer_expires = icsk->icsk_timeout; } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; timer_expires = sp->sk_timer.expires; } else { timer_active = 0; timer_expires = jiffies; } state = inet_sk_state_load(sp); if (state == TCP_LISTEN) rx_queue = READ_ONCE(sp->sk_ack_backlog); else /* Because we don't lock the socket, * we might find a transient negative value. */ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq), 0); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, state, READ_ONCE(tp->write_seq) - tp->snd_una, rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), icsk->icsk_probes_out, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, jiffies_to_clock_t(icsk->icsk_rto), jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sp), tcp_snd_cwnd(tp), state == TCP_LISTEN ? fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh) ); } static void get_timewait6_sock(struct seq_file *seq, struct inet_timewait_sock *tw, int i) { long delta = tw->tw_timer.expires - jiffies; const struct in6_addr *dest, *src; __u16 destp, srcp; dest = &tw->tw_v6_daddr; src = &tw->tw_v6_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, READ_ONCE(tw->tw_substate), 0, 0, 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, refcount_read(&tw->tw_refcnt), tw); } static int tcp6_seq_show(struct seq_file *seq, void *v) { struct tcp_iter_state *st; struct sock *sk = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, " sl " "local_address " "remote_address " "st tx_queue rx_queue tr tm->when retrnsmt" " uid timeout inode\n"); goto out; } st = seq->private; if (sk->sk_state == TCP_TIME_WAIT) get_timewait6_sock(seq, v, st->num); else if (sk->sk_state == TCP_NEW_SYN_RECV) get_openreq6(seq, v, st->num); else get_tcp6_sock(seq, v, st->num); out: return 0; } static const struct seq_operations tcp6_seq_ops = { .show = tcp6_seq_show, .start = tcp_seq_start, .next = tcp_seq_next, .stop = tcp_seq_stop, }; static struct tcp_seq_afinfo tcp6_seq_afinfo = { .family = AF_INET6, }; int __net_init tcp6_proc_init(struct net *net) { if (!proc_create_net_data("tcp6", 0444, net->proc_net, &tcp6_seq_ops, sizeof(struct tcp_iter_state), &tcp6_seq_afinfo)) return -ENOMEM; return 0; } void tcp6_proc_exit(struct net *net) { remove_proc_entry("tcp6", net->proc_net); } #endif struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, .close = tcp_close, .pre_connect = tcp_v6_pre_connect, .connect = tcp_v6_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v6_init_sock, .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .splice_eof = tcp_splice_eof, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, .hash = inet6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .put_port = inet_put_port, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = tcp_bpf_update_proto, #endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), .ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, .h.hashinfo = NULL, .no_autobind = true, .diag_destroy = tcp_abort, }; EXPORT_SYMBOL_GPL(tcpv6_prot); static struct inet_protosw tcpv6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcpv6_prot, .ops = &inet6_stream_ops, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }; static int __net_init tcpv6_net_init(struct net *net) { int res; res = inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, SOCK_RAW, IPPROTO_TCP, net); if (!res) net->ipv6.tcp_sk->sk_clockid = CLOCK_MONOTONIC; return res; } static void __net_exit tcpv6_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.tcp_sk); } static struct pernet_operations tcpv6_net_ops = { .init = tcpv6_net_init, .exit = tcpv6_net_exit, }; int __init tcpv6_init(void) { int ret; net_hotdata.tcpv6_protocol = (struct inet6_protocol) { .handler = tcp_v6_rcv, .err_handler = tcp_v6_err, .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, }; ret = inet6_add_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); if (ret) goto out; /* register inet6 protocol */ ret = inet6_register_protosw(&tcpv6_protosw); if (ret) goto out_tcpv6_protocol; ret = register_pernet_subsys(&tcpv6_net_ops); if (ret) goto out_tcpv6_protosw; ret = mptcpv6_init(); if (ret) goto out_tcpv6_pernet_subsys; out: return ret; out_tcpv6_pernet_subsys: unregister_pernet_subsys(&tcpv6_net_ops); out_tcpv6_protosw: inet6_unregister_protosw(&tcpv6_protosw); out_tcpv6_protocol: inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); goto out; } void tcpv6_exit(void) { unregister_pernet_subsys(&tcpv6_net_ops); inet6_unregister_protosw(&tcpv6_protosw); inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); }
1 25 17 5 26 10 220 1 1 1 1 1 1 1 1 1 1 1 1 224 227 1 1 1 76 90 138 229 1 97 179 317 138 229 305 381 382 40 321 1 1 1 1 1 1 1 9 9 4 9 8 8 2 2 2 9 8 2 2 11 9 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 /* * Copyright (c) 2018 Cumulus Networks. All rights reserved. * Copyright (c) 2018 David Ahern <dsa@cumulusnetworks.com> * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this * source tree. * * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. */ #include <linux/bitmap.h> #include <linux/in6.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/rhashtable.h> #include <linux/spinlock_types.h> #include <linux/types.h> #include <net/fib_notifier.h> #include <net/inet_dscp.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/fib_rules.h> #include <net/net_namespace.h> #include <net/nexthop.h> #include <linux/debugfs.h> #include "netdevsim.h" struct nsim_fib_entry { u64 max; atomic64_t num; }; struct nsim_per_fib_data { struct nsim_fib_entry fib; struct nsim_fib_entry rules; }; struct nsim_fib_data { struct notifier_block fib_nb; struct nsim_per_fib_data ipv4; struct nsim_per_fib_data ipv6; struct nsim_fib_entry nexthops; struct rhashtable fib_rt_ht; struct list_head fib_rt_list; struct mutex fib_lock; /* Protects FIB HT and list */ struct notifier_block nexthop_nb; struct rhashtable nexthop_ht; struct devlink *devlink; struct work_struct fib_event_work; struct work_struct fib_flush_work; struct list_head fib_event_queue; spinlock_t fib_event_queue_lock; /* Protects fib event queue list */ struct mutex nh_lock; /* Protects NH HT */ struct dentry *ddir; bool fail_route_offload; bool fail_res_nexthop_group_replace; bool fail_nexthop_bucket_replace; bool fail_route_delete; }; struct nsim_fib_rt_key { unsigned char addr[sizeof(struct in6_addr)]; unsigned char prefix_len; int family; u32 tb_id; }; struct nsim_fib_rt { struct nsim_fib_rt_key key; struct rhash_head ht_node; struct list_head list; /* Member of fib_rt_list */ }; struct nsim_fib4_rt { struct nsim_fib_rt common; struct fib_info *fi; dscp_t dscp; u8 type; }; struct nsim_fib6_rt { struct nsim_fib_rt common; struct list_head nh_list; unsigned int nhs; }; struct nsim_fib6_rt_nh { struct list_head list; /* Member of nh_list */ struct fib6_info *rt; }; struct nsim_fib6_event { struct fib6_info **rt_arr; unsigned int nrt6; }; struct nsim_fib_event { struct list_head list; /* node in fib queue */ union { struct fib_entry_notifier_info fen_info; struct nsim_fib6_event fib6_event; }; struct nsim_fib_data *data; unsigned long event; int family; }; static const struct rhashtable_params nsim_fib_rt_ht_params = { .key_offset = offsetof(struct nsim_fib_rt, key), .head_offset = offsetof(struct nsim_fib_rt, ht_node), .key_len = sizeof(struct nsim_fib_rt_key), .automatic_shrinking = true, }; struct nsim_nexthop { struct rhash_head ht_node; u64 occ; u32 id; bool is_resilient; }; static const struct rhashtable_params nsim_nexthop_ht_params = { .key_offset = offsetof(struct nsim_nexthop, id), .head_offset = offsetof(struct nsim_nexthop, ht_node), .key_len = sizeof(u32), .automatic_shrinking = true, }; u64 nsim_fib_get_val(struct nsim_fib_data *fib_data, enum nsim_resource_id res_id, bool max) { struct nsim_fib_entry *entry; switch (res_id) { case NSIM_RESOURCE_IPV4_FIB: entry = &fib_data->ipv4.fib; break; case NSIM_RESOURCE_IPV4_FIB_RULES: entry = &fib_data->ipv4.rules; break; case NSIM_RESOURCE_IPV6_FIB: entry = &fib_data->ipv6.fib; break; case NSIM_RESOURCE_IPV6_FIB_RULES: entry = &fib_data->ipv6.rules; break; case NSIM_RESOURCE_NEXTHOPS: entry = &fib_data->nexthops; break; default: return 0; } return max ? entry->max : atomic64_read(&entry->num); } static void nsim_fib_set_max(struct nsim_fib_data *fib_data, enum nsim_resource_id res_id, u64 val) { struct nsim_fib_entry *entry; switch (res_id) { case NSIM_RESOURCE_IPV4_FIB: entry = &fib_data->ipv4.fib; break; case NSIM_RESOURCE_IPV4_FIB_RULES: entry = &fib_data->ipv4.rules; break; case NSIM_RESOURCE_IPV6_FIB: entry = &fib_data->ipv6.fib; break; case NSIM_RESOURCE_IPV6_FIB_RULES: entry = &fib_data->ipv6.rules; break; case NSIM_RESOURCE_NEXTHOPS: entry = &fib_data->nexthops; break; default: WARN_ON(1); return; } entry->max = val; } static int nsim_fib_rule_account(struct nsim_fib_entry *entry, bool add, struct netlink_ext_ack *extack) { int err = 0; if (add) { if (!atomic64_add_unless(&entry->num, 1, entry->max)) { err = -ENOSPC; NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported fib rule entries"); } } else { atomic64_dec_if_positive(&entry->num); } return err; } static int nsim_fib_rule_event(struct nsim_fib_data *data, struct fib_notifier_info *info, bool add) { struct netlink_ext_ack *extack = info->extack; int err = 0; switch (info->family) { case AF_INET: err = nsim_fib_rule_account(&data->ipv4.rules, add, extack); break; case AF_INET6: err = nsim_fib_rule_account(&data->ipv6.rules, add, extack); break; } return err; } static int nsim_fib_account(struct nsim_fib_entry *entry, bool add) { int err = 0; if (add) { if (!atomic64_add_unless(&entry->num, 1, entry->max)) err = -ENOSPC; } else { atomic64_dec_if_positive(&entry->num); } return err; } static void nsim_fib_rt_init(struct nsim_fib_data *data, struct nsim_fib_rt *fib_rt, const void *addr, size_t addr_len, unsigned int prefix_len, int family, u32 tb_id) { memcpy(fib_rt->key.addr, addr, addr_len); fib_rt->key.prefix_len = prefix_len; fib_rt->key.family = family; fib_rt->key.tb_id = tb_id; list_add(&fib_rt->list, &data->fib_rt_list); } static void nsim_fib_rt_fini(struct nsim_fib_rt *fib_rt) { list_del(&fib_rt->list); } static struct nsim_fib_rt *nsim_fib_rt_lookup(struct rhashtable *fib_rt_ht, const void *addr, size_t addr_len, unsigned int prefix_len, int family, u32 tb_id) { struct nsim_fib_rt_key key; memset(&key, 0, sizeof(key)); memcpy(key.addr, addr, addr_len); key.prefix_len = prefix_len; key.family = family; key.tb_id = tb_id; return rhashtable_lookup_fast(fib_rt_ht, &key, nsim_fib_rt_ht_params); } static struct nsim_fib4_rt * nsim_fib4_rt_create(struct nsim_fib_data *data, struct fib_entry_notifier_info *fen_info) { struct nsim_fib4_rt *fib4_rt; fib4_rt = kzalloc(sizeof(*fib4_rt), GFP_KERNEL); if (!fib4_rt) return NULL; nsim_fib_rt_init(data, &fib4_rt->common, &fen_info->dst, sizeof(u32), fen_info->dst_len, AF_INET, fen_info->tb_id); fib4_rt->fi = fen_info->fi; fib_info_hold(fib4_rt->fi); fib4_rt->dscp = fen_info->dscp; fib4_rt->type = fen_info->type; return fib4_rt; } static void nsim_fib4_rt_destroy(struct nsim_fib4_rt *fib4_rt) { fib_info_put(fib4_rt->fi); nsim_fib_rt_fini(&fib4_rt->common); kfree(fib4_rt); } static struct nsim_fib4_rt * nsim_fib4_rt_lookup(struct rhashtable *fib_rt_ht, const struct fib_entry_notifier_info *fen_info) { struct nsim_fib_rt *fib_rt; fib_rt = nsim_fib_rt_lookup(fib_rt_ht, &fen_info->dst, sizeof(u32), fen_info->dst_len, AF_INET, fen_info->tb_id); if (!fib_rt) return NULL; return container_of(fib_rt, struct nsim_fib4_rt, common); } static void nsim_fib4_rt_offload_failed_flag_set(struct net *net, struct fib_entry_notifier_info *fen_info) { u32 *p_dst = (u32 *)&fen_info->dst; struct fib_rt_info fri; fri.fi = fen_info->fi; fri.tb_id = fen_info->tb_id; fri.dst = cpu_to_be32(*p_dst); fri.dst_len = fen_info->dst_len; fri.dscp = fen_info->dscp; fri.type = fen_info->type; fri.offload = false; fri.trap = false; fri.offload_failed = true; fib_alias_hw_flags_set(net, &fri); } static void nsim_fib4_rt_hw_flags_set(struct net *net, const struct nsim_fib4_rt *fib4_rt, bool trap) { u32 *p_dst = (u32 *) fib4_rt->common.key.addr; int dst_len = fib4_rt->common.key.prefix_len; struct fib_rt_info fri; fri.fi = fib4_rt->fi; fri.tb_id = fib4_rt->common.key.tb_id; fri.dst = cpu_to_be32(*p_dst); fri.dst_len = dst_len; fri.dscp = fib4_rt->dscp; fri.type = fib4_rt->type; fri.offload = false; fri.trap = trap; fri.offload_failed = false; fib_alias_hw_flags_set(net, &fri); } static int nsim_fib4_rt_add(struct nsim_fib_data *data, struct nsim_fib4_rt *fib4_rt) { struct net *net = devlink_net(data->devlink); int err; err = rhashtable_insert_fast(&data->fib_rt_ht, &fib4_rt->common.ht_node, nsim_fib_rt_ht_params); if (err) goto err_fib_dismiss; /* Simulate hardware programming latency. */ msleep(1); nsim_fib4_rt_hw_flags_set(net, fib4_rt, true); return 0; err_fib_dismiss: /* Drop the accounting that was increased from the notification * context when FIB_EVENT_ENTRY_REPLACE was triggered. */ nsim_fib_account(&data->ipv4.fib, false); return err; } static int nsim_fib4_rt_replace(struct nsim_fib_data *data, struct nsim_fib4_rt *fib4_rt, struct nsim_fib4_rt *fib4_rt_old) { struct net *net = devlink_net(data->devlink); int err; /* We are replacing a route, so need to remove the accounting which * was increased when FIB_EVENT_ENTRY_REPLACE was triggered. */ err = nsim_fib_account(&data->ipv4.fib, false); if (err) return err; err = rhashtable_replace_fast(&data->fib_rt_ht, &fib4_rt_old->common.ht_node, &fib4_rt->common.ht_node, nsim_fib_rt_ht_params); if (err) return err; msleep(1); nsim_fib4_rt_hw_flags_set(net, fib4_rt, true); nsim_fib4_rt_hw_flags_set(net, fib4_rt_old, false); nsim_fib4_rt_destroy(fib4_rt_old); return 0; } static int nsim_fib4_rt_insert(struct nsim_fib_data *data, struct fib_entry_notifier_info *fen_info) { struct nsim_fib4_rt *fib4_rt, *fib4_rt_old; int err; if (data->fail_route_offload) { /* For testing purposes, user set debugfs fail_route_offload * value to true. Simulate hardware programming latency and then * fail. */ msleep(1); return -EINVAL; } fib4_rt = nsim_fib4_rt_create(data, fen_info); if (!fib4_rt) return -ENOMEM; fib4_rt_old = nsim_fib4_rt_lookup(&data->fib_rt_ht, fen_info); if (!fib4_rt_old) err = nsim_fib4_rt_add(data, fib4_rt); else err = nsim_fib4_rt_replace(data, fib4_rt, fib4_rt_old); if (err) nsim_fib4_rt_destroy(fib4_rt); return err; } static void nsim_fib4_rt_remove(struct nsim_fib_data *data, const struct fib_entry_notifier_info *fen_info) { struct nsim_fib4_rt *fib4_rt; fib4_rt = nsim_fib4_rt_lookup(&data->fib_rt_ht, fen_info); if (!fib4_rt) return; rhashtable_remove_fast(&data->fib_rt_ht, &fib4_rt->common.ht_node, nsim_fib_rt_ht_params); nsim_fib4_rt_destroy(fib4_rt); } static int nsim_fib4_event(struct nsim_fib_data *data, struct fib_entry_notifier_info *fen_info, unsigned long event) { int err = 0; switch (event) { case FIB_EVENT_ENTRY_REPLACE: err = nsim_fib4_rt_insert(data, fen_info); if (err) { struct net *net = devlink_net(data->devlink); nsim_fib4_rt_offload_failed_flag_set(net, fen_info); } break; case FIB_EVENT_ENTRY_DEL: nsim_fib4_rt_remove(data, fen_info); break; default: break; } return err; } static struct nsim_fib6_rt_nh * nsim_fib6_rt_nh_find(const struct nsim_fib6_rt *fib6_rt, const struct fib6_info *rt) { struct nsim_fib6_rt_nh *fib6_rt_nh; list_for_each_entry(fib6_rt_nh, &fib6_rt->nh_list, list) { if (fib6_rt_nh->rt == rt) return fib6_rt_nh; } return NULL; } static int nsim_fib6_rt_nh_add(struct nsim_fib6_rt *fib6_rt, struct fib6_info *rt) { struct nsim_fib6_rt_nh *fib6_rt_nh; fib6_rt_nh = kzalloc(sizeof(*fib6_rt_nh), GFP_KERNEL); if (!fib6_rt_nh) return -ENOMEM; fib6_info_hold(rt); fib6_rt_nh->rt = rt; list_add_tail(&fib6_rt_nh->list, &fib6_rt->nh_list); fib6_rt->nhs++; return 0; } #if IS_ENABLED(CONFIG_IPV6) static void nsim_rt6_release(struct fib6_info *rt) { fib6_info_release(rt); } #else static void nsim_rt6_release(struct fib6_info *rt) { } #endif static void nsim_fib6_rt_nh_del(struct nsim_fib6_rt *fib6_rt, const struct fib6_info *rt) { struct nsim_fib6_rt_nh *fib6_rt_nh; fib6_rt_nh = nsim_fib6_rt_nh_find(fib6_rt, rt); if (!fib6_rt_nh) return; fib6_rt->nhs--; list_del(&fib6_rt_nh->list); nsim_rt6_release(fib6_rt_nh->rt); kfree(fib6_rt_nh); } static struct nsim_fib6_rt * nsim_fib6_rt_create(struct nsim_fib_data *data, struct fib6_info **rt_arr, unsigned int nrt6) { struct fib6_info *rt = rt_arr[0]; struct nsim_fib6_rt *fib6_rt; int i = 0; int err; fib6_rt = kzalloc(sizeof(*fib6_rt), GFP_KERNEL); if (!fib6_rt) return ERR_PTR(-ENOMEM); nsim_fib_rt_init(data, &fib6_rt->common, &rt->fib6_dst.addr, sizeof(rt->fib6_dst.addr), rt->fib6_dst.plen, AF_INET6, rt->fib6_table->tb6_id); /* We consider a multipath IPv6 route as one entry, but it can be made * up from several fib6_info structs (one for each nexthop), so we * add them all to the same list under the entry. */ INIT_LIST_HEAD(&fib6_rt->nh_list); for (i = 0; i < nrt6; i++) { err = nsim_fib6_rt_nh_add(fib6_rt, rt_arr[i]); if (err) goto err_fib6_rt_nh_del; } return fib6_rt; err_fib6_rt_nh_del: for (i--; i >= 0; i--) { nsim_fib6_rt_nh_del(fib6_rt, rt_arr[i]); } nsim_fib_rt_fini(&fib6_rt->common); kfree(fib6_rt); return ERR_PTR(err); } static void nsim_fib6_rt_destroy(struct nsim_fib6_rt *fib6_rt) { struct nsim_fib6_rt_nh *iter, *tmp; list_for_each_entry_safe(iter, tmp, &fib6_rt->nh_list, list) nsim_fib6_rt_nh_del(fib6_rt, iter->rt); WARN_ON_ONCE(!list_empty(&fib6_rt->nh_list)); nsim_fib_rt_fini(&fib6_rt->common); kfree(fib6_rt); } static struct nsim_fib6_rt * nsim_fib6_rt_lookup(struct rhashtable *fib_rt_ht, const struct fib6_info *rt) { struct nsim_fib_rt *fib_rt; fib_rt = nsim_fib_rt_lookup(fib_rt_ht, &rt->fib6_dst.addr, sizeof(rt->fib6_dst.addr), rt->fib6_dst.plen, AF_INET6, rt->fib6_table->tb6_id); if (!fib_rt) return NULL; return container_of(fib_rt, struct nsim_fib6_rt, common); } static int nsim_fib6_rt_append(struct nsim_fib_data *data, struct nsim_fib6_event *fib6_event) { struct fib6_info *rt = fib6_event->rt_arr[0]; struct nsim_fib6_rt *fib6_rt; int i, err; if (data->fail_route_offload) { /* For testing purposes, user set debugfs fail_route_offload * value to true. Simulate hardware programming latency and then * fail. */ msleep(1); return -EINVAL; } fib6_rt = nsim_fib6_rt_lookup(&data->fib_rt_ht, rt); if (!fib6_rt) return -EINVAL; for (i = 0; i < fib6_event->nrt6; i++) { err = nsim_fib6_rt_nh_add(fib6_rt, fib6_event->rt_arr[i]); if (err) goto err_fib6_rt_nh_del; WRITE_ONCE(fib6_event->rt_arr[i]->trap, true); } return 0; err_fib6_rt_nh_del: for (i--; i >= 0; i--) { WRITE_ONCE(fib6_event->rt_arr[i]->trap, false); nsim_fib6_rt_nh_del(fib6_rt, fib6_event->rt_arr[i]); } return err; } #if IS_ENABLED(CONFIG_IPV6) static void nsim_fib6_rt_offload_failed_flag_set(struct nsim_fib_data *data, struct fib6_info **rt_arr, unsigned int nrt6) { struct net *net = devlink_net(data->devlink); int i; for (i = 0; i < nrt6; i++) fib6_info_hw_flags_set(net, rt_arr[i], false, false, true); } #else static void nsim_fib6_rt_offload_failed_flag_set(struct nsim_fib_data *data, struct fib6_info **rt_arr, unsigned int nrt6) { } #endif #if IS_ENABLED(CONFIG_IPV6) static void nsim_fib6_rt_hw_flags_set(struct nsim_fib_data *data, const struct nsim_fib6_rt *fib6_rt, bool trap) { struct net *net = devlink_net(data->devlink); struct nsim_fib6_rt_nh *fib6_rt_nh; list_for_each_entry(fib6_rt_nh, &fib6_rt->nh_list, list) fib6_info_hw_flags_set(net, fib6_rt_nh->rt, false, trap, false); } #else static void nsim_fib6_rt_hw_flags_set(struct nsim_fib_data *data, const struct nsim_fib6_rt *fib6_rt, bool trap) { } #endif static int nsim_fib6_rt_add(struct nsim_fib_data *data, struct nsim_fib6_rt *fib6_rt) { int err; err = rhashtable_insert_fast(&data->fib_rt_ht, &fib6_rt->common.ht_node, nsim_fib_rt_ht_params); if (err) goto err_fib_dismiss; msleep(1); nsim_fib6_rt_hw_flags_set(data, fib6_rt, true); return 0; err_fib_dismiss: /* Drop the accounting that was increased from the notification * context when FIB_EVENT_ENTRY_REPLACE was triggered. */ nsim_fib_account(&data->ipv6.fib, false); return err; } static int nsim_fib6_rt_replace(struct nsim_fib_data *data, struct nsim_fib6_rt *fib6_rt, struct nsim_fib6_rt *fib6_rt_old) { int err; /* We are replacing a route, so need to remove the accounting which * was increased when FIB_EVENT_ENTRY_REPLACE was triggered. */ err = nsim_fib_account(&data->ipv6.fib, false); if (err) return err; err = rhashtable_replace_fast(&data->fib_rt_ht, &fib6_rt_old->common.ht_node, &fib6_rt->common.ht_node, nsim_fib_rt_ht_params); if (err) return err; msleep(1); nsim_fib6_rt_hw_flags_set(data, fib6_rt, true); nsim_fib6_rt_hw_flags_set(data, fib6_rt_old, false); nsim_fib6_rt_destroy(fib6_rt_old); return 0; } static int nsim_fib6_rt_insert(struct nsim_fib_data *data, struct nsim_fib6_event *fib6_event) { struct fib6_info *rt = fib6_event->rt_arr[0]; struct nsim_fib6_rt *fib6_rt, *fib6_rt_old; int err; if (data->fail_route_offload) { /* For testing purposes, user set debugfs fail_route_offload * value to true. Simulate hardware programming latency and then * fail. */ msleep(1); return -EINVAL; } fib6_rt = nsim_fib6_rt_create(data, fib6_event->rt_arr, fib6_event->nrt6); if (IS_ERR(fib6_rt)) return PTR_ERR(fib6_rt); fib6_rt_old = nsim_fib6_rt_lookup(&data->fib_rt_ht, rt); if (!fib6_rt_old) err = nsim_fib6_rt_add(data, fib6_rt); else err = nsim_fib6_rt_replace(data, fib6_rt, fib6_rt_old); if (err) nsim_fib6_rt_destroy(fib6_rt); return err; } static void nsim_fib6_rt_remove(struct nsim_fib_data *data, struct nsim_fib6_event *fib6_event) { struct fib6_info *rt = fib6_event->rt_arr[0]; struct nsim_fib6_rt *fib6_rt; int i; /* Multipath routes are first added to the FIB trie and only then * notified. If we vetoed the addition, we will get a delete * notification for a route we do not have. Therefore, do not warn if * route was not found. */ fib6_rt = nsim_fib6_rt_lookup(&data->fib_rt_ht, rt); if (!fib6_rt) return; /* If not all the nexthops are deleted, then only reduce the nexthop * group. */ if (fib6_event->nrt6 != fib6_rt->nhs) { for (i = 0; i < fib6_event->nrt6; i++) nsim_fib6_rt_nh_del(fib6_rt, fib6_event->rt_arr[i]); return; } rhashtable_remove_fast(&data->fib_rt_ht, &fib6_rt->common.ht_node, nsim_fib_rt_ht_params); nsim_fib6_rt_destroy(fib6_rt); } static int nsim_fib6_event_init(struct nsim_fib6_event *fib6_event, struct fib6_entry_notifier_info *fen6_info) { struct fib6_info *rt = fen6_info->rt; struct fib6_info **rt_arr; struct fib6_info *iter; unsigned int nrt6; int i = 0; nrt6 = fen6_info->nsiblings + 1; rt_arr = kcalloc(nrt6, sizeof(struct fib6_info *), GFP_ATOMIC); if (!rt_arr) return -ENOMEM; fib6_event->rt_arr = rt_arr; fib6_event->nrt6 = nrt6; rt_arr[0] = rt; fib6_info_hold(rt); if (!fen6_info->nsiblings) return 0; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (i == fen6_info->nsiblings) break; rt_arr[i + 1] = iter; fib6_info_hold(iter); i++; } WARN_ON_ONCE(i != fen6_info->nsiblings); return 0; } static void nsim_fib6_event_fini(struct nsim_fib6_event *fib6_event) { int i; for (i = 0; i < fib6_event->nrt6; i++) nsim_rt6_release(fib6_event->rt_arr[i]); kfree(fib6_event->rt_arr); } static int nsim_fib6_event(struct nsim_fib_data *data, struct nsim_fib6_event *fib6_event, unsigned long event) { int err; if (fib6_event->rt_arr[0]->fib6_src.plen) return 0; switch (event) { case FIB_EVENT_ENTRY_REPLACE: err = nsim_fib6_rt_insert(data, fib6_event); if (err) goto err_rt_offload_failed_flag_set; break; case FIB_EVENT_ENTRY_APPEND: err = nsim_fib6_rt_append(data, fib6_event); if (err) goto err_rt_offload_failed_flag_set; break; case FIB_EVENT_ENTRY_DEL: nsim_fib6_rt_remove(data, fib6_event); break; default: break; } return 0; err_rt_offload_failed_flag_set: nsim_fib6_rt_offload_failed_flag_set(data, fib6_event->rt_arr, fib6_event->nrt6); return err; } static void nsim_fib_event(struct nsim_fib_event *fib_event) { switch (fib_event->family) { case AF_INET: nsim_fib4_event(fib_event->data, &fib_event->fen_info, fib_event->event); fib_info_put(fib_event->fen_info.fi); break; case AF_INET6: nsim_fib6_event(fib_event->data, &fib_event->fib6_event, fib_event->event); nsim_fib6_event_fini(&fib_event->fib6_event); break; } } static int nsim_fib4_prepare_event(struct fib_notifier_info *info, struct nsim_fib_event *fib_event, unsigned long event) { struct nsim_fib_data *data = fib_event->data; struct fib_entry_notifier_info *fen_info; struct netlink_ext_ack *extack; int err = 0; fen_info = container_of(info, struct fib_entry_notifier_info, info); fib_event->fen_info = *fen_info; extack = info->extack; switch (event) { case FIB_EVENT_ENTRY_REPLACE: err = nsim_fib_account(&data->ipv4.fib, true); if (err) { NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported fib entries"); return err; } break; case FIB_EVENT_ENTRY_DEL: if (data->fail_route_delete) { NL_SET_ERR_MSG_MOD(extack, "Failed to process route deletion"); return -EINVAL; } nsim_fib_account(&data->ipv4.fib, false); break; } /* Take reference on fib_info to prevent it from being * freed while event is queued. Release it afterwards. */ fib_info_hold(fib_event->fen_info.fi); return 0; } static int nsim_fib6_prepare_event(struct fib_notifier_info *info, struct nsim_fib_event *fib_event, unsigned long event) { struct nsim_fib_data *data = fib_event->data; struct fib6_entry_notifier_info *fen6_info; struct netlink_ext_ack *extack; int err = 0; fen6_info = container_of(info, struct fib6_entry_notifier_info, info); err = nsim_fib6_event_init(&fib_event->fib6_event, fen6_info); if (err) return err; extack = info->extack; switch (event) { case FIB_EVENT_ENTRY_REPLACE: err = nsim_fib_account(&data->ipv6.fib, true); if (err) { NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported fib entries"); goto err_fib6_event_fini; } break; case FIB_EVENT_ENTRY_DEL: if (data->fail_route_delete) { err = -EINVAL; NL_SET_ERR_MSG_MOD(extack, "Failed to process route deletion"); goto err_fib6_event_fini; } nsim_fib_account(&data->ipv6.fib, false); break; } return 0; err_fib6_event_fini: nsim_fib6_event_fini(&fib_event->fib6_event); return err; } static int nsim_fib_event_schedule_work(struct nsim_fib_data *data, struct fib_notifier_info *info, unsigned long event) { struct nsim_fib_event *fib_event; int err; if (info->family != AF_INET && info->family != AF_INET6) /* netdevsim does not support 'RTNL_FAMILY_IP6MR' and * 'RTNL_FAMILY_IPMR' and should ignore them. */ return NOTIFY_DONE; fib_event = kzalloc(sizeof(*fib_event), GFP_ATOMIC); if (!fib_event) goto err_fib_event_alloc; fib_event->data = data; fib_event->event = event; fib_event->family = info->family; switch (info->family) { case AF_INET: err = nsim_fib4_prepare_event(info, fib_event, event); break; case AF_INET6: err = nsim_fib6_prepare_event(info, fib_event, event); break; } if (err) goto err_fib_prepare_event; /* Enqueue the event and trigger the work */ spin_lock_bh(&data->fib_event_queue_lock); list_add_tail(&fib_event->list, &data->fib_event_queue); spin_unlock_bh(&data->fib_event_queue_lock); schedule_work(&data->fib_event_work); return NOTIFY_DONE; err_fib_prepare_event: kfree(fib_event); err_fib_event_alloc: if (event == FIB_EVENT_ENTRY_DEL) schedule_work(&data->fib_flush_work); return NOTIFY_BAD; } static int nsim_fib_event_nb(struct notifier_block *nb, unsigned long event, void *ptr) { struct nsim_fib_data *data = container_of(nb, struct nsim_fib_data, fib_nb); struct fib_notifier_info *info = ptr; int err; switch (event) { case FIB_EVENT_RULE_ADD: case FIB_EVENT_RULE_DEL: err = nsim_fib_rule_event(data, info, event == FIB_EVENT_RULE_ADD); return notifier_from_errno(err); case FIB_EVENT_ENTRY_REPLACE: case FIB_EVENT_ENTRY_APPEND: case FIB_EVENT_ENTRY_DEL: return nsim_fib_event_schedule_work(data, info, event); } return NOTIFY_DONE; } static void nsim_fib4_rt_free(struct nsim_fib_rt *fib_rt, struct nsim_fib_data *data) { struct devlink *devlink = data->devlink; struct nsim_fib4_rt *fib4_rt; fib4_rt = container_of(fib_rt, struct nsim_fib4_rt, common); nsim_fib4_rt_hw_flags_set(devlink_net(devlink), fib4_rt, false); nsim_fib_account(&data->ipv4.fib, false); nsim_fib4_rt_destroy(fib4_rt); } static void nsim_fib6_rt_free(struct nsim_fib_rt *fib_rt, struct nsim_fib_data *data) { struct nsim_fib6_rt *fib6_rt; fib6_rt = container_of(fib_rt, struct nsim_fib6_rt, common); nsim_fib6_rt_hw_flags_set(data, fib6_rt, false); nsim_fib_account(&data->ipv6.fib, false); nsim_fib6_rt_destroy(fib6_rt); } static void nsim_fib_rt_free(void *ptr, void *arg) { struct nsim_fib_rt *fib_rt = ptr; struct nsim_fib_data *data = arg; switch (fib_rt->key.family) { case AF_INET: nsim_fib4_rt_free(fib_rt, data); break; case AF_INET6: nsim_fib6_rt_free(fib_rt, data); break; default: WARN_ON_ONCE(1); } } /* inconsistent dump, trying again */ static void nsim_fib_dump_inconsistent(struct notifier_block *nb) { struct nsim_fib_data *data = container_of(nb, struct nsim_fib_data, fib_nb); struct nsim_fib_rt *fib_rt, *fib_rt_tmp; /* Flush the work to make sure there is no race with notifications. */ flush_work(&data->fib_event_work); /* The notifier block is still not registered, so we do not need to * take any locks here. */ list_for_each_entry_safe(fib_rt, fib_rt_tmp, &data->fib_rt_list, list) { rhashtable_remove_fast(&data->fib_rt_ht, &fib_rt->ht_node, nsim_fib_rt_ht_params); nsim_fib_rt_free(fib_rt, data); } atomic64_set(&data->ipv4.rules.num, 0ULL); atomic64_set(&data->ipv6.rules.num, 0ULL); } static struct nsim_nexthop *nsim_nexthop_create(struct nsim_fib_data *data, struct nh_notifier_info *info) { struct nsim_nexthop *nexthop; u64 occ = 0; int i; nexthop = kzalloc(sizeof(*nexthop), GFP_KERNEL); if (!nexthop) return ERR_PTR(-ENOMEM); nexthop->id = info->id; /* Determine the number of nexthop entries the new nexthop will * occupy. */ switch (info->type) { case NH_NOTIFIER_INFO_TYPE_SINGLE: occ = 1; break; case NH_NOTIFIER_INFO_TYPE_GRP: for (i = 0; i < info->nh_grp->num_nh; i++) occ += info->nh_grp->nh_entries[i].weight; break; case NH_NOTIFIER_INFO_TYPE_RES_TABLE: occ = info->nh_res_table->num_nh_buckets; nexthop->is_resilient = true; break; default: NL_SET_ERR_MSG_MOD(info->extack, "Unsupported nexthop type"); kfree(nexthop); return ERR_PTR(-EOPNOTSUPP); } nexthop->occ = occ; return nexthop; } static void nsim_nexthop_destroy(struct nsim_nexthop *nexthop) { kfree(nexthop); } static int nsim_nexthop_account(struct nsim_fib_data *data, u64 occ, bool add, struct netlink_ext_ack *extack) { int i, err = 0; if (add) { for (i = 0; i < occ; i++) if (!atomic64_add_unless(&data->nexthops.num, 1, data->nexthops.max)) { err = -ENOSPC; NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported nexthops"); goto err_num_decrease; } } else { if (WARN_ON(occ > atomic64_read(&data->nexthops.num))) return -EINVAL; atomic64_sub(occ, &data->nexthops.num); } return err; err_num_decrease: atomic64_sub(i, &data->nexthops.num); return err; } static void nsim_nexthop_hw_flags_set(struct net *net, const struct nsim_nexthop *nexthop, bool trap) { int i; nexthop_set_hw_flags(net, nexthop->id, false, trap); if (!nexthop->is_resilient) return; for (i = 0; i < nexthop->occ; i++) nexthop_bucket_set_hw_flags(net, nexthop->id, i, false, trap); } static int nsim_nexthop_add(struct nsim_fib_data *data, struct nsim_nexthop *nexthop, struct netlink_ext_ack *extack) { struct net *net = devlink_net(data->devlink); int err; err = nsim_nexthop_account(data, nexthop->occ, true, extack); if (err) return err; err = rhashtable_insert_fast(&data->nexthop_ht, &nexthop->ht_node, nsim_nexthop_ht_params); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to insert nexthop"); goto err_nexthop_dismiss; } nsim_nexthop_hw_flags_set(net, nexthop, true); return 0; err_nexthop_dismiss: nsim_nexthop_account(data, nexthop->occ, false, extack); return err; } static int nsim_nexthop_replace(struct nsim_fib_data *data, struct nsim_nexthop *nexthop, struct nsim_nexthop *nexthop_old, struct netlink_ext_ack *extack) { struct net *net = devlink_net(data->devlink); int err; err = nsim_nexthop_account(data, nexthop->occ, true, extack); if (err) return err; err = rhashtable_replace_fast(&data->nexthop_ht, &nexthop_old->ht_node, &nexthop->ht_node, nsim_nexthop_ht_params); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to replace nexthop"); goto err_nexthop_dismiss; } nsim_nexthop_hw_flags_set(net, nexthop, true); nsim_nexthop_account(data, nexthop_old->occ, false, extack); nsim_nexthop_destroy(nexthop_old); return 0; err_nexthop_dismiss: nsim_nexthop_account(data, nexthop->occ, false, extack); return err; } static int nsim_nexthop_insert(struct nsim_fib_data *data, struct nh_notifier_info *info) { struct nsim_nexthop *nexthop, *nexthop_old; int err; nexthop = nsim_nexthop_create(data, info); if (IS_ERR(nexthop)) return PTR_ERR(nexthop); nexthop_old = rhashtable_lookup_fast(&data->nexthop_ht, &info->id, nsim_nexthop_ht_params); if (!nexthop_old) err = nsim_nexthop_add(data, nexthop, info->extack); else err = nsim_nexthop_replace(data, nexthop, nexthop_old, info->extack); if (err) nsim_nexthop_destroy(nexthop); return err; } static void nsim_nexthop_remove(struct nsim_fib_data *data, struct nh_notifier_info *info) { struct nsim_nexthop *nexthop; nexthop = rhashtable_lookup_fast(&data->nexthop_ht, &info->id, nsim_nexthop_ht_params); if (!nexthop) return; rhashtable_remove_fast(&data->nexthop_ht, &nexthop->ht_node, nsim_nexthop_ht_params); nsim_nexthop_account(data, nexthop->occ, false, info->extack); nsim_nexthop_destroy(nexthop); } static int nsim_nexthop_res_table_pre_replace(struct nsim_fib_data *data, struct nh_notifier_info *info) { if (data->fail_res_nexthop_group_replace) { NL_SET_ERR_MSG_MOD(info->extack, "Failed to replace a resilient nexthop group"); return -EINVAL; } return 0; } static int nsim_nexthop_bucket_replace(struct nsim_fib_data *data, struct nh_notifier_info *info) { if (data->fail_nexthop_bucket_replace) { NL_SET_ERR_MSG_MOD(info->extack, "Failed to replace nexthop bucket"); return -EINVAL; } nexthop_bucket_set_hw_flags(info->net, info->id, info->nh_res_bucket->bucket_index, false, true); return 0; } static int nsim_nexthop_event_nb(struct notifier_block *nb, unsigned long event, void *ptr) { struct nsim_fib_data *data = container_of(nb, struct nsim_fib_data, nexthop_nb); struct nh_notifier_info *info = ptr; int err = 0; mutex_lock(&data->nh_lock); switch (event) { case NEXTHOP_EVENT_REPLACE: err = nsim_nexthop_insert(data, info); break; case NEXTHOP_EVENT_DEL: nsim_nexthop_remove(data, info); break; case NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE: err = nsim_nexthop_res_table_pre_replace(data, info); break; case NEXTHOP_EVENT_BUCKET_REPLACE: err = nsim_nexthop_bucket_replace(data, info); break; default: break; } mutex_unlock(&data->nh_lock); return notifier_from_errno(err); } static void nsim_nexthop_free(void *ptr, void *arg) { struct nsim_nexthop *nexthop = ptr; struct nsim_fib_data *data = arg; struct net *net; net = devlink_net(data->devlink); nsim_nexthop_hw_flags_set(net, nexthop, false); nsim_nexthop_account(data, nexthop->occ, false, NULL); nsim_nexthop_destroy(nexthop); } static ssize_t nsim_nexthop_bucket_activity_write(struct file *file, const char __user *user_buf, size_t size, loff_t *ppos) { struct nsim_fib_data *data = file->private_data; struct net *net = devlink_net(data->devlink); struct nsim_nexthop *nexthop; unsigned long *activity; loff_t pos = *ppos; u16 bucket_index; char buf[128]; int err = 0; u32 nhid; if (pos != 0) return -EINVAL; if (size > sizeof(buf) - 1) return -EINVAL; if (copy_from_user(buf, user_buf, size)) return -EFAULT; buf[size] = 0; if (sscanf(buf, "%u %hu", &nhid, &bucket_index) != 2) return -EINVAL; rtnl_lock(); nexthop = rhashtable_lookup_fast(&data->nexthop_ht, &nhid, nsim_nexthop_ht_params); if (!nexthop || !nexthop->is_resilient || bucket_index >= nexthop->occ) { err = -EINVAL; goto out; } activity = bitmap_zalloc(nexthop->occ, GFP_KERNEL); if (!activity) { err = -ENOMEM; goto out; } bitmap_set(activity, bucket_index, 1); nexthop_res_grp_activity_update(net, nhid, nexthop->occ, activity); bitmap_free(activity); out: rtnl_unlock(); *ppos = size; return err ?: size; } static const struct file_operations nsim_nexthop_bucket_activity_fops = { .open = simple_open, .write = nsim_nexthop_bucket_activity_write, .owner = THIS_MODULE, }; static u64 nsim_fib_ipv4_resource_occ_get(void *priv) { struct nsim_fib_data *data = priv; return nsim_fib_get_val(data, NSIM_RESOURCE_IPV4_FIB, false); } static u64 nsim_fib_ipv4_rules_res_occ_get(void *priv) { struct nsim_fib_data *data = priv; return nsim_fib_get_val(data, NSIM_RESOURCE_IPV4_FIB_RULES, false); } static u64 nsim_fib_ipv6_resource_occ_get(void *priv) { struct nsim_fib_data *data = priv; return nsim_fib_get_val(data, NSIM_RESOURCE_IPV6_FIB, false); } static u64 nsim_fib_ipv6_rules_res_occ_get(void *priv) { struct nsim_fib_data *data = priv; return nsim_fib_get_val(data, NSIM_RESOURCE_IPV6_FIB_RULES, false); } static u64 nsim_fib_nexthops_res_occ_get(void *priv) { struct nsim_fib_data *data = priv; return nsim_fib_get_val(data, NSIM_RESOURCE_NEXTHOPS, false); } static void nsim_fib_set_max_all(struct nsim_fib_data *data, struct devlink *devlink) { static const enum nsim_resource_id res_ids[] = { NSIM_RESOURCE_IPV4_FIB, NSIM_RESOURCE_IPV4_FIB_RULES, NSIM_RESOURCE_IPV6_FIB, NSIM_RESOURCE_IPV6_FIB_RULES, NSIM_RESOURCE_NEXTHOPS, }; int i; for (i = 0; i < ARRAY_SIZE(res_ids); i++) { int err; u64 val; err = devl_resource_size_get(devlink, res_ids[i], &val); if (err) val = (u64) -1; nsim_fib_set_max(data, res_ids[i], val); } } static void nsim_fib_event_work(struct work_struct *work) { struct nsim_fib_data *data = container_of(work, struct nsim_fib_data, fib_event_work); struct nsim_fib_event *fib_event, *next_fib_event; LIST_HEAD(fib_event_queue); spin_lock_bh(&data->fib_event_queue_lock); list_splice_init(&data->fib_event_queue, &fib_event_queue); spin_unlock_bh(&data->fib_event_queue_lock); mutex_lock(&data->fib_lock); list_for_each_entry_safe(fib_event, next_fib_event, &fib_event_queue, list) { nsim_fib_event(fib_event); list_del(&fib_event->list); kfree(fib_event); cond_resched(); } mutex_unlock(&data->fib_lock); } static void nsim_fib_flush_work(struct work_struct *work) { struct nsim_fib_data *data = container_of(work, struct nsim_fib_data, fib_flush_work); struct nsim_fib_rt *fib_rt, *fib_rt_tmp; /* Process pending work. */ flush_work(&data->fib_event_work); mutex_lock(&data->fib_lock); list_for_each_entry_safe(fib_rt, fib_rt_tmp, &data->fib_rt_list, list) { rhashtable_remove_fast(&data->fib_rt_ht, &fib_rt->ht_node, nsim_fib_rt_ht_params); nsim_fib_rt_free(fib_rt, data); } mutex_unlock(&data->fib_lock); } static int nsim_fib_debugfs_init(struct nsim_fib_data *data, struct nsim_dev *nsim_dev) { data->ddir = debugfs_create_dir("fib", nsim_dev->ddir); if (IS_ERR(data->ddir)) return PTR_ERR(data->ddir); data->fail_route_offload = false; debugfs_create_bool("fail_route_offload", 0600, data->ddir, &data->fail_route_offload); data->fail_res_nexthop_group_replace = false; debugfs_create_bool("fail_res_nexthop_group_replace", 0600, data->ddir, &data->fail_res_nexthop_group_replace); data->fail_nexthop_bucket_replace = false; debugfs_create_bool("fail_nexthop_bucket_replace", 0600, data->ddir, &data->fail_nexthop_bucket_replace); debugfs_create_file("nexthop_bucket_activity", 0200, data->ddir, data, &nsim_nexthop_bucket_activity_fops); data->fail_route_delete = false; debugfs_create_bool("fail_route_delete", 0600, data->ddir, &data->fail_route_delete); return 0; } static void nsim_fib_debugfs_exit(struct nsim_fib_data *data) { debugfs_remove_recursive(data->ddir); } struct nsim_fib_data *nsim_fib_create(struct devlink *devlink, struct netlink_ext_ack *extack) { struct nsim_fib_data *data; struct nsim_dev *nsim_dev; int err; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return ERR_PTR(-ENOMEM); data->devlink = devlink; nsim_dev = devlink_priv(devlink); err = nsim_fib_debugfs_init(data, nsim_dev); if (err) goto err_data_free; mutex_init(&data->nh_lock); err = rhashtable_init(&data->nexthop_ht, &nsim_nexthop_ht_params); if (err) goto err_debugfs_exit; mutex_init(&data->fib_lock); INIT_LIST_HEAD(&data->fib_rt_list); err = rhashtable_init(&data->fib_rt_ht, &nsim_fib_rt_ht_params); if (err) goto err_rhashtable_nexthop_destroy; INIT_WORK(&data->fib_event_work, nsim_fib_event_work); INIT_WORK(&data->fib_flush_work, nsim_fib_flush_work); INIT_LIST_HEAD(&data->fib_event_queue); spin_lock_init(&data->fib_event_queue_lock); nsim_fib_set_max_all(data, devlink); data->nexthop_nb.notifier_call = nsim_nexthop_event_nb; err = register_nexthop_notifier(devlink_net(devlink), &data->nexthop_nb, extack); if (err) { pr_err("Failed to register nexthop notifier\n"); goto err_rhashtable_fib_destroy; } data->fib_nb.notifier_call = nsim_fib_event_nb; err = register_fib_notifier(devlink_net(devlink), &data->fib_nb, nsim_fib_dump_inconsistent, extack); if (err) { pr_err("Failed to register fib notifier\n"); goto err_nexthop_nb_unregister; } devl_resource_occ_get_register(devlink, NSIM_RESOURCE_IPV4_FIB, nsim_fib_ipv4_resource_occ_get, data); devl_resource_occ_get_register(devlink, NSIM_RESOURCE_IPV4_FIB_RULES, nsim_fib_ipv4_rules_res_occ_get, data); devl_resource_occ_get_register(devlink, NSIM_RESOURCE_IPV6_FIB, nsim_fib_ipv6_resource_occ_get, data); devl_resource_occ_get_register(devlink, NSIM_RESOURCE_IPV6_FIB_RULES, nsim_fib_ipv6_rules_res_occ_get, data); devl_resource_occ_get_register(devlink, NSIM_RESOURCE_NEXTHOPS, nsim_fib_nexthops_res_occ_get, data); return data; err_nexthop_nb_unregister: unregister_nexthop_notifier(devlink_net(devlink), &data->nexthop_nb); err_rhashtable_fib_destroy: cancel_work_sync(&data->fib_flush_work); flush_work(&data->fib_event_work); rhashtable_free_and_destroy(&data->fib_rt_ht, nsim_fib_rt_free, data); err_rhashtable_nexthop_destroy: rhashtable_free_and_destroy(&data->nexthop_ht, nsim_nexthop_free, data); mutex_destroy(&data->fib_lock); err_debugfs_exit: mutex_destroy(&data->nh_lock); nsim_fib_debugfs_exit(data); err_data_free: kfree(data); return ERR_PTR(err); } void nsim_fib_destroy(struct devlink *devlink, struct nsim_fib_data *data) { devl_resource_occ_get_unregister(devlink, NSIM_RESOURCE_NEXTHOPS); devl_resource_occ_get_unregister(devlink, NSIM_RESOURCE_IPV6_FIB_RULES); devl_resource_occ_get_unregister(devlink, NSIM_RESOURCE_IPV6_FIB); devl_resource_occ_get_unregister(devlink, NSIM_RESOURCE_IPV4_FIB_RULES); devl_resource_occ_get_unregister(devlink, NSIM_RESOURCE_IPV4_FIB); unregister_fib_notifier(devlink_net(devlink), &data->fib_nb); unregister_nexthop_notifier(devlink_net(devlink), &data->nexthop_nb); cancel_work_sync(&data->fib_flush_work); flush_work(&data->fib_event_work); rhashtable_free_and_destroy(&data->fib_rt_ht, nsim_fib_rt_free, data); rhashtable_free_and_destroy(&data->nexthop_ht, nsim_nexthop_free, data); WARN_ON_ONCE(!list_empty(&data->fib_event_queue)); WARN_ON_ONCE(!list_empty(&data->fib_rt_list)); mutex_destroy(&data->fib_lock); mutex_destroy(&data->nh_lock); nsim_fib_debugfs_exit(data); kfree(data); }
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 // SPDX-License-Identifier: GPL-2.0 /* * BlueZ - Bluetooth protocol stack for Linux * * Copyright (C) 2022 Intel Corporation * Copyright 2023-2024 NXP */ #include <linux/module.h> #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/sched/signal.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/iso.h> #include "eir.h" static const struct proto_ops iso_sock_ops; static struct bt_sock_list iso_sk_list = { .lock = __RW_LOCK_UNLOCKED(iso_sk_list.lock) }; /* ---- ISO connections ---- */ struct iso_conn { struct hci_conn *hcon; /* @lock: spinlock protecting changes to iso_conn fields */ spinlock_t lock; struct sock *sk; struct delayed_work timeout_work; struct sk_buff *rx_skb; __u32 rx_len; __u16 tx_sn; struct kref ref; }; #define iso_conn_lock(c) spin_lock(&(c)->lock) #define iso_conn_unlock(c) spin_unlock(&(c)->lock) static void iso_sock_close(struct sock *sk); static void iso_sock_kill(struct sock *sk); /* ----- ISO socket info ----- */ #define iso_pi(sk) ((struct iso_pinfo *)sk) #define EIR_SERVICE_DATA_LENGTH 4 #define BASE_MAX_LENGTH (HCI_MAX_PER_AD_LENGTH - EIR_SERVICE_DATA_LENGTH) #define EIR_BAA_SERVICE_UUID 0x1851 /* iso_pinfo flags values */ enum { BT_SK_BIG_SYNC, BT_SK_PA_SYNC, }; struct iso_pinfo { struct bt_sock bt; bdaddr_t src; __u8 src_type; bdaddr_t dst; __u8 dst_type; __u8 bc_sid; __u8 bc_num_bis; __u8 bc_bis[ISO_MAX_NUM_BIS]; __u16 sync_handle; unsigned long flags; struct bt_iso_qos qos; bool qos_user_set; __u8 base_len; __u8 base[BASE_MAX_LENGTH]; struct iso_conn *conn; }; static struct bt_iso_qos default_qos; static bool check_ucast_qos(struct bt_iso_qos *qos); static bool check_bcast_qos(struct bt_iso_qos *qos); static bool iso_match_sid(struct sock *sk, void *data); static bool iso_match_sync_handle(struct sock *sk, void *data); static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data); static void iso_sock_disconn(struct sock *sk); typedef bool (*iso_sock_match_t)(struct sock *sk, void *data); static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst, enum bt_sock_state state, iso_sock_match_t match, void *data); /* ---- ISO timers ---- */ #define ISO_CONN_TIMEOUT (HZ * 40) #define ISO_DISCONN_TIMEOUT (HZ * 2) static void iso_conn_free(struct kref *ref) { struct iso_conn *conn = container_of(ref, struct iso_conn, ref); BT_DBG("conn %p", conn); if (conn->sk) iso_pi(conn->sk)->conn = NULL; if (conn->hcon) { conn->hcon->iso_data = NULL; hci_conn_drop(conn->hcon); } /* Ensure no more work items will run since hci_conn has been dropped */ disable_delayed_work_sync(&conn->timeout_work); kfree(conn); } static void iso_conn_put(struct iso_conn *conn) { if (!conn) return; BT_DBG("conn %p refcnt %d", conn, kref_read(&conn->ref)); kref_put(&conn->ref, iso_conn_free); } static struct iso_conn *iso_conn_hold_unless_zero(struct iso_conn *conn) { if (!conn) return NULL; BT_DBG("conn %p refcnt %u", conn, kref_read(&conn->ref)); if (!kref_get_unless_zero(&conn->ref)) return NULL; return conn; } static struct sock *iso_sock_hold(struct iso_conn *conn) { if (!conn || !bt_sock_linked(&iso_sk_list, conn->sk)) return NULL; sock_hold(conn->sk); return conn->sk; } static void iso_sock_timeout(struct work_struct *work) { struct iso_conn *conn = container_of(work, struct iso_conn, timeout_work.work); struct sock *sk; conn = iso_conn_hold_unless_zero(conn); if (!conn) return; iso_conn_lock(conn); sk = iso_sock_hold(conn); iso_conn_unlock(conn); iso_conn_put(conn); if (!sk) return; BT_DBG("sock %p state %d", sk, sk->sk_state); lock_sock(sk); sk->sk_err = ETIMEDOUT; sk->sk_state_change(sk); release_sock(sk); sock_put(sk); } static void iso_sock_set_timer(struct sock *sk, long timeout) { if (!iso_pi(sk)->conn) return; BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout); cancel_delayed_work(&iso_pi(sk)->conn->timeout_work); schedule_delayed_work(&iso_pi(sk)->conn->timeout_work, timeout); } static void iso_sock_clear_timer(struct sock *sk) { if (!iso_pi(sk)->conn) return; BT_DBG("sock %p state %d", sk, sk->sk_state); cancel_delayed_work(&iso_pi(sk)->conn->timeout_work); } /* ---- ISO connections ---- */ static struct iso_conn *iso_conn_add(struct hci_conn *hcon) { struct iso_conn *conn = hcon->iso_data; conn = iso_conn_hold_unless_zero(conn); if (conn) { if (!conn->hcon) { iso_conn_lock(conn); conn->hcon = hcon; iso_conn_unlock(conn); } iso_conn_put(conn); return conn; } conn = kzalloc(sizeof(*conn), GFP_KERNEL); if (!conn) return NULL; kref_init(&conn->ref); spin_lock_init(&conn->lock); INIT_DELAYED_WORK(&conn->timeout_work, iso_sock_timeout); hcon->iso_data = conn; conn->hcon = hcon; conn->tx_sn = 0; BT_DBG("hcon %p conn %p", hcon, conn); return conn; } /* Delete channel. Must be called on the locked socket. */ static void iso_chan_del(struct sock *sk, int err) { struct iso_conn *conn; struct sock *parent; conn = iso_pi(sk)->conn; iso_pi(sk)->conn = NULL; BT_DBG("sk %p, conn %p, err %d", sk, conn, err); if (conn) { iso_conn_lock(conn); conn->sk = NULL; iso_conn_unlock(conn); iso_conn_put(conn); } sk->sk_state = BT_CLOSED; sk->sk_err = err; parent = bt_sk(sk)->parent; if (parent) { bt_accept_unlink(sk); parent->sk_data_ready(parent); } else { sk->sk_state_change(sk); } sock_set_flag(sk, SOCK_ZAPPED); } static void iso_conn_del(struct hci_conn *hcon, int err) { struct iso_conn *conn = hcon->iso_data; struct sock *sk; conn = iso_conn_hold_unless_zero(conn); if (!conn) return; BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); /* Kill socket */ iso_conn_lock(conn); sk = iso_sock_hold(conn); iso_conn_unlock(conn); iso_conn_put(conn); if (!sk) { iso_conn_put(conn); return; } lock_sock(sk); iso_sock_clear_timer(sk); iso_chan_del(sk, err); release_sock(sk); sock_put(sk); } static int __iso_chan_add(struct iso_conn *conn, struct sock *sk, struct sock *parent) { BT_DBG("conn %p", conn); if (iso_pi(sk)->conn == conn && conn->sk == sk) return 0; if (conn->sk) { BT_ERR("conn->sk already set"); return -EBUSY; } iso_pi(sk)->conn = conn; conn->sk = sk; if (parent) bt_accept_enqueue(parent, sk, true); return 0; } static int iso_chan_add(struct iso_conn *conn, struct sock *sk, struct sock *parent) { int err; iso_conn_lock(conn); err = __iso_chan_add(conn, sk, parent); iso_conn_unlock(conn); return err; } static inline u8 le_addr_type(u8 bdaddr_type) { if (bdaddr_type == BDADDR_LE_PUBLIC) return ADDR_LE_DEV_PUBLIC; else return ADDR_LE_DEV_RANDOM; } static int iso_connect_bis(struct sock *sk) { struct iso_conn *conn; struct hci_conn *hcon; struct hci_dev *hdev; int err; BT_DBG("%pMR", &iso_pi(sk)->src); hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, iso_pi(sk)->src_type); if (!hdev) return -EHOSTUNREACH; hci_dev_lock(hdev); if (!bis_capable(hdev)) { err = -EOPNOTSUPP; goto unlock; } /* Fail if user set invalid QoS */ if (iso_pi(sk)->qos_user_set && !check_bcast_qos(&iso_pi(sk)->qos)) { iso_pi(sk)->qos = default_qos; err = -EINVAL; goto unlock; } /* Fail if out PHYs are marked as disabled */ if (!iso_pi(sk)->qos.bcast.out.phy) { err = -EINVAL; goto unlock; } /* Just bind if DEFER_SETUP has been set */ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_bis(hdev, &iso_pi(sk)->dst, &iso_pi(sk)->qos, iso_pi(sk)->base_len, iso_pi(sk)->base); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } } else { hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), &iso_pi(sk)->qos, iso_pi(sk)->base_len, iso_pi(sk)->base); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } } conn = iso_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); err = -ENOMEM; goto unlock; } lock_sock(sk); err = iso_chan_add(conn, sk, NULL); if (err) { release_sock(sk); goto unlock; } /* Update source addr of the socket */ bacpy(&iso_pi(sk)->src, &hcon->src); if (hcon->state == BT_CONNECTED) { iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; } else if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECT; } else { sk->sk_state = BT_CONNECT; iso_sock_set_timer(sk, sk->sk_sndtimeo); } release_sock(sk); unlock: hci_dev_unlock(hdev); hci_dev_put(hdev); return err; } static int iso_connect_cis(struct sock *sk) { struct iso_conn *conn; struct hci_conn *hcon; struct hci_dev *hdev; int err; BT_DBG("%pMR -> %pMR", &iso_pi(sk)->src, &iso_pi(sk)->dst); hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, iso_pi(sk)->src_type); if (!hdev) return -EHOSTUNREACH; hci_dev_lock(hdev); if (!cis_central_capable(hdev)) { err = -EOPNOTSUPP; goto unlock; } /* Fail if user set invalid QoS */ if (iso_pi(sk)->qos_user_set && !check_ucast_qos(&iso_pi(sk)->qos)) { iso_pi(sk)->qos = default_qos; err = -EINVAL; goto unlock; } /* Fail if either PHYs are marked as disabled */ if (!iso_pi(sk)->qos.ucast.in.phy && !iso_pi(sk)->qos.ucast.out.phy) { err = -EINVAL; goto unlock; } /* Just bind if DEFER_SETUP has been set */ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_cis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), &iso_pi(sk)->qos); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } } else { hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), &iso_pi(sk)->qos); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } } conn = iso_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); err = -ENOMEM; goto unlock; } lock_sock(sk); err = iso_chan_add(conn, sk, NULL); if (err) { release_sock(sk); goto unlock; } /* Update source addr of the socket */ bacpy(&iso_pi(sk)->src, &hcon->src); if (hcon->state == BT_CONNECTED) { iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; } else if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECT; } else { sk->sk_state = BT_CONNECT; iso_sock_set_timer(sk, sk->sk_sndtimeo); } release_sock(sk); unlock: hci_dev_unlock(hdev); hci_dev_put(hdev); return err; } static struct bt_iso_qos *iso_sock_get_qos(struct sock *sk) { if (sk->sk_state == BT_CONNECTED || sk->sk_state == BT_CONNECT2) return &iso_pi(sk)->conn->hcon->iso_qos; return &iso_pi(sk)->qos; } static int iso_send_frame(struct sock *sk, struct sk_buff *skb) { struct iso_conn *conn = iso_pi(sk)->conn; struct bt_iso_qos *qos = iso_sock_get_qos(sk); struct hci_iso_data_hdr *hdr; int len = 0; BT_DBG("sk %p len %d", sk, skb->len); if (skb->len > qos->ucast.out.sdu) return -EMSGSIZE; len = skb->len; /* Push ISO data header */ hdr = skb_push(skb, HCI_ISO_DATA_HDR_SIZE); hdr->sn = cpu_to_le16(conn->tx_sn++); hdr->slen = cpu_to_le16(hci_iso_data_len_pack(len, HCI_ISO_STATUS_VALID)); if (sk->sk_state == BT_CONNECTED) hci_send_iso(conn->hcon, skb); else len = -ENOTCONN; return len; } static void iso_recv_frame(struct iso_conn *conn, struct sk_buff *skb) { struct sock *sk; iso_conn_lock(conn); sk = conn->sk; iso_conn_unlock(conn); if (!sk) goto drop; BT_DBG("sk %p len %d", sk, skb->len); if (sk->sk_state != BT_CONNECTED) goto drop; if (!sock_queue_rcv_skb(sk, skb)) return; drop: kfree_skb(skb); } /* -------- Socket interface ---------- */ static struct sock *__iso_get_sock_listen_by_addr(bdaddr_t *src, bdaddr_t *dst) { struct sock *sk; sk_for_each(sk, &iso_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; if (bacmp(&iso_pi(sk)->dst, dst)) continue; if (!bacmp(&iso_pi(sk)->src, src)) return sk; } return NULL; } static struct sock *__iso_get_sock_listen_by_sid(bdaddr_t *ba, bdaddr_t *bc, __u8 sid) { struct sock *sk; sk_for_each(sk, &iso_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; if (bacmp(&iso_pi(sk)->src, ba)) continue; if (bacmp(&iso_pi(sk)->dst, bc)) continue; if (iso_pi(sk)->bc_sid == sid) return sk; } return NULL; } /* Find socket in given state: * source bdaddr (Unicast) * destination bdaddr (Broadcast only) * match func - pass NULL to ignore * match func data - pass -1 to ignore * Returns closest match. */ static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst, enum bt_sock_state state, iso_sock_match_t match, void *data) { struct sock *sk = NULL, *sk1 = NULL; read_lock(&iso_sk_list.lock); sk_for_each(sk, &iso_sk_list.head) { if (sk->sk_state != state) continue; /* Match Broadcast destination */ if (bacmp(dst, BDADDR_ANY) && bacmp(&iso_pi(sk)->dst, dst)) continue; /* Use Match function if provided */ if (match && !match(sk, data)) continue; /* Exact match. */ if (!bacmp(&iso_pi(sk)->src, src)) { sock_hold(sk); break; } /* Closest match */ if (!bacmp(&iso_pi(sk)->src, BDADDR_ANY)) { if (sk1) sock_put(sk1); sk1 = sk; sock_hold(sk1); } } if (sk && sk1) sock_put(sk1); read_unlock(&iso_sk_list.lock); return sk ? sk : sk1; } static struct sock *iso_get_sock_big(struct sock *match_sk, bdaddr_t *src, bdaddr_t *dst, uint8_t big) { struct sock *sk = NULL; read_lock(&iso_sk_list.lock); sk_for_each(sk, &iso_sk_list.head) { if (match_sk == sk) continue; /* Look for sockets that have already been * connected to the BIG */ if (sk->sk_state != BT_CONNECTED && sk->sk_state != BT_CONNECT) continue; /* Match Broadcast destination */ if (bacmp(&iso_pi(sk)->dst, dst)) continue; /* Match BIG handle */ if (iso_pi(sk)->qos.bcast.big != big) continue; /* Match source address */ if (bacmp(&iso_pi(sk)->src, src)) continue; sock_hold(sk); break; } read_unlock(&iso_sk_list.lock); return sk; } static void iso_sock_destruct(struct sock *sk) { BT_DBG("sk %p", sk); iso_conn_put(iso_pi(sk)->conn); skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); } static void iso_sock_cleanup_listen(struct sock *parent) { struct sock *sk; BT_DBG("parent %p", parent); /* Close not yet accepted channels */ while ((sk = bt_accept_dequeue(parent, NULL))) { iso_sock_close(sk); iso_sock_kill(sk); } /* If listening socket has a hcon, properly disconnect it */ if (iso_pi(parent)->conn && iso_pi(parent)->conn->hcon) { iso_sock_disconn(parent); return; } parent->sk_state = BT_CLOSED; sock_set_flag(parent, SOCK_ZAPPED); } /* Kill socket (only if zapped and orphan) * Must be called on unlocked socket. */ static void iso_sock_kill(struct sock *sk) { if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket || sock_flag(sk, SOCK_DEAD)) return; BT_DBG("sk %p state %d", sk, sk->sk_state); /* Kill poor orphan */ bt_sock_unlink(&iso_sk_list, sk); sock_set_flag(sk, SOCK_DEAD); sock_put(sk); } static void iso_sock_disconn(struct sock *sk) { struct sock *bis_sk; struct hci_conn *hcon = iso_pi(sk)->conn->hcon; if (test_bit(HCI_CONN_BIG_CREATED, &hcon->flags)) { bis_sk = iso_get_sock_big(sk, &iso_pi(sk)->src, &iso_pi(sk)->dst, iso_pi(sk)->qos.bcast.big); /* If there are any other connected sockets for the * same BIG, just delete the sk and leave the bis * hcon active, in case later rebinding is needed. */ if (bis_sk) { hcon->state = BT_OPEN; hcon->iso_data = NULL; iso_pi(sk)->conn->hcon = NULL; iso_sock_clear_timer(sk); iso_chan_del(sk, bt_to_errno(hcon->abort_reason)); sock_put(bis_sk); return; } } sk->sk_state = BT_DISCONN; iso_conn_lock(iso_pi(sk)->conn); hci_conn_drop(iso_pi(sk)->conn->hcon); iso_pi(sk)->conn->hcon = NULL; iso_conn_unlock(iso_pi(sk)->conn); } static void __iso_sock_close(struct sock *sk) { BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket); switch (sk->sk_state) { case BT_LISTEN: iso_sock_cleanup_listen(sk); break; case BT_CONNECT: case BT_CONNECTED: case BT_CONFIG: if (iso_pi(sk)->conn->hcon) iso_sock_disconn(sk); else iso_chan_del(sk, ECONNRESET); break; case BT_CONNECT2: if (iso_pi(sk)->conn->hcon && (test_bit(HCI_CONN_PA_SYNC, &iso_pi(sk)->conn->hcon->flags) || test_bit(HCI_CONN_PA_SYNC_FAILED, &iso_pi(sk)->conn->hcon->flags))) iso_sock_disconn(sk); else iso_chan_del(sk, ECONNRESET); break; case BT_DISCONN: iso_chan_del(sk, ECONNRESET); break; default: sock_set_flag(sk, SOCK_ZAPPED); break; } } /* Must be called on unlocked socket. */ static void iso_sock_close(struct sock *sk) { iso_sock_clear_timer(sk); lock_sock(sk); __iso_sock_close(sk); release_sock(sk); iso_sock_kill(sk); } static void iso_sock_init(struct sock *sk, struct sock *parent) { BT_DBG("sk %p", sk); if (parent) { sk->sk_type = parent->sk_type; bt_sk(sk)->flags = bt_sk(parent)->flags; security_sk_clone(parent, sk); } } static struct proto iso_proto = { .name = "ISO", .owner = THIS_MODULE, .obj_size = sizeof(struct iso_pinfo) }; #define DEFAULT_IO_QOS \ { \ .interval = 10000u, \ .latency = 10u, \ .sdu = 40u, \ .phy = BT_ISO_PHY_2M, \ .rtn = 2u, \ } static struct bt_iso_qos default_qos = { .bcast = { .big = BT_ISO_QOS_BIG_UNSET, .bis = BT_ISO_QOS_BIS_UNSET, .sync_factor = 0x01, .packing = 0x00, .framing = 0x00, .in = DEFAULT_IO_QOS, .out = DEFAULT_IO_QOS, .encryption = 0x00, .bcode = {0x00}, .options = 0x00, .skip = 0x0000, .sync_timeout = BT_ISO_SYNC_TIMEOUT, .sync_cte_type = 0x00, .mse = 0x00, .timeout = BT_ISO_SYNC_TIMEOUT, }, }; static struct sock *iso_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) { struct sock *sk; sk = bt_sock_alloc(net, sock, &iso_proto, proto, prio, kern); if (!sk) return NULL; sk->sk_destruct = iso_sock_destruct; sk->sk_sndtimeo = ISO_CONN_TIMEOUT; /* Set address type as public as default src address is BDADDR_ANY */ iso_pi(sk)->src_type = BDADDR_LE_PUBLIC; iso_pi(sk)->qos = default_qos; iso_pi(sk)->sync_handle = -1; bt_sock_link(&iso_sk_list, sk); return sk; } static int iso_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; BT_DBG("sock %p", sock); sock->state = SS_UNCONNECTED; if (sock->type != SOCK_SEQPACKET) return -ESOCKTNOSUPPORT; sock->ops = &iso_sock_ops; sk = iso_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; iso_sock_init(sk, NULL); return 0; } static int iso_sock_bind_bc(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; struct sock *sk = sock->sk; int i; BT_DBG("sk %p bc_sid %u bc_num_bis %u", sk, sa->iso_bc->bc_sid, sa->iso_bc->bc_num_bis); if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc)) return -EINVAL; bacpy(&iso_pi(sk)->dst, &sa->iso_bc->bc_bdaddr); /* Check if the address type is of LE type */ if (!bdaddr_type_is_le(sa->iso_bc->bc_bdaddr_type)) return -EINVAL; iso_pi(sk)->dst_type = sa->iso_bc->bc_bdaddr_type; if (sa->iso_bc->bc_sid > 0x0f) return -EINVAL; iso_pi(sk)->bc_sid = sa->iso_bc->bc_sid; if (sa->iso_bc->bc_num_bis > ISO_MAX_NUM_BIS) return -EINVAL; iso_pi(sk)->bc_num_bis = sa->iso_bc->bc_num_bis; for (i = 0; i < iso_pi(sk)->bc_num_bis; i++) if (sa->iso_bc->bc_bis[i] < 0x01 || sa->iso_bc->bc_bis[i] > 0x1f) return -EINVAL; memcpy(iso_pi(sk)->bc_bis, sa->iso_bc->bc_bis, iso_pi(sk)->bc_num_bis); return 0; } static int iso_sock_bind_pa_sk(struct sock *sk, struct sockaddr_iso *sa, int addr_len) { int err = 0; if (sk->sk_type != SOCK_SEQPACKET) { err = -EINVAL; goto done; } if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc)) { err = -EINVAL; goto done; } if (sa->iso_bc->bc_num_bis > ISO_MAX_NUM_BIS) { err = -EINVAL; goto done; } iso_pi(sk)->bc_num_bis = sa->iso_bc->bc_num_bis; for (int i = 0; i < iso_pi(sk)->bc_num_bis; i++) if (sa->iso_bc->bc_bis[i] < 0x01 || sa->iso_bc->bc_bis[i] > 0x1f) { err = -EINVAL; goto done; } memcpy(iso_pi(sk)->bc_bis, sa->iso_bc->bc_bis, iso_pi(sk)->bc_num_bis); done: return err; } static int iso_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; struct sock *sk = sock->sk; int err = 0; BT_DBG("sk %p %pMR type %u", sk, &sa->iso_bdaddr, sa->iso_bdaddr_type); if (!addr || addr_len < sizeof(struct sockaddr_iso) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; lock_sock(sk); /* Allow the user to bind a PA sync socket to a number * of BISes to sync to. */ if ((sk->sk_state == BT_CONNECT2 || sk->sk_state == BT_CONNECTED) && test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) { err = iso_sock_bind_pa_sk(sk, sa, addr_len); goto done; } if (sk->sk_state != BT_OPEN) { err = -EBADFD; goto done; } if (sk->sk_type != SOCK_SEQPACKET) { err = -EINVAL; goto done; } /* Check if the address type is of LE type */ if (!bdaddr_type_is_le(sa->iso_bdaddr_type)) { err = -EINVAL; goto done; } bacpy(&iso_pi(sk)->src, &sa->iso_bdaddr); iso_pi(sk)->src_type = sa->iso_bdaddr_type; /* Check for Broadcast address */ if (addr_len > sizeof(*sa)) { err = iso_sock_bind_bc(sock, addr, addr_len); if (err) goto done; } sk->sk_state = BT_BOUND; done: release_sock(sk); return err; } static int iso_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; struct sock *sk = sock->sk; int err; BT_DBG("sk %p", sk); if (alen < sizeof(struct sockaddr_iso) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) return -EBADFD; if (sk->sk_type != SOCK_SEQPACKET) return -EINVAL; /* Check if the address type is of LE type */ if (!bdaddr_type_is_le(sa->iso_bdaddr_type)) return -EINVAL; lock_sock(sk); bacpy(&iso_pi(sk)->dst, &sa->iso_bdaddr); iso_pi(sk)->dst_type = sa->iso_bdaddr_type; release_sock(sk); if (bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) err = iso_connect_cis(sk); else err = iso_connect_bis(sk); if (err) return err; lock_sock(sk); if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { err = bt_sock_wait_state(sk, BT_CONNECTED, sock_sndtimeo(sk, flags & O_NONBLOCK)); } release_sock(sk); return err; } static int iso_listen_bis(struct sock *sk) { struct hci_dev *hdev; int err = 0; struct iso_conn *conn; struct hci_conn *hcon; BT_DBG("%pMR -> %pMR (SID 0x%2.2x)", &iso_pi(sk)->src, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid); write_lock(&iso_sk_list.lock); if (__iso_get_sock_listen_by_sid(&iso_pi(sk)->src, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid)) err = -EADDRINUSE; write_unlock(&iso_sk_list.lock); if (err) return err; hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, iso_pi(sk)->src_type); if (!hdev) return -EHOSTUNREACH; hci_dev_lock(hdev); lock_sock(sk); /* Fail if user set invalid QoS */ if (iso_pi(sk)->qos_user_set && !check_bcast_qos(&iso_pi(sk)->qos)) { iso_pi(sk)->qos = default_qos; err = -EINVAL; goto unlock; } hcon = hci_pa_create_sync(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), iso_pi(sk)->bc_sid, &iso_pi(sk)->qos); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } conn = iso_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); err = -ENOMEM; goto unlock; } err = iso_chan_add(conn, sk, NULL); if (err) { hci_conn_drop(hcon); goto unlock; } unlock: release_sock(sk); hci_dev_unlock(hdev); hci_dev_put(hdev); return err; } static int iso_listen_cis(struct sock *sk) { int err = 0; BT_DBG("%pMR", &iso_pi(sk)->src); write_lock(&iso_sk_list.lock); if (__iso_get_sock_listen_by_addr(&iso_pi(sk)->src, &iso_pi(sk)->dst)) err = -EADDRINUSE; write_unlock(&iso_sk_list.lock); return err; } static int iso_sock_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sk %p backlog %d", sk, backlog); sock_hold(sk); lock_sock(sk); if (sk->sk_state != BT_BOUND) { err = -EBADFD; goto done; } if (sk->sk_type != SOCK_SEQPACKET) { err = -EINVAL; goto done; } if (!bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) { err = iso_listen_cis(sk); } else { /* Drop sock lock to avoid potential * deadlock with the hdev lock. */ release_sock(sk); err = iso_listen_bis(sk); lock_sock(sk); } if (err) goto done; sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = BT_LISTEN; done: release_sock(sk); sock_put(sk); return err; } static int iso_sock_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *ch; long timeo; int err = 0; /* Use explicit nested locking to avoid lockdep warnings generated * because the parent socket and the child socket are locked on the * same thread. */ lock_sock_nested(sk, SINGLE_DEPTH_NESTING); timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); BT_DBG("sk %p timeo %ld", sk, timeo); /* Wait for an incoming connection. (wake-one). */ add_wait_queue_exclusive(sk_sleep(sk), &wait); while (1) { if (sk->sk_state != BT_LISTEN) { err = -EBADFD; break; } ch = bt_accept_dequeue(sk, newsock); if (ch) break; if (!timeo) { err = -EAGAIN; break; } if (signal_pending(current)) { err = sock_intr_errno(timeo); break; } release_sock(sk); timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); } remove_wait_queue(sk_sleep(sk), &wait); if (err) goto done; newsock->state = SS_CONNECTED; BT_DBG("new socket %p", ch); done: release_sock(sk); return err; } static int iso_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; struct sock *sk = sock->sk; BT_DBG("sock %p, sk %p", sock, sk); addr->sa_family = AF_BLUETOOTH; if (peer) { bacpy(&sa->iso_bdaddr, &iso_pi(sk)->dst); sa->iso_bdaddr_type = iso_pi(sk)->dst_type; } else { bacpy(&sa->iso_bdaddr, &iso_pi(sk)->src); sa->iso_bdaddr_type = iso_pi(sk)->src_type; } return sizeof(struct sockaddr_iso); } static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb, **frag; size_t mtu; int err; BT_DBG("sock %p, sk %p", sock, sk); err = sock_error(sk); if (err) return err; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; lock_sock(sk); if (sk->sk_state != BT_CONNECTED) { release_sock(sk); return -ENOTCONN; } mtu = iso_pi(sk)->conn->hcon->mtu; release_sock(sk); skb = bt_skb_sendmsg(sk, msg, len, mtu, HCI_ISO_DATA_HDR_SIZE, 0); if (IS_ERR(skb)) return PTR_ERR(skb); len -= skb->len; BT_DBG("skb %p len %d", sk, skb->len); /* Continuation fragments */ frag = &skb_shinfo(skb)->frag_list; while (len) { struct sk_buff *tmp; tmp = bt_skb_sendmsg(sk, msg, len, mtu, 0, 0); if (IS_ERR(tmp)) { kfree_skb(skb); return PTR_ERR(tmp); } *frag = tmp; len -= tmp->len; skb->len += tmp->len; skb->data_len += tmp->len; BT_DBG("frag %p len %d", *frag, tmp->len); frag = &(*frag)->next; } lock_sock(sk); if (sk->sk_state == BT_CONNECTED) err = iso_send_frame(sk, skb); else err = -ENOTCONN; release_sock(sk); if (err < 0) kfree_skb(skb); return err; } static void iso_conn_defer_accept(struct hci_conn *conn) { struct hci_cp_le_accept_cis cp; struct hci_dev *hdev = conn->hdev; BT_DBG("conn %p", conn); conn->state = BT_CONFIG; cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(hdev, HCI_OP_LE_ACCEPT_CIS, sizeof(cp), &cp); } static void iso_conn_big_sync(struct sock *sk) { int err; struct hci_dev *hdev; hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, iso_pi(sk)->src_type); if (!hdev) return; /* hci_le_big_create_sync requires hdev lock to be held, since * it enqueues the HCI LE BIG Create Sync command via * hci_cmd_sync_queue_once, which checks hdev flags that might * change. */ hci_dev_lock(hdev); lock_sock(sk); if (!test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { err = hci_le_big_create_sync(hdev, iso_pi(sk)->conn->hcon, &iso_pi(sk)->qos, iso_pi(sk)->sync_handle, iso_pi(sk)->bc_num_bis, iso_pi(sk)->bc_bis); if (err) bt_dev_err(hdev, "hci_le_big_create_sync: %d", err); } release_sock(sk); hci_dev_unlock(hdev); } static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct iso_pinfo *pi = iso_pi(sk); bool early_ret = false; int err = 0; BT_DBG("sk %p", sk); if (test_and_clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { sock_hold(sk); lock_sock(sk); switch (sk->sk_state) { case BT_CONNECT2: if (test_bit(BT_SK_PA_SYNC, &pi->flags)) { release_sock(sk); iso_conn_big_sync(sk); lock_sock(sk); sk->sk_state = BT_LISTEN; } else { iso_conn_defer_accept(pi->conn->hcon); sk->sk_state = BT_CONFIG; } early_ret = true; break; case BT_CONNECTED: if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) { release_sock(sk); iso_conn_big_sync(sk); lock_sock(sk); sk->sk_state = BT_LISTEN; early_ret = true; } break; case BT_CONNECT: release_sock(sk); err = iso_connect_cis(sk); lock_sock(sk); early_ret = true; break; default: break; } release_sock(sk); sock_put(sk); if (early_ret) return err; } return bt_sock_recvmsg(sock, msg, len, flags); } static bool check_io_qos(struct bt_iso_io_qos *qos) { /* If no PHY is enable SDU must be 0 */ if (!qos->phy && qos->sdu) return false; if (qos->interval && (qos->interval < 0xff || qos->interval > 0xfffff)) return false; if (qos->latency && (qos->latency < 0x05 || qos->latency > 0xfa0)) return false; if (qos->phy > BT_ISO_PHY_ANY) return false; return true; } static bool check_ucast_qos(struct bt_iso_qos *qos) { if (qos->ucast.cig > 0xef && qos->ucast.cig != BT_ISO_QOS_CIG_UNSET) return false; if (qos->ucast.cis > 0xef && qos->ucast.cis != BT_ISO_QOS_CIS_UNSET) return false; if (qos->ucast.sca > 0x07) return false; if (qos->ucast.packing > 0x01) return false; if (qos->ucast.framing > 0x01) return false; if (!check_io_qos(&qos->ucast.in)) return false; if (!check_io_qos(&qos->ucast.out)) return false; return true; } static bool check_bcast_qos(struct bt_iso_qos *qos) { if (!qos->bcast.sync_factor) qos->bcast.sync_factor = 0x01; if (qos->bcast.packing > 0x01) return false; if (qos->bcast.framing > 0x01) return false; if (!check_io_qos(&qos->bcast.in)) return false; if (!check_io_qos(&qos->bcast.out)) return false; if (qos->bcast.encryption > 0x01) return false; if (qos->bcast.options > 0x07) return false; if (qos->bcast.skip > 0x01f3) return false; if (!qos->bcast.sync_timeout) qos->bcast.sync_timeout = BT_ISO_SYNC_TIMEOUT; if (qos->bcast.sync_timeout < 0x000a || qos->bcast.sync_timeout > 0x4000) return false; if (qos->bcast.sync_cte_type > 0x1f) return false; if (qos->bcast.mse > 0x1f) return false; if (!qos->bcast.timeout) qos->bcast.sync_timeout = BT_ISO_SYNC_TIMEOUT; if (qos->bcast.timeout < 0x000a || qos->bcast.timeout > 0x4000) return false; return true; } static int iso_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0; struct bt_iso_qos qos = default_qos; u32 opt; BT_DBG("sk %p", sk); lock_sock(sk); switch (optname) { case BT_DEFER_SETUP: if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { err = -EINVAL; break; } err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); else clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); break; case BT_PKT_STATUS: err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); else clear_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); break; case BT_ISO_QOS: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECT2 && (!test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags) || sk->sk_state != BT_CONNECTED)) { err = -EINVAL; break; } err = copy_safe_from_sockptr(&qos, sizeof(qos), optval, optlen); if (err) break; iso_pi(sk)->qos = qos; iso_pi(sk)->qos_user_set = true; break; case BT_ISO_BASE: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECT2) { err = -EINVAL; break; } if (optlen > sizeof(iso_pi(sk)->base)) { err = -EINVAL; break; } err = copy_safe_from_sockptr(iso_pi(sk)->base, optlen, optval, optlen); if (err) break; iso_pi(sk)->base_len = optlen; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int iso_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int len, err = 0; struct bt_iso_qos *qos; u8 base_len; u8 *base; BT_DBG("sk %p", sk); if (get_user(len, optlen)) return -EFAULT; lock_sock(sk); switch (optname) { case BT_DEFER_SETUP: if (sk->sk_state == BT_CONNECTED) { err = -EINVAL; break; } if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags), (u32 __user *)optval)) err = -EFAULT; break; case BT_PKT_STATUS: if (put_user(test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags), (int __user *)optval)) err = -EFAULT; break; case BT_ISO_QOS: qos = iso_sock_get_qos(sk); len = min_t(unsigned int, len, sizeof(*qos)); if (copy_to_user(optval, qos, len)) err = -EFAULT; break; case BT_ISO_BASE: if (sk->sk_state == BT_CONNECTED && !bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) { base_len = iso_pi(sk)->conn->hcon->le_per_adv_data_len; base = iso_pi(sk)->conn->hcon->le_per_adv_data; } else { base_len = iso_pi(sk)->base_len; base = iso_pi(sk)->base; } len = min_t(unsigned int, len, base_len); if (copy_to_user(optval, base, len)) err = -EFAULT; if (put_user(len, optlen)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int iso_sock_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sock %p, sk %p, how %d", sock, sk, how); if (!sk) return 0; sock_hold(sk); lock_sock(sk); switch (how) { case SHUT_RD: if (sk->sk_shutdown & RCV_SHUTDOWN) goto unlock; sk->sk_shutdown |= RCV_SHUTDOWN; break; case SHUT_WR: if (sk->sk_shutdown & SEND_SHUTDOWN) goto unlock; sk->sk_shutdown |= SEND_SHUTDOWN; break; case SHUT_RDWR: if (sk->sk_shutdown & SHUTDOWN_MASK) goto unlock; sk->sk_shutdown |= SHUTDOWN_MASK; break; } iso_sock_clear_timer(sk); __iso_sock_close(sk); if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && !(current->flags & PF_EXITING)) err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); unlock: release_sock(sk); sock_put(sk); return err; } static int iso_sock_release(struct socket *sock) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sock %p, sk %p", sock, sk); if (!sk) return 0; iso_sock_close(sk); if (sock_flag(sk, SOCK_LINGER) && READ_ONCE(sk->sk_lingertime) && !(current->flags & PF_EXITING)) { lock_sock(sk); err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); release_sock(sk); } sock_orphan(sk); iso_sock_kill(sk); return err; } static void iso_sock_ready(struct sock *sk) { BT_DBG("sk %p", sk); if (!sk) return; lock_sock(sk); iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; sk->sk_state_change(sk); release_sock(sk); } static bool iso_match_big(struct sock *sk, void *data) { struct hci_evt_le_big_sync_estabilished *ev = data; return ev->handle == iso_pi(sk)->qos.bcast.big; } static bool iso_match_big_hcon(struct sock *sk, void *data) { struct hci_conn *hcon = data; return hcon->iso_qos.bcast.big == iso_pi(sk)->qos.bcast.big; } static bool iso_match_pa_sync_flag(struct sock *sk, void *data) { return test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags); } static void iso_conn_ready(struct iso_conn *conn) { struct sock *parent = NULL; struct sock *sk = conn->sk; struct hci_ev_le_big_sync_estabilished *ev = NULL; struct hci_ev_le_pa_sync_established *ev2 = NULL; struct hci_ev_le_per_adv_report *ev3 = NULL; struct hci_conn *hcon; BT_DBG("conn %p", conn); if (sk) { iso_sock_ready(conn->sk); } else { hcon = conn->hcon; if (!hcon) return; if (test_bit(HCI_CONN_BIG_SYNC, &hcon->flags)) { /* A BIS slave hcon is notified to the ISO layer * after the Command Complete for the LE Setup * ISO Data Path command is received. Get the * parent socket that matches the hcon BIG handle. */ parent = iso_get_sock(&hcon->src, &hcon->dst, BT_LISTEN, iso_match_big_hcon, hcon); } else if (test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags)) { ev = hci_recv_event_data(hcon->hdev, HCI_EVT_LE_BIG_SYNC_ESTABILISHED); /* Get reference to PA sync parent socket, if it exists */ parent = iso_get_sock(&hcon->src, &hcon->dst, BT_LISTEN, iso_match_pa_sync_flag, NULL); if (!parent && ev) parent = iso_get_sock(&hcon->src, &hcon->dst, BT_LISTEN, iso_match_big, ev); } else if (test_bit(HCI_CONN_PA_SYNC_FAILED, &hcon->flags)) { ev2 = hci_recv_event_data(hcon->hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED); if (ev2) parent = iso_get_sock(&hcon->src, &hcon->dst, BT_LISTEN, iso_match_sid, ev2); } else if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags)) { ev3 = hci_recv_event_data(hcon->hdev, HCI_EV_LE_PER_ADV_REPORT); if (ev3) parent = iso_get_sock(&hcon->src, &hcon->dst, BT_LISTEN, iso_match_sync_handle_pa_report, ev3); } if (!parent) parent = iso_get_sock(&hcon->src, BDADDR_ANY, BT_LISTEN, NULL, NULL); if (!parent) return; lock_sock(parent); sk = iso_sock_alloc(sock_net(parent), NULL, BTPROTO_ISO, GFP_ATOMIC, 0); if (!sk) { release_sock(parent); return; } iso_sock_init(sk, parent); bacpy(&iso_pi(sk)->src, &hcon->src); /* Convert from HCI to three-value type */ if (hcon->src_type == ADDR_LE_DEV_PUBLIC) iso_pi(sk)->src_type = BDADDR_LE_PUBLIC; else iso_pi(sk)->src_type = BDADDR_LE_RANDOM; /* If hcon has no destination address (BDADDR_ANY) it means it * was created by HCI_EV_LE_BIG_SYNC_ESTABILISHED or * HCI_EV_LE_PA_SYNC_ESTABLISHED so we need to initialize using * the parent socket destination address. */ if (!bacmp(&hcon->dst, BDADDR_ANY)) { bacpy(&hcon->dst, &iso_pi(parent)->dst); hcon->dst_type = iso_pi(parent)->dst_type; } if (ev3) { iso_pi(sk)->qos = iso_pi(parent)->qos; hcon->iso_qos = iso_pi(sk)->qos; iso_pi(sk)->bc_num_bis = iso_pi(parent)->bc_num_bis; memcpy(iso_pi(sk)->bc_bis, iso_pi(parent)->bc_bis, ISO_MAX_NUM_BIS); set_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags); } bacpy(&iso_pi(sk)->dst, &hcon->dst); iso_pi(sk)->dst_type = hcon->dst_type; iso_pi(sk)->sync_handle = iso_pi(parent)->sync_handle; memcpy(iso_pi(sk)->base, iso_pi(parent)->base, iso_pi(parent)->base_len); iso_pi(sk)->base_len = iso_pi(parent)->base_len; hci_conn_hold(hcon); iso_chan_add(conn, sk, parent); if ((ev && ((struct hci_evt_le_big_sync_estabilished *)ev)->status) || (ev2 && ev2->status)) { /* Trigger error signal on child socket */ sk->sk_err = ECONNREFUSED; sk->sk_error_report(sk); } if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags)) sk->sk_state = BT_CONNECT2; else sk->sk_state = BT_CONNECTED; /* Wake up parent */ parent->sk_data_ready(parent); release_sock(parent); sock_put(parent); } } static bool iso_match_sid(struct sock *sk, void *data) { struct hci_ev_le_pa_sync_established *ev = data; return ev->sid == iso_pi(sk)->bc_sid; } static bool iso_match_sync_handle(struct sock *sk, void *data) { struct hci_evt_le_big_info_adv_report *ev = data; return le16_to_cpu(ev->sync_handle) == iso_pi(sk)->sync_handle; } static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data) { struct hci_ev_le_per_adv_report *ev = data; return le16_to_cpu(ev->sync_handle) == iso_pi(sk)->sync_handle; } /* ----- ISO interface with lower layer (HCI) ----- */ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) { struct hci_ev_le_pa_sync_established *ev1; struct hci_evt_le_big_info_adv_report *ev2; struct hci_ev_le_per_adv_report *ev3; struct sock *sk; bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr); /* Broadcast receiver requires handling of some events before it can * proceed to establishing a BIG sync: * * 1. HCI_EV_LE_PA_SYNC_ESTABLISHED: The socket may specify a specific * SID to listen to and once sync is estabilished its handle needs to * be stored in iso_pi(sk)->sync_handle so it can be matched once * receiving the BIG Info. * 2. HCI_EVT_LE_BIG_INFO_ADV_REPORT: When connect_ind is triggered by a * a BIG Info it attempts to check if there any listening socket with * the same sync_handle and if it does then attempt to create a sync. * 3. HCI_EV_LE_PER_ADV_REPORT: When a PA report is received, it is stored * in iso_pi(sk)->base so it can be passed up to user, in the case of a * broadcast sink. */ ev1 = hci_recv_event_data(hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED); if (ev1) { sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sid, ev1); if (sk && !ev1->status) iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle); goto done; } ev2 = hci_recv_event_data(hdev, HCI_EVT_LE_BIG_INFO_ADV_REPORT); if (ev2) { /* Check if BIGInfo report has already been handled */ sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_CONNECTED, iso_match_sync_handle, ev2); if (sk) { sock_put(sk); sk = NULL; goto done; } /* Try to get PA sync socket, if it exists */ sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_CONNECT2, iso_match_sync_handle, ev2); if (!sk) sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sync_handle, ev2); if (sk) { int err; struct hci_conn *hcon = iso_pi(sk)->conn->hcon; iso_pi(sk)->qos.bcast.encryption = ev2->encryption; if (ev2->num_bis < iso_pi(sk)->bc_num_bis) iso_pi(sk)->bc_num_bis = ev2->num_bis; if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) && !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { err = hci_le_big_create_sync(hdev, hcon, &iso_pi(sk)->qos, iso_pi(sk)->sync_handle, iso_pi(sk)->bc_num_bis, iso_pi(sk)->bc_bis); if (err) { bt_dev_err(hdev, "hci_le_big_create_sync: %d", err); sock_put(sk); sk = NULL; } } } goto done; } ev3 = hci_recv_event_data(hdev, HCI_EV_LE_PER_ADV_REPORT); if (ev3) { size_t base_len = 0; u8 *base; struct hci_conn *hcon; sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sync_handle_pa_report, ev3); if (!sk) goto done; hcon = iso_pi(sk)->conn->hcon; if (!hcon) goto done; if (ev3->data_status == LE_PA_DATA_TRUNCATED) { /* The controller was unable to retrieve PA data. */ memset(hcon->le_per_adv_data, 0, HCI_MAX_PER_AD_TOT_LEN); hcon->le_per_adv_data_len = 0; hcon->le_per_adv_data_offset = 0; goto done; } if (hcon->le_per_adv_data_offset + ev3->length > HCI_MAX_PER_AD_TOT_LEN) goto done; memcpy(hcon->le_per_adv_data + hcon->le_per_adv_data_offset, ev3->data, ev3->length); hcon->le_per_adv_data_offset += ev3->length; if (ev3->data_status == LE_PA_DATA_COMPLETE) { /* All PA data has been received. */ hcon->le_per_adv_data_len = hcon->le_per_adv_data_offset; hcon->le_per_adv_data_offset = 0; /* Extract BASE */ base = eir_get_service_data(hcon->le_per_adv_data, hcon->le_per_adv_data_len, EIR_BAA_SERVICE_UUID, &base_len); if (!base || base_len > BASE_MAX_LENGTH) goto done; memcpy(iso_pi(sk)->base, base, base_len); iso_pi(sk)->base_len = base_len; } else { /* This is a PA data fragment. Keep pa_data_len set to 0 * until all data has been reassembled. */ hcon->le_per_adv_data_len = 0; } } else { sk = iso_get_sock(&hdev->bdaddr, BDADDR_ANY, BT_LISTEN, NULL, NULL); } done: if (!sk) return 0; if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) *flags |= HCI_PROTO_DEFER; sock_put(sk); return HCI_LM_ACCEPT; } static bool iso_match(struct hci_conn *hcon) { return hcon->type == ISO_LINK || hcon->type == LE_LINK; } static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) { if (hcon->type != ISO_LINK) { if (hcon->type != LE_LINK) return; /* Check if LE link has failed */ if (status) { struct hci_link *link, *t; list_for_each_entry_safe(link, t, &hcon->link_list, list) iso_conn_del(link->conn, bt_to_errno(status)); return; } /* Create CIS if pending */ hci_le_create_cis_pending(hcon->hdev); return; } BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status); /* Similar to the success case, if HCI_CONN_BIG_SYNC_FAILED or * HCI_CONN_PA_SYNC_FAILED is set, queue the failed connection * into the accept queue of the listening socket and wake up * userspace, to inform the user about the event. */ if (!status || test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags) || test_bit(HCI_CONN_PA_SYNC_FAILED, &hcon->flags)) { struct iso_conn *conn; conn = iso_conn_add(hcon); if (conn) iso_conn_ready(conn); } else { iso_conn_del(hcon, bt_to_errno(status)); } } static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason) { if (hcon->type != ISO_LINK) return; BT_DBG("hcon %p reason %d", hcon, reason); iso_conn_del(hcon, bt_to_errno(reason)); } void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) { struct iso_conn *conn = hcon->iso_data; __u16 pb, ts, len; if (!conn) goto drop; pb = hci_iso_flags_pb(flags); ts = hci_iso_flags_ts(flags); BT_DBG("conn %p len %d pb 0x%x ts 0x%x", conn, skb->len, pb, ts); switch (pb) { case ISO_START: case ISO_SINGLE: if (conn->rx_len) { BT_ERR("Unexpected start frame (len %d)", skb->len); kfree_skb(conn->rx_skb); conn->rx_skb = NULL; conn->rx_len = 0; } if (ts) { struct hci_iso_ts_data_hdr *hdr; /* TODO: add timestamp to the packet? */ hdr = skb_pull_data(skb, HCI_ISO_TS_DATA_HDR_SIZE); if (!hdr) { BT_ERR("Frame is too short (len %d)", skb->len); goto drop; } len = __le16_to_cpu(hdr->slen); } else { struct hci_iso_data_hdr *hdr; hdr = skb_pull_data(skb, HCI_ISO_DATA_HDR_SIZE); if (!hdr) { BT_ERR("Frame is too short (len %d)", skb->len); goto drop; } len = __le16_to_cpu(hdr->slen); } flags = hci_iso_data_flags(len); len = hci_iso_data_len(len); BT_DBG("Start: total len %d, frag len %d flags 0x%4.4x", len, skb->len, flags); if (len == skb->len) { /* Complete frame received */ hci_skb_pkt_status(skb) = flags & 0x03; iso_recv_frame(conn, skb); return; } if (pb == ISO_SINGLE) { BT_ERR("Frame malformed (len %d, expected len %d)", skb->len, len); goto drop; } if (skb->len > len) { BT_ERR("Frame is too long (len %d, expected len %d)", skb->len, len); goto drop; } /* Allocate skb for the complete frame (with header) */ conn->rx_skb = bt_skb_alloc(len, GFP_KERNEL); if (!conn->rx_skb) goto drop; hci_skb_pkt_status(conn->rx_skb) = flags & 0x03; skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len = len - skb->len; break; case ISO_CONT: BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len); if (!conn->rx_len) { BT_ERR("Unexpected continuation frame (len %d)", skb->len); goto drop; } if (skb->len > conn->rx_len) { BT_ERR("Fragment is too long (len %d, expected %d)", skb->len, conn->rx_len); kfree_skb(conn->rx_skb); conn->rx_skb = NULL; conn->rx_len = 0; goto drop; } skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len -= skb->len; return; case ISO_END: skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len -= skb->len; if (!conn->rx_len) { struct sk_buff *rx_skb = conn->rx_skb; /* Complete frame received. iso_recv_frame * takes ownership of the skb so set the global * rx_skb pointer to NULL first. */ conn->rx_skb = NULL; iso_recv_frame(conn, rx_skb); } break; } drop: kfree_skb(skb); } static struct hci_cb iso_cb = { .name = "ISO", .match = iso_match, .connect_cfm = iso_connect_cfm, .disconn_cfm = iso_disconn_cfm, }; static int iso_debugfs_show(struct seq_file *f, void *p) { struct sock *sk; read_lock(&iso_sk_list.lock); sk_for_each(sk, &iso_sk_list.head) { seq_printf(f, "%pMR %pMR %d\n", &iso_pi(sk)->src, &iso_pi(sk)->dst, sk->sk_state); } read_unlock(&iso_sk_list.lock); return 0; } DEFINE_SHOW_ATTRIBUTE(iso_debugfs); static struct dentry *iso_debugfs; static const struct proto_ops iso_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = iso_sock_release, .bind = iso_sock_bind, .connect = iso_sock_connect, .listen = iso_sock_listen, .accept = iso_sock_accept, .getname = iso_sock_getname, .sendmsg = iso_sock_sendmsg, .recvmsg = iso_sock_recvmsg, .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, .shutdown = iso_sock_shutdown, .setsockopt = iso_sock_setsockopt, .getsockopt = iso_sock_getsockopt }; static const struct net_proto_family iso_sock_family_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .create = iso_sock_create, }; static bool iso_inited; bool iso_enabled(void) { return iso_inited; } int iso_init(void) { int err; BUILD_BUG_ON(sizeof(struct sockaddr_iso) > sizeof(struct sockaddr)); if (iso_inited) return -EALREADY; err = proto_register(&iso_proto, 0); if (err < 0) return err; err = bt_sock_register(BTPROTO_ISO, &iso_sock_family_ops); if (err < 0) { BT_ERR("ISO socket registration failed"); goto error; } err = bt_procfs_init(&init_net, "iso", &iso_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create ISO proc file"); bt_sock_unregister(BTPROTO_ISO); goto error; } BT_INFO("ISO socket layer initialized"); hci_register_cb(&iso_cb); if (!IS_ERR_OR_NULL(bt_debugfs)) iso_debugfs = debugfs_create_file("iso", 0444, bt_debugfs, NULL, &iso_debugfs_fops); iso_inited = true; return 0; error: proto_unregister(&iso_proto); return err; } int iso_exit(void) { if (!iso_inited) return -EALREADY; bt_procfs_cleanup(&init_net, "iso"); debugfs_remove(iso_debugfs); iso_debugfs = NULL; hci_unregister_cb(&iso_cb); bt_sock_unregister(BTPROTO_ISO); proto_unregister(&iso_proto); iso_inited = false; return 0; }
216 20648 658 20889 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _LINUX_FILE_REF_H #define _LINUX_FILE_REF_H #include <linux/atomic.h> #include <linux/preempt.h> #include <linux/types.h> /* * file_ref is a reference count implementation specifically for use by * files. It takes inspiration from rcuref but differs in key aspects * such as support for SLAB_TYPESAFE_BY_RCU type caches. * * FILE_REF_ONEREF FILE_REF_MAXREF * 0x0000000000000000UL 0x7FFFFFFFFFFFFFFFUL * <-------------------valid -------------------> * * FILE_REF_SATURATED * 0x8000000000000000UL 0xA000000000000000UL 0xBFFFFFFFFFFFFFFFUL * <-----------------------saturation zone----------------------> * * FILE_REF_RELEASED FILE_REF_DEAD * 0xC000000000000000UL 0xE000000000000000UL * <-------------------dead zone-------------------> * * FILE_REF_NOREF * 0xFFFFFFFFFFFFFFFFUL */ #ifdef CONFIG_64BIT #define FILE_REF_ONEREF 0x0000000000000000UL #define FILE_REF_MAXREF 0x7FFFFFFFFFFFFFFFUL #define FILE_REF_SATURATED 0xA000000000000000UL #define FILE_REF_RELEASED 0xC000000000000000UL #define FILE_REF_DEAD 0xE000000000000000UL #define FILE_REF_NOREF 0xFFFFFFFFFFFFFFFFUL #else #define FILE_REF_ONEREF 0x00000000U #define FILE_REF_MAXREF 0x7FFFFFFFU #define FILE_REF_SATURATED 0xA0000000U #define FILE_REF_RELEASED 0xC0000000U #define FILE_REF_DEAD 0xE0000000U #define FILE_REF_NOREF 0xFFFFFFFFU #endif typedef struct { #ifdef CONFIG_64BIT atomic64_t refcnt; #else atomic_t refcnt; #endif } file_ref_t; /** * file_ref_init - Initialize a file reference count * @ref: Pointer to the reference count * @cnt: The initial reference count typically '1' */ static inline void file_ref_init(file_ref_t *ref, unsigned long cnt) { atomic_long_set(&ref->refcnt, cnt - 1); } bool __file_ref_put(file_ref_t *ref, unsigned long cnt); /** * file_ref_get - Acquire one reference on a file * @ref: Pointer to the reference count * * Similar to atomic_inc_not_zero() but saturates at FILE_REF_MAXREF. * * Provides full memory ordering. * * Return: False if the attempt to acquire a reference failed. This happens * when the last reference has been put already. True if a reference * was successfully acquired */ static __always_inline __must_check bool file_ref_get(file_ref_t *ref) { /* * Unconditionally increase the reference count with full * ordering. The saturation and dead zones provide enough * tolerance for this. * * If this indicates negative the file in question the fail can * be freed and immediately reused due to SLAB_TYPSAFE_BY_RCU. * Hence, unconditionally altering the file reference count to * e.g., reset the file reference count back to the middle of * the deadzone risk end up marking someone else's file as dead * behind their back. * * It would be possible to do a careful: * * cnt = atomic_long_inc_return(); * if (likely(cnt >= 0)) * return true; * * and then something like: * * if (cnt >= FILE_REF_RELEASE) * atomic_long_try_cmpxchg(&ref->refcnt, &cnt, FILE_REF_DEAD), * * to set the value back to the middle of the deadzone. But it's * practically impossible to go from FILE_REF_DEAD to * FILE_REF_ONEREF. It would need 2305843009213693952/2^61 * file_ref_get()s to resurrect such a dead file. */ return !atomic_long_add_negative(1, &ref->refcnt); } /** * file_ref_inc - Acquire one reference on a file * @ref: Pointer to the reference count * * Acquire an additional reference on a file. Warns if the caller didn't * already hold a reference. */ static __always_inline void file_ref_inc(file_ref_t *ref) { long prior = atomic_long_fetch_inc_relaxed(&ref->refcnt); WARN_ONCE(prior < 0, "file_ref_inc() on a released file reference"); } /** * file_ref_put -- Release a file reference * @ref: Pointer to the reference count * * Provides release memory ordering, such that prior loads and stores * are done before, and provides an acquire ordering on success such * that free() must come after. * * Return: True if this was the last reference with no future references * possible. This signals the caller that it can safely release * the object which is protected by the reference counter. * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * release the protected object. */ static __always_inline __must_check bool file_ref_put(file_ref_t *ref) { long cnt; /* * While files are SLAB_TYPESAFE_BY_RCU and thus file_ref_put() * calls don't risk UAFs when a file is recyclyed, it is still * vulnerable to UAFs caused by freeing the whole slab page once * it becomes unused. Prevent file_ref_put() from being * preempted protects against this. */ guard(preempt)(); /* * Unconditionally decrease the reference count. The saturation * and dead zones provide enough tolerance for this. If this * fails then we need to handle the last reference drop and * cases inside the saturation and dead zones. */ cnt = atomic_long_dec_return(&ref->refcnt); if (cnt >= 0) return false; return __file_ref_put(ref, cnt); } /** * file_ref_read - Read the number of file references * @ref: Pointer to the reference count * * Return: The number of held references (0 ... N) */ static inline unsigned long file_ref_read(file_ref_t *ref) { unsigned long c = atomic_long_read(&ref->refcnt); /* Return 0 if within the DEAD zone. */ return c >= FILE_REF_RELEASED ? 0 : c + 1; } #endif
6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007 * * Author: Eric Biederman <ebiederm@xmision.com> */ #include <linux/module.h> #include <linux/ipc.h> #include <linux/nsproxy.h> #include <linux/sysctl.h> #include <linux/uaccess.h> #include <linux/capability.h> #include <linux/ipc_namespace.h> #include <linux/msg.h> #include <linux/slab.h> #include <linux/cred.h> #include "util.h" static int proc_ipc_dointvec_minmax_orphans(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ipc_namespace *ns = container_of(table->data, struct ipc_namespace, shm_rmid_forced); int err; err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (err < 0) return err; if (ns->shm_rmid_forced) shm_destroy_orphaned(ns); return err; } static int proc_ipc_auto_msgmni(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; int dummy = 0; memcpy(&ipc_table, table, sizeof(ipc_table)); ipc_table.data = &dummy; if (write) pr_info_once("writing to auto_msgmni has no effect"); return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); } static int proc_ipc_sem_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ipc_namespace *ns = container_of(table->data, struct ipc_namespace, sem_ctls); int ret, semmni; semmni = ns->sem_ctls[3]; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret) ret = sem_check_semmni(ns); /* * Reset the semmni value if an error happens. */ if (ret) ns->sem_ctls[3] = semmni; return ret; } int ipc_mni = IPCMNI; int ipc_mni_shift = IPCMNI_SHIFT; int ipc_min_cycle = RADIX_TREE_MAP_SIZE; static struct ctl_table ipc_sysctls[] = { { .procname = "shmmax", .data = &init_ipc_ns.shm_ctlmax, .maxlen = sizeof(init_ipc_ns.shm_ctlmax), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "shmall", .data = &init_ipc_ns.shm_ctlall, .maxlen = sizeof(init_ipc_ns.shm_ctlall), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "shmmni", .data = &init_ipc_ns.shm_ctlmni, .maxlen = sizeof(init_ipc_ns.shm_ctlmni), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &ipc_mni, }, { .procname = "shm_rmid_forced", .data = &init_ipc_ns.shm_rmid_forced, .maxlen = sizeof(init_ipc_ns.shm_rmid_forced), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax_orphans, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "msgmax", .data = &init_ipc_ns.msg_ctlmax, .maxlen = sizeof(init_ipc_ns.msg_ctlmax), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, { .procname = "msgmni", .data = &init_ipc_ns.msg_ctlmni, .maxlen = sizeof(init_ipc_ns.msg_ctlmni), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &ipc_mni, }, { .procname = "auto_msgmni", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_ipc_auto_msgmni, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "msgmnb", .data = &init_ipc_ns.msg_ctlmnb, .maxlen = sizeof(init_ipc_ns.msg_ctlmnb), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, { .procname = "sem", .data = &init_ipc_ns.sem_ctls, .maxlen = 4*sizeof(int), .mode = 0644, .proc_handler = proc_ipc_sem_dointvec, }, #ifdef CONFIG_CHECKPOINT_RESTORE { .procname = "sem_next_id", .data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id), .mode = 0444, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, { .procname = "msg_next_id", .data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id), .mode = 0444, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, { .procname = "shm_next_id", .data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id), .mode = 0444, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, #endif }; static struct ctl_table_set *set_lookup(struct ctl_table_root *root) { return &current->nsproxy->ipc_ns->ipc_set; } static int set_is_seen(struct ctl_table_set *set) { return &current->nsproxy->ipc_ns->ipc_set == set; } static void ipc_set_ownership(struct ctl_table_header *head, kuid_t *uid, kgid_t *gid) { struct ipc_namespace *ns = container_of(head->set, struct ipc_namespace, ipc_set); kuid_t ns_root_uid = make_kuid(ns->user_ns, 0); kgid_t ns_root_gid = make_kgid(ns->user_ns, 0); *uid = uid_valid(ns_root_uid) ? ns_root_uid : GLOBAL_ROOT_UID; *gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID; } static int ipc_permissions(struct ctl_table_header *head, const struct ctl_table *table) { int mode = table->mode; #ifdef CONFIG_CHECKPOINT_RESTORE struct ipc_namespace *ns = container_of(head->set, struct ipc_namespace, ipc_set); if (((table->data == &ns->ids[IPC_SEM_IDS].next_id) || (table->data == &ns->ids[IPC_MSG_IDS].next_id) || (table->data == &ns->ids[IPC_SHM_IDS].next_id)) && checkpoint_restore_ns_capable(ns->user_ns)) mode = 0666; else #endif { kuid_t ns_root_uid; kgid_t ns_root_gid; ipc_set_ownership(head, &ns_root_uid, &ns_root_gid); if (uid_eq(current_euid(), ns_root_uid)) mode >>= 6; else if (in_egroup_p(ns_root_gid)) mode >>= 3; } mode &= 7; return (mode << 6) | (mode << 3) | mode; } static struct ctl_table_root set_root = { .lookup = set_lookup, .permissions = ipc_permissions, .set_ownership = ipc_set_ownership, }; bool setup_ipc_sysctls(struct ipc_namespace *ns) { struct ctl_table *tbl; setup_sysctl_set(&ns->ipc_set, &set_root, set_is_seen); tbl = kmemdup(ipc_sysctls, sizeof(ipc_sysctls), GFP_KERNEL); if (tbl) { int i; for (i = 0; i < ARRAY_SIZE(ipc_sysctls); i++) { if (tbl[i].data == &init_ipc_ns.shm_ctlmax) tbl[i].data = &ns->shm_ctlmax; else if (tbl[i].data == &init_ipc_ns.shm_ctlall) tbl[i].data = &ns->shm_ctlall; else if (tbl[i].data == &init_ipc_ns.shm_ctlmni) tbl[i].data = &ns->shm_ctlmni; else if (tbl[i].data == &init_ipc_ns.shm_rmid_forced) tbl[i].data = &ns->shm_rmid_forced; else if (tbl[i].data == &init_ipc_ns.msg_ctlmax) tbl[i].data = &ns->msg_ctlmax; else if (tbl[i].data == &init_ipc_ns.msg_ctlmni) tbl[i].data = &ns->msg_ctlmni; else if (tbl[i].data == &init_ipc_ns.msg_ctlmnb) tbl[i].data = &ns->msg_ctlmnb; else if (tbl[i].data == &init_ipc_ns.sem_ctls) tbl[i].data = &ns->sem_ctls; #ifdef CONFIG_CHECKPOINT_RESTORE else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id) tbl[i].data = &ns->ids[IPC_SEM_IDS].next_id; else if (tbl[i].data == &init_ipc_ns.ids[IPC_MSG_IDS].next_id) tbl[i].data = &ns->ids[IPC_MSG_IDS].next_id; else if (tbl[i].data == &init_ipc_ns.ids[IPC_SHM_IDS].next_id) tbl[i].data = &ns->ids[IPC_SHM_IDS].next_id; #endif else tbl[i].data = NULL; } ns->ipc_sysctls = __register_sysctl_table(&ns->ipc_set, "kernel", tbl, ARRAY_SIZE(ipc_sysctls)); } if (!ns->ipc_sysctls) { kfree(tbl); retire_sysctl_set(&ns->ipc_set); return false; } return true; } void retire_ipc_sysctls(struct ipc_namespace *ns) { const struct ctl_table *tbl; tbl = ns->ipc_sysctls->ctl_table_arg; unregister_sysctl_table(ns->ipc_sysctls); retire_sysctl_set(&ns->ipc_set); kfree(tbl); } static int __init ipc_sysctl_init(void) { if (!setup_ipc_sysctls(&init_ipc_ns)) { pr_warn("ipc sysctl registration failed\n"); return -ENOMEM; } return 0; } device_initcall(ipc_sysctl_init); static int __init ipc_mni_extend(char *str) { ipc_mni = IPCMNI_EXTEND; ipc_mni_shift = IPCMNI_EXTEND_SHIFT; ipc_min_cycle = IPCMNI_EXTEND_MIN_CYCLE; pr_info("IPCMNI extended to %d.\n", ipc_mni); return 0; } early_param("ipcmni_extend", ipc_mni_extend);
4 4 4 1 3 1 2 4 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 // SPDX-License-Identifier: GPL-2.0 /* * xfrm4_input.c * * Changes: * YOSHIFUJI Hideaki @USAGI * Split up af-specific portion * Derek Atkins <derek@ihtfp.com> * Add Encapsulation support * */ #include <linux/slab.h> #include <linux/module.h> #include <linux/string.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/protocol.h> #include <net/gro.h> static int xfrm4_rcv_encap_finish2(struct net *net, struct sock *sk, struct sk_buff *skb) { return dst_input(skb); } static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { if (!skb_dst(skb)) { const struct iphdr *iph = ip_hdr(skb); if (ip_route_input_noref(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), skb->dev)) goto drop; } if (xfrm_trans_queue(skb, xfrm4_rcv_encap_finish2)) goto drop; return 0; drop: kfree_skb(skb); return NET_RX_DROP; } int xfrm4_transport_finish(struct sk_buff *skb, int async) { struct xfrm_offload *xo = xfrm_offload(skb); struct iphdr *iph = ip_hdr(skb); iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; #ifndef CONFIG_NETFILTER if (!async) return -iph->protocol; #endif __skb_push(skb, -skb_network_offset(skb)); iph->tot_len = htons(skb->len); ip_send_check(iph); if (xo && (xo->flags & XFRM_GRO)) { /* The full l2 header needs to be preserved so that re-injecting the packet at l2 * works correctly in the presence of vlan tags. */ skb_mac_header_rebuild_full(skb, xo->orig_mac_len); skb_reset_network_header(skb); skb_reset_transport_header(skb); return 0; } NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, dev_net(skb->dev), NULL, skb, skb->dev, NULL, xfrm4_rcv_encap_finish); return 0; } static int __xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb, bool pull) { struct udp_sock *up = udp_sk(sk); struct udphdr *uh; struct iphdr *iph; int iphlen, len; __u8 *udpdata; __be32 *udpdata32; u16 encap_type; encap_type = READ_ONCE(up->encap_type); /* if this is not encapsulated socket, then just return now */ if (!encap_type) return 1; /* If this is a paged skb, make sure we pull up * whatever data we need to look at. */ len = skb->len - sizeof(struct udphdr); if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8))) return 1; /* Now we can get the pointers */ uh = udp_hdr(skb); udpdata = (__u8 *)uh + sizeof(struct udphdr); udpdata32 = (__be32 *)udpdata; switch (encap_type) { default: case UDP_ENCAP_ESPINUDP: /* Check if this is a keepalive packet. If so, eat it. */ if (len == 1 && udpdata[0] == 0xff) { return -EINVAL; } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) { /* ESP Packet without Non-ESP header */ len = sizeof(struct udphdr); } else /* Must be an IKE packet.. pass it through */ return 1; break; } /* At this point we are sure that this is an ESPinUDP packet, * so we need to remove 'len' bytes from the packet (the UDP * header and optional ESP marker bytes) and then modify the * protocol to ESP, and then call into the transform receiver. */ if (skb_unclone(skb, GFP_ATOMIC)) return -EINVAL; /* Now we can update and verify the packet length... */ iph = ip_hdr(skb); iphlen = iph->ihl << 2; iph->tot_len = htons(ntohs(iph->tot_len) - len); if (skb->len < iphlen + len) { /* packet is too small!?! */ return -EINVAL; } /* pull the data buffer up to the ESP header and set the * transport header to point to ESP. Keep UDP on the stack * for later. */ if (pull) { __skb_pull(skb, len); skb_reset_transport_header(skb); } else { skb_set_transport_header(skb, len); } /* process ESP */ return 0; } /* If it's a keepalive packet, then just eat it. * If it's an encapsulated packet, then pass it to the * IPsec xfrm input. * Returns 0 if skb passed to xfrm or was dropped. * Returns >0 if skb should be passed to UDP. * Returns <0 if skb should be resubmitted (-ret is protocol) */ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) { int ret; ret = __xfrm4_udp_encap_rcv(sk, skb, true); if (!ret) return xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, udp_sk(sk)->encap_type); if (ret < 0) { kfree_skb(skb); return 0; } return ret; } EXPORT_SYMBOL(xfrm4_udp_encap_rcv); struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, struct sk_buff *skb) { int offset = skb_gro_offset(skb); const struct net_offload *ops; struct sk_buff *pp = NULL; int ret; offset = offset - sizeof(struct udphdr); if (!pskb_pull(skb, offset)) return NULL; rcu_read_lock(); ops = rcu_dereference(inet_offloads[IPPROTO_ESP]); if (!ops || !ops->callbacks.gro_receive) goto out; ret = __xfrm4_udp_encap_rcv(sk, skb, false); if (ret) goto out; skb_push(skb, offset); NAPI_GRO_CB(skb)->proto = IPPROTO_UDP; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); rcu_read_unlock(); return pp; out: rcu_read_unlock(); skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 1; return NULL; } EXPORT_SYMBOL(xfrm4_gro_udp_encap_rcv); int xfrm4_rcv(struct sk_buff *skb) { return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0); } EXPORT_SYMBOL(xfrm4_rcv);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2020 Google LLC. */ #ifndef _LINUX_BPF_LSM_H #define _LINUX_BPF_LSM_H #include <linux/sched.h> #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/lsm_hooks.h> #ifdef CONFIG_BPF_LSM #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ RET bpf_lsm_##NAME(__VA_ARGS__); #include <linux/lsm_hook_defs.h> #undef LSM_HOOK struct bpf_storage_blob { struct bpf_local_storage __rcu *storage; }; extern struct lsm_blob_sizes bpf_lsm_blob_sizes; int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); bool bpf_lsm_is_sleepable_hook(u32 btf_id); bool bpf_lsm_is_trusted(const struct bpf_prog *prog); static inline struct bpf_storage_blob *bpf_inode( const struct inode *inode) { if (unlikely(!inode->i_security)) return NULL; return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; } extern const struct bpf_func_proto bpf_inode_storage_get_proto; extern const struct bpf_func_proto bpf_inode_storage_delete_proto; void bpf_inode_storage_free(struct inode *inode); void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func); int bpf_lsm_get_retval_range(const struct bpf_prog *prog, struct bpf_retval_range *range); #else /* !CONFIG_BPF_LSM */ static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) { return false; } static inline bool bpf_lsm_is_trusted(const struct bpf_prog *prog) { return false; } static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { return -EOPNOTSUPP; } static inline struct bpf_storage_blob *bpf_inode( const struct inode *inode) { return NULL; } static inline void bpf_inode_storage_free(struct inode *inode) { } static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func) { } static inline int bpf_lsm_get_retval_range(const struct bpf_prog *prog, struct bpf_retval_range *range) { return -EOPNOTSUPP; } #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */
146 3 3 3 12 12 12 20 1 1 1 16 1 1 1 1 1 1 1 2 1 9 1 4 13 2 12 1 7 7 5 2 7 7 7 881 882 880 175 139 9 9 13 12 13 5 9 13 13 13 13 3 6 2 2 3 6 2 2 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_mirred.c packet mirroring and redirect actions * * Authors: Jamal Hadi Salim (2002-4) * * TODO: Add ingress support (and socket redirect support) */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <linux/gfp.h> #include <linux/if_arp.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/dst.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <linux/tc_act/tc_mirred.h> #include <net/tc_act/tc_mirred.h> #include <net/tc_wrapper.h> static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); #define MIRRED_NEST_LIMIT 4 static DEFINE_PER_CPU(unsigned int, mirred_nest_level); static bool tcf_mirred_is_act_redirect(int action) { return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR; } static bool tcf_mirred_act_wants_ingress(int action) { switch (action) { case TCA_EGRESS_REDIR: case TCA_EGRESS_MIRROR: return false; case TCA_INGRESS_REDIR: case TCA_INGRESS_MIRROR: return true; default: BUG(); } } static bool tcf_mirred_can_reinsert(int action) { switch (action) { case TC_ACT_SHOT: case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: return true; } return false; } static struct net_device *tcf_mirred_dev_dereference(struct tcf_mirred *m) { return rcu_dereference_protected(m->tcfm_dev, lockdep_is_held(&m->tcf_lock)); } static void tcf_mirred_release(struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); struct net_device *dev; spin_lock(&mirred_list_lock); list_del(&m->tcfm_list); spin_unlock(&mirred_list_lock); /* last reference to action, no need to lock */ dev = rcu_dereference_protected(m->tcfm_dev, 1); netdev_put(dev, &m->tcfm_dev_tracker); } static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, [TCA_MIRRED_BLOCKID] = NLA_POLICY_MIN(NLA_U32, 1), }; static struct tc_action_ops act_mirred_ops; static void tcf_mirred_replace_dev(struct tcf_mirred *m, struct net_device *ndev) { struct net_device *odev; odev = rcu_replace_pointer(m->tcfm_dev, ndev, lockdep_is_held(&m->tcf_lock)); netdev_put(odev, &m->tcfm_dev_tracker); } static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_mirred_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_MIRRED_MAX + 1]; struct tcf_chain *goto_ch = NULL; bool mac_header_xmit = false; struct tc_mirred *parm; struct tcf_mirred *m; bool exists = false; int ret, err; u32 index; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed"); return -EINVAL; } ret = nla_parse_nested_deprecated(tb, TCA_MIRRED_MAX, nla, mirred_policy, extack); if (ret < 0) return ret; if (!tb[TCA_MIRRED_PARMS]) { NL_SET_ERR_MSG_MOD(extack, "Missing required mirred parameters"); return -EINVAL; } parm = nla_data(tb[TCA_MIRRED_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return ACT_P_BOUND; if (tb[TCA_MIRRED_BLOCKID] && parm->ifindex) { NL_SET_ERR_MSG_MOD(extack, "Cannot specify Block ID and dev simultaneously"); if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } switch (parm->eaction) { case TCA_EGRESS_MIRROR: case TCA_EGRESS_REDIR: case TCA_INGRESS_REDIR: case TCA_INGRESS_MIRROR: break; default: if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option"); return -EINVAL; } if (!exists) { if (!parm->ifindex && !tb[TCA_MIRRED_BLOCKID]) { tcf_idr_cleanup(tn, index); NL_SET_ERR_MSG_MOD(extack, "Must specify device or block"); return -EINVAL; } ret = tcf_idr_create_from_flags(tn, index, est, a, &act_mirred_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } m = to_mirred(*a); if (ret == ACT_P_CREATED) INIT_LIST_HEAD(&m->tcfm_list); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; spin_lock_bh(&m->tcf_lock); if (parm->ifindex) { struct net_device *ndev; ndev = dev_get_by_index(net, parm->ifindex); if (!ndev) { spin_unlock_bh(&m->tcf_lock); err = -ENODEV; goto put_chain; } mac_header_xmit = dev_is_mac_header_xmit(ndev); tcf_mirred_replace_dev(m, ndev); netdev_tracker_alloc(ndev, &m->tcfm_dev_tracker, GFP_ATOMIC); m->tcfm_mac_header_xmit = mac_header_xmit; m->tcfm_blockid = 0; } else if (tb[TCA_MIRRED_BLOCKID]) { tcf_mirred_replace_dev(m, NULL); m->tcfm_mac_header_xmit = false; m->tcfm_blockid = nla_get_u32(tb[TCA_MIRRED_BLOCKID]); } goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); m->tcfm_eaction = parm->eaction; spin_unlock_bh(&m->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (ret == ACT_P_CREATED) { spin_lock(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); spin_unlock(&mirred_list_lock); } return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static int tcf_mirred_forward(bool at_ingress, bool want_ingress, struct sk_buff *skb) { int err; if (!want_ingress) err = tcf_dev_queue_xmit(skb, dev_queue_xmit); else if (!at_ingress) err = netif_rx(skb); else err = netif_receive_skb(skb); return err; } static int tcf_mirred_to_dev(struct sk_buff *skb, struct tcf_mirred *m, struct net_device *dev, const bool m_mac_header_xmit, int m_eaction, int retval) { struct sk_buff *skb_to_send = skb; bool want_ingress; bool is_redirect; bool expects_nh; bool at_ingress; bool dont_clone; int mac_len; bool at_nh; int err; is_redirect = tcf_mirred_is_act_redirect(m_eaction); if (unlikely(!(dev->flags & IFF_UP)) || !netif_carrier_ok(dev)) { net_notice_ratelimited("tc mirred to Houston: device %s is down\n", dev->name); goto err_cant_do; } /* we could easily avoid the clone only if called by ingress and clsact; * since we can't easily detect the clsact caller, skip clone only for * ingress - that covers the TC S/W datapath. */ at_ingress = skb_at_tc_ingress(skb); dont_clone = skb_at_tc_ingress(skb) && is_redirect && tcf_mirred_can_reinsert(retval); if (!dont_clone) { skb_to_send = skb_clone(skb, GFP_ATOMIC); if (!skb_to_send) goto err_cant_do; } want_ingress = tcf_mirred_act_wants_ingress(m_eaction); /* All mirred/redirected skbs should clear previous ct info */ nf_reset_ct(skb_to_send); if (want_ingress && !at_ingress) /* drop dst for egress -> ingress */ skb_dst_drop(skb_to_send); expects_nh = want_ingress || !m_mac_header_xmit; at_nh = skb->data == skb_network_header(skb); if (at_nh != expects_nh) { mac_len = at_ingress ? skb->mac_len : skb_network_offset(skb); if (expects_nh) { /* target device/action expect data at nh */ skb_pull_rcsum(skb_to_send, mac_len); } else { /* target device/action expect data at mac */ skb_push_rcsum(skb_to_send, mac_len); } } skb_to_send->skb_iif = skb->dev->ifindex; skb_to_send->dev = dev; if (is_redirect) { if (skb == skb_to_send) retval = TC_ACT_CONSUMED; skb_set_redirected(skb_to_send, skb_to_send->tc_at_ingress); err = tcf_mirred_forward(at_ingress, want_ingress, skb_to_send); } else { err = tcf_mirred_forward(at_ingress, want_ingress, skb_to_send); } if (err) tcf_action_inc_overlimit_qstats(&m->common); return retval; err_cant_do: if (is_redirect) retval = TC_ACT_SHOT; tcf_action_inc_overlimit_qstats(&m->common); return retval; } static int tcf_blockcast_redir(struct sk_buff *skb, struct tcf_mirred *m, struct tcf_block *block, int m_eaction, const u32 exception_ifindex, int retval) { struct net_device *dev_prev = NULL; struct net_device *dev = NULL; unsigned long index; int mirred_eaction; mirred_eaction = tcf_mirred_act_wants_ingress(m_eaction) ? TCA_INGRESS_MIRROR : TCA_EGRESS_MIRROR; xa_for_each(&block->ports, index, dev) { if (index == exception_ifindex) continue; if (!dev_prev) goto assign_prev; tcf_mirred_to_dev(skb, m, dev_prev, dev_is_mac_header_xmit(dev), mirred_eaction, retval); assign_prev: dev_prev = dev; } if (dev_prev) return tcf_mirred_to_dev(skb, m, dev_prev, dev_is_mac_header_xmit(dev_prev), m_eaction, retval); return retval; } static int tcf_blockcast_mirror(struct sk_buff *skb, struct tcf_mirred *m, struct tcf_block *block, int m_eaction, const u32 exception_ifindex, int retval) { struct net_device *dev = NULL; unsigned long index; xa_for_each(&block->ports, index, dev) { if (index == exception_ifindex) continue; tcf_mirred_to_dev(skb, m, dev, dev_is_mac_header_xmit(dev), m_eaction, retval); } return retval; } static int tcf_blockcast(struct sk_buff *skb, struct tcf_mirred *m, const u32 blockid, struct tcf_result *res, int retval) { const u32 exception_ifindex = skb->dev->ifindex; struct tcf_block *block; bool is_redirect; int m_eaction; m_eaction = READ_ONCE(m->tcfm_eaction); is_redirect = tcf_mirred_is_act_redirect(m_eaction); /* we are already under rcu protection, so can call block lookup * directly. */ block = tcf_block_lookup(dev_net(skb->dev), blockid); if (!block || xa_empty(&block->ports)) { tcf_action_inc_overlimit_qstats(&m->common); return retval; } if (is_redirect) return tcf_blockcast_redir(skb, m, block, m_eaction, exception_ifindex, retval); /* If it's not redirect, it is mirror */ return tcf_blockcast_mirror(skb, m, block, m_eaction, exception_ifindex, retval); } TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_mirred *m = to_mirred(a); int retval = READ_ONCE(m->tcf_action); unsigned int nest_level; bool m_mac_header_xmit; struct net_device *dev; int m_eaction; u32 blockid; nest_level = __this_cpu_inc_return(mirred_nest_level); if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); retval = TC_ACT_SHOT; goto dec_nest_level; } tcf_lastuse_update(&m->tcf_tm); tcf_action_update_bstats(&m->common, skb); blockid = READ_ONCE(m->tcfm_blockid); if (blockid) { retval = tcf_blockcast(skb, m, blockid, res, retval); goto dec_nest_level; } dev = rcu_dereference_bh(m->tcfm_dev); if (unlikely(!dev)) { pr_notice_once("tc mirred: target device is gone\n"); tcf_action_inc_overlimit_qstats(&m->common); goto dec_nest_level; } m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); m_eaction = READ_ONCE(m->tcfm_eaction); retval = tcf_mirred_to_dev(skb, m, dev, m_mac_header_xmit, m_eaction, retval); dec_nest_level: __this_cpu_dec(mirred_nest_level); return retval; } static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_mirred *m = to_mirred(a); struct tcf_t *tm = &m->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_mirred *m = to_mirred(a); struct tc_mirred opt = { .index = m->tcf_index, .refcnt = refcount_read(&m->tcf_refcnt) - ref, .bindcnt = atomic_read(&m->tcf_bindcnt) - bind, }; struct net_device *dev; struct tcf_t t; u32 blockid; spin_lock_bh(&m->tcf_lock); opt.action = m->tcf_action; opt.eaction = m->tcfm_eaction; dev = tcf_mirred_dev_dereference(m); if (dev) opt.ifindex = dev->ifindex; if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt)) goto nla_put_failure; blockid = m->tcfm_blockid; if (blockid && nla_put_u32(skb, TCA_MIRRED_BLOCKID, blockid)) goto nla_put_failure; tcf_tm_dump(&t, &m->tcf_tm); if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD)) goto nla_put_failure; spin_unlock_bh(&m->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&m->tcf_lock); nlmsg_trim(skb, b); return -1; } static int mirred_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tcf_mirred *m; ASSERT_RTNL(); if (event == NETDEV_UNREGISTER) { spin_lock(&mirred_list_lock); list_for_each_entry(m, &mirred_list, tcfm_list) { spin_lock_bh(&m->tcf_lock); if (tcf_mirred_dev_dereference(m) == dev) { netdev_put(dev, &m->tcfm_dev_tracker); /* Note : no rcu grace period necessary, as * net_device are already rcu protected. */ RCU_INIT_POINTER(m->tcfm_dev, NULL); } spin_unlock_bh(&m->tcf_lock); } spin_unlock(&mirred_list_lock); } return NOTIFY_DONE; } static struct notifier_block mirred_device_notifier = { .notifier_call = mirred_device_event, }; static void tcf_mirred_dev_put(void *priv) { struct net_device *dev = priv; dev_put(dev); } static struct net_device * tcf_mirred_get_dev(const struct tc_action *a, tc_action_priv_destructor *destructor) { struct tcf_mirred *m = to_mirred(a); struct net_device *dev; rcu_read_lock(); dev = rcu_dereference(m->tcfm_dev); if (dev) { dev_hold(dev); *destructor = tcf_mirred_dev_put; } rcu_read_unlock(); return dev; } static size_t tcf_mirred_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_mirred)); } static void tcf_offload_mirred_get_dev(struct flow_action_entry *entry, const struct tc_action *act) { entry->dev = act->ops->get_dev(act, &entry->destructor); if (!entry->dev) return; entry->destructor_priv = entry->dev; } static int tcf_mirred_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; if (is_tcf_mirred_egress_redirect(act)) { entry->id = FLOW_ACTION_REDIRECT; tcf_offload_mirred_get_dev(entry, act); } else if (is_tcf_mirred_egress_mirror(act)) { entry->id = FLOW_ACTION_MIRRED; tcf_offload_mirred_get_dev(entry, act); } else if (is_tcf_mirred_ingress_redirect(act)) { entry->id = FLOW_ACTION_REDIRECT_INGRESS; tcf_offload_mirred_get_dev(entry, act); } else if (is_tcf_mirred_ingress_mirror(act)) { entry->id = FLOW_ACTION_MIRRED_INGRESS; tcf_offload_mirred_get_dev(entry, act); } else { NL_SET_ERR_MSG_MOD(extack, "Unsupported mirred offload"); return -EOPNOTSUPP; } *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; if (is_tcf_mirred_egress_redirect(act)) fl_action->id = FLOW_ACTION_REDIRECT; else if (is_tcf_mirred_egress_mirror(act)) fl_action->id = FLOW_ACTION_MIRRED; else if (is_tcf_mirred_ingress_redirect(act)) fl_action->id = FLOW_ACTION_REDIRECT_INGRESS; else if (is_tcf_mirred_ingress_mirror(act)) fl_action->id = FLOW_ACTION_MIRRED_INGRESS; else return -EOPNOTSUPP; } return 0; } static struct tc_action_ops act_mirred_ops = { .kind = "mirred", .id = TCA_ID_MIRRED, .owner = THIS_MODULE, .act = tcf_mirred_act, .stats_update = tcf_stats_update, .dump = tcf_mirred_dump, .cleanup = tcf_mirred_release, .init = tcf_mirred_init, .get_fill_size = tcf_mirred_get_fill_size, .offload_act_setup = tcf_mirred_offload_act_setup, .size = sizeof(struct tcf_mirred), .get_dev = tcf_mirred_get_dev, }; MODULE_ALIAS_NET_ACT("mirred"); static __net_init int mirred_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_mirred_ops.net_id); return tc_action_net_init(net, tn, &act_mirred_ops); } static void __net_exit mirred_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_mirred_ops.net_id); } static struct pernet_operations mirred_net_ops = { .init = mirred_init_net, .exit_batch = mirred_exit_net, .id = &act_mirred_ops.net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002)"); MODULE_DESCRIPTION("Device Mirror/redirect actions"); MODULE_LICENSE("GPL"); static int __init mirred_init_module(void) { int err = register_netdevice_notifier(&mirred_device_notifier); if (err) return err; pr_info("Mirror/redirect action on\n"); err = tcf_register_action(&act_mirred_ops, &mirred_net_ops); if (err) unregister_netdevice_notifier(&mirred_device_notifier); return err; } static void __exit mirred_cleanup_module(void) { tcf_unregister_action(&act_mirred_ops, &mirred_net_ops); unregister_netdevice_notifier(&mirred_device_notifier); } module_init(mirred_init_module); module_exit(mirred_cleanup_module);
11 21 31 2 2 1 84 31 316 29 14 711 446 1047 1052 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_NF_TABLES_H #define _NET_NF_TABLES_H #include <linux/unaligned.h> #include <linux/list.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/nf_tables.h> #include <linux/u64_stats_sync.h> #include <linux/rhashtable.h> #include <net/netfilter/nf_flow_table.h> #include <net/netlink.h> #include <net/flow_offload.h> #include <net/netns/generic.h> #define NFT_MAX_HOOKS (NF_INET_INGRESS + 1) struct module; #define NFT_JUMP_STACK_SIZE 16 enum { NFT_PKTINFO_L4PROTO = (1 << 0), NFT_PKTINFO_INNER = (1 << 1), NFT_PKTINFO_INNER_FULL = (1 << 2), }; struct nft_pktinfo { struct sk_buff *skb; const struct nf_hook_state *state; u8 flags; u8 tprot; u16 fragoff; u16 thoff; u16 inneroff; }; static inline struct sock *nft_sk(const struct nft_pktinfo *pkt) { return pkt->state->sk; } static inline unsigned int nft_thoff(const struct nft_pktinfo *pkt) { return pkt->thoff; } static inline struct net *nft_net(const struct nft_pktinfo *pkt) { return pkt->state->net; } static inline unsigned int nft_hook(const struct nft_pktinfo *pkt) { return pkt->state->hook; } static inline u8 nft_pf(const struct nft_pktinfo *pkt) { return pkt->state->pf; } static inline const struct net_device *nft_in(const struct nft_pktinfo *pkt) { return pkt->state->in; } static inline const struct net_device *nft_out(const struct nft_pktinfo *pkt) { return pkt->state->out; } static inline void nft_set_pktinfo(struct nft_pktinfo *pkt, struct sk_buff *skb, const struct nf_hook_state *state) { pkt->skb = skb; pkt->state = state; } static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt) { pkt->flags = 0; pkt->tprot = 0; pkt->thoff = 0; pkt->fragoff = 0; } /** * struct nft_verdict - nf_tables verdict * * @code: nf_tables/netfilter verdict code * @chain: destination chain for NFT_JUMP/NFT_GOTO */ struct nft_verdict { u32 code; struct nft_chain *chain; }; struct nft_data { union { u32 data[4]; struct nft_verdict verdict; }; } __attribute__((aligned(__alignof__(u64)))); #define NFT_REG32_NUM 20 /** * struct nft_regs - nf_tables register set * * @data: data registers * @verdict: verdict register * * The first four data registers alias to the verdict register. */ struct nft_regs { union { u32 data[NFT_REG32_NUM]; struct nft_verdict verdict; }; }; struct nft_regs_track { struct { const struct nft_expr *selector; const struct nft_expr *bitwise; u8 num_reg; } regs[NFT_REG32_NUM]; const struct nft_expr *cur; const struct nft_expr *last; }; /* Store/load an u8, u16 or u64 integer to/from the u32 data register. * * Note, when using concatenations, register allocation happens at 32-bit * level. So for store instruction, pad the rest part with zero to avoid * garbage values. */ static inline void nft_reg_store8(u32 *dreg, u8 val) { *dreg = 0; *(u8 *)dreg = val; } static inline u8 nft_reg_load8(const u32 *sreg) { return *(u8 *)sreg; } static inline void nft_reg_store16(u32 *dreg, u16 val) { *dreg = 0; *(u16 *)dreg = val; } static inline void nft_reg_store_be16(u32 *dreg, __be16 val) { nft_reg_store16(dreg, (__force __u16)val); } static inline u16 nft_reg_load16(const u32 *sreg) { return *(u16 *)sreg; } static inline __be16 nft_reg_load_be16(const u32 *sreg) { return (__force __be16)nft_reg_load16(sreg); } static inline __be32 nft_reg_load_be32(const u32 *sreg) { return *(__force __be32 *)sreg; } static inline void nft_reg_store64(u64 *dreg, u64 val) { put_unaligned(val, dreg); } static inline u64 nft_reg_load64(const u32 *sreg) { return get_unaligned((u64 *)sreg); } static inline void nft_data_copy(u32 *dst, const struct nft_data *src, unsigned int len) { if (len % NFT_REG32_SIZE) dst[len / NFT_REG32_SIZE] = 0; memcpy(dst, src, len); } /** * struct nft_ctx - nf_tables rule/set context * * @net: net namespace * @table: the table the chain is contained in * @chain: the chain the rule is contained in * @nla: netlink attributes * @portid: netlink portID of the original message * @seq: netlink sequence number * @flags: modifiers to new request * @family: protocol family * @level: depth of the chains * @report: notify via unicast netlink message * @reg_inited: bitmap of initialised registers */ struct nft_ctx { struct net *net; struct nft_table *table; struct nft_chain *chain; const struct nlattr * const *nla; u32 portid; u32 seq; u16 flags; u8 family; u8 level; bool report; DECLARE_BITMAP(reg_inited, NFT_REG32_NUM); }; enum nft_data_desc_flags { NFT_DATA_DESC_SETELEM = (1 << 0), }; struct nft_data_desc { enum nft_data_types type; unsigned int size; unsigned int len; unsigned int flags; }; int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla); void nft_data_hold(const struct nft_data *data, enum nft_data_types type); void nft_data_release(const struct nft_data *data, enum nft_data_types type); int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, enum nft_data_types type, unsigned int len); static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg) { return reg == NFT_REG_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE; } static inline enum nft_registers nft_type_to_reg(enum nft_data_types type) { return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE; } int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest); int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg); int nft_parse_register_load(const struct nft_ctx *ctx, const struct nlattr *attr, u8 *sreg, u32 len); int nft_parse_register_store(const struct nft_ctx *ctx, const struct nlattr *attr, u8 *dreg, const struct nft_data *data, enum nft_data_types type, unsigned int len); /** * struct nft_userdata - user defined data associated with an object * * @len: length of the data * @data: content * * The presence of user data is indicated in an object specific fashion, * so a length of zero can't occur and the value "len" indicates data * of length len + 1. */ struct nft_userdata { u8 len; unsigned char data[]; }; /* placeholder structure for opaque set element backend representation. */ struct nft_elem_priv { }; /** * struct nft_set_elem - generic representation of set elements * * @key: element key * @key_end: closing element key * @data: element data * @priv: element private data and extensions */ struct nft_set_elem { union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key; union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key_end; union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } data; struct nft_elem_priv *priv; }; static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv) { return (void *)priv; } /** * enum nft_iter_type - nftables set iterator type * * @NFT_ITER_UNSPEC: unspecified, to catch errors * @NFT_ITER_READ: read-only iteration over set elements * @NFT_ITER_UPDATE: iteration under mutex to update set element state */ enum nft_iter_type { NFT_ITER_UNSPEC, NFT_ITER_READ, NFT_ITER_UPDATE, }; struct nft_set; struct nft_set_iter { u8 genmask; enum nft_iter_type type:8; unsigned int count; unsigned int skip; int err; int (*fn)(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv); }; /** * struct nft_set_desc - description of set elements * * @ktype: key type * @klen: key length * @dtype: data type * @dlen: data length * @objtype: object type * @size: number of set elements * @policy: set policy * @gc_int: garbage collector interval * @timeout: element timeout * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @expr: set must support for expressions */ struct nft_set_desc { u32 ktype; unsigned int klen; u32 dtype; unsigned int dlen; u32 objtype; unsigned int size; u32 policy; u32 gc_int; u64 timeout; u8 field_len[NFT_REG32_COUNT]; u8 field_count; bool expr; }; /** * enum nft_set_class - performance class * * @NFT_SET_CLASS_O_1: constant, O(1) * @NFT_SET_CLASS_O_LOG_N: logarithmic, O(log N) * @NFT_SET_CLASS_O_N: linear, O(N) */ enum nft_set_class { NFT_SET_CLASS_O_1, NFT_SET_CLASS_O_LOG_N, NFT_SET_CLASS_O_N, }; /** * struct nft_set_estimate - estimation of memory and performance * characteristics * * @size: required memory * @lookup: lookup performance class * @space: memory class */ struct nft_set_estimate { u64 size; enum nft_set_class lookup; enum nft_set_class space; }; #define NFT_EXPR_MAXATTR 16 #define NFT_EXPR_SIZE(size) (sizeof(struct nft_expr) + \ ALIGN(size, __alignof__(struct nft_expr))) /** * struct nft_expr - nf_tables expression * * @ops: expression ops * @data: expression private data */ struct nft_expr { const struct nft_expr_ops *ops; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline void *nft_expr_priv(const struct nft_expr *expr) { return (void *)expr->data; } struct nft_expr_info; int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, struct nft_expr_info *info); int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr, bool reset); bool nft_expr_reduce_bitwise(struct nft_regs_track *track, const struct nft_expr *expr); struct nft_set_ext; /** * struct nft_set_ops - nf_tables set operations * * @lookup: look up an element within the set * @update: update an element if exists, add it if doesn't exist * @delete: delete an element * @insert: insert new element into set * @activate: activate new element in the next generation * @deactivate: lookup for element and deactivate it in the next generation * @flush: deactivate element in the next generation * @remove: remove element from set * @walk: iterate over all set elements * @get: get set elements * @commit: commit set elements * @abort: abort set elements * @privsize: function to return size of set private data * @estimate: estimate the required memory size and the lookup complexity class * @init: initialize private data of new set instance * @destroy: destroy private data of set instance * @gc_init: initialize garbage collection * @elemsize: element private size * * Operations lookup, update and delete have simpler interfaces, are faster * and currently only used in the packet path. All the rest are slower, * control plane functions. */ struct nft_set_ops { bool (*lookup)(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext); bool (*update)(struct nft_set *set, const u32 *key, struct nft_elem_priv * (*new)(struct nft_set *, const struct nft_expr *, struct nft_regs *), const struct nft_expr *expr, struct nft_regs *regs, const struct nft_set_ext **ext); bool (*delete)(const struct nft_set *set, const u32 *key); int (*insert)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_elem_priv **priv); void (*activate)(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv); struct nft_elem_priv * (*deactivate)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); void (*flush)(const struct net *net, const struct nft_set *set, struct nft_elem_priv *priv); void (*remove)(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv); void (*walk)(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter); struct nft_elem_priv * (*get)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags); void (*commit)(struct nft_set *set); void (*abort)(const struct nft_set *set); u64 (*privsize)(const struct nlattr * const nla[], const struct nft_set_desc *desc); bool (*estimate)(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est); int (*init)(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const nla[]); void (*destroy)(const struct nft_ctx *ctx, const struct nft_set *set); void (*gc_init)(const struct nft_set *set); unsigned int elemsize; }; /** * struct nft_set_type - nf_tables set type * * @ops: set ops for this type * @features: features supported by the implementation */ struct nft_set_type { const struct nft_set_ops ops; u32 features; }; #define to_set_type(o) container_of(o, struct nft_set_type, ops) struct nft_set_elem_expr { u8 size; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; #define nft_setelem_expr_at(__elem_expr, __offset) \ ((struct nft_expr *)&__elem_expr->data[__offset]) #define nft_setelem_expr_foreach(__expr, __elem_expr, __size) \ for (__expr = nft_setelem_expr_at(__elem_expr, 0), __size = 0; \ __size < (__elem_expr)->size; \ __size += (__expr)->ops->size, __expr = ((void *)(__expr)) + (__expr)->ops->size) #define NFT_SET_EXPR_MAX 2 /** * struct nft_set - nf_tables set instance * * @list: table set list node * @bindings: list of set bindings * @refs: internal refcounting for async set destruction * @table: table this set belongs to * @net: netnamespace this set belongs to * @name: name of the set * @handle: unique handle of the set * @ktype: key type (numeric type defined by userspace, not used in the kernel) * @dtype: data type (verdict or numeric type defined by userspace) * @objtype: object type (see NFT_OBJECT_* definitions) * @size: maximum set size * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @use: number of rules references to this set * @nelems: number of elements * @ndeact: number of deactivated elements queued for removal * @timeout: default timeout value in jiffies * @gc_int: garbage collection interval in msecs * @policy: set parameterization (see enum nft_set_policies) * @udlen: user data length * @udata: user data * @pending_update: list of pending update set element * @ops: set ops * @flags: set flags * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length * @num_exprs: numbers of exprs * @exprs: stateful expression * @catchall_list: list of catch-all set element * @data: private set data */ struct nft_set { struct list_head list; struct list_head bindings; refcount_t refs; struct nft_table *table; possible_net_t net; char *name; u64 handle; u32 ktype; u32 dtype; u32 objtype; u32 size; u8 field_len[NFT_REG32_COUNT]; u8 field_count; u32 use; atomic_t nelems; u32 ndeact; u64 timeout; u32 gc_int; u16 policy; u16 udlen; unsigned char *udata; struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; u16 flags:13, dead:1, genmask:2; u8 klen; u8 dlen; u8 num_exprs; struct nft_expr *exprs[NFT_SET_EXPR_MAX]; struct list_head catchall_list; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline bool nft_set_is_anonymous(const struct nft_set *set) { return set->flags & NFT_SET_ANONYMOUS; } static inline void *nft_set_priv(const struct nft_set *set) { return (void *)set->data; } static inline enum nft_data_types nft_set_datatype(const struct nft_set *set) { return set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE; } static inline bool nft_set_gc_is_pending(const struct nft_set *s) { return refcount_read(&s->refs) != 1; } static inline struct nft_set *nft_set_container_of(const void *priv) { return (void *)priv - offsetof(struct nft_set, data); } struct nft_set *nft_set_lookup_global(const struct net *net, const struct nft_table *table, const struct nlattr *nla_set_name, const struct nlattr *nla_set_id, u8 genmask); struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, const struct nft_set *set); static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { u32 gc_int = READ_ONCE(set->gc_int); return gc_int ? msecs_to_jiffies(gc_int) : HZ; } /** * struct nft_set_binding - nf_tables set binding * * @list: set bindings list node * @chain: chain containing the rule bound to the set * @flags: set action flags * * A set binding contains all information necessary for validation * of new elements added to a bound set. */ struct nft_set_binding { struct list_head list; const struct nft_chain *chain; u32 flags; }; enum nft_trans_phase; void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set); void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, enum nft_trans_phase phase); int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding); void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); /** * enum nft_set_extensions - set extension type IDs * * @NFT_SET_EXT_KEY: element key * @NFT_SET_EXT_KEY_END: upper bound element key, for ranges * @NFT_SET_EXT_DATA: mapping data * @NFT_SET_EXT_FLAGS: element flags * @NFT_SET_EXT_TIMEOUT: element timeout * @NFT_SET_EXT_USERDATA: user data associated with the element * @NFT_SET_EXT_EXPRESSIONS: expressions associated with the element * @NFT_SET_EXT_OBJREF: stateful object reference associated with element * @NFT_SET_EXT_NUM: number of extension types */ enum nft_set_extensions { NFT_SET_EXT_KEY, NFT_SET_EXT_KEY_END, NFT_SET_EXT_DATA, NFT_SET_EXT_FLAGS, NFT_SET_EXT_TIMEOUT, NFT_SET_EXT_USERDATA, NFT_SET_EXT_EXPRESSIONS, NFT_SET_EXT_OBJREF, NFT_SET_EXT_NUM }; /** * struct nft_set_ext_type - set extension type * * @len: fixed part length of the extension * @align: alignment requirements of the extension */ struct nft_set_ext_type { u8 len; u8 align; }; extern const struct nft_set_ext_type nft_set_ext_types[]; /** * struct nft_set_ext_tmpl - set extension template * * @len: length of extension area * @offset: offsets of individual extension types * @ext_len: length of the expected extension(used to sanity check) */ struct nft_set_ext_tmpl { u16 len; u8 offset[NFT_SET_EXT_NUM]; u8 ext_len[NFT_SET_EXT_NUM]; }; /** * struct nft_set_ext - set extensions * * @genmask: generation mask * @offset: offsets of individual extension types * @data: beginning of extension data */ struct nft_set_ext { u8 genmask; u8 offset[NFT_SET_EXT_NUM]; char data[]; }; static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) { memset(tmpl, 0, sizeof(*tmpl)); tmpl->len = sizeof(struct nft_set_ext); } static inline int nft_set_ext_add_length(struct nft_set_ext_tmpl *tmpl, u8 id, unsigned int len) { tmpl->len = ALIGN(tmpl->len, nft_set_ext_types[id].align); if (tmpl->len > U8_MAX) return -EINVAL; tmpl->offset[id] = tmpl->len; tmpl->ext_len[id] = nft_set_ext_types[id].len + len; tmpl->len += tmpl->ext_len[id]; return 0; } static inline int nft_set_ext_add(struct nft_set_ext_tmpl *tmpl, u8 id) { return nft_set_ext_add_length(tmpl, id, 0); } static inline void nft_set_ext_init(struct nft_set_ext *ext, const struct nft_set_ext_tmpl *tmpl) { memcpy(ext->offset, tmpl->offset, sizeof(ext->offset)); } static inline bool __nft_set_ext_exists(const struct nft_set_ext *ext, u8 id) { return !!ext->offset[id]; } static inline bool nft_set_ext_exists(const struct nft_set_ext *ext, u8 id) { return ext && __nft_set_ext_exists(ext, id); } static inline void *nft_set_ext(const struct nft_set_ext *ext, u8 id) { return (void *)ext + ext->offset[id]; } static inline struct nft_data *nft_set_ext_key(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_KEY); } static inline struct nft_data *nft_set_ext_key_end(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_KEY_END); } static inline struct nft_data *nft_set_ext_data(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_DATA); } static inline u8 *nft_set_ext_flags(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_FLAGS); } struct nft_timeout { u64 timeout; u64 expiration; }; static inline struct nft_timeout *nft_set_ext_timeout(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT); } static inline struct nft_userdata *nft_set_ext_userdata(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_USERDATA); } static inline struct nft_set_elem_expr *nft_set_ext_expr(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_EXPRESSIONS); } static inline bool __nft_set_elem_expired(const struct nft_set_ext *ext, u64 tstamp) { if (!nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) || READ_ONCE(nft_set_ext_timeout(ext)->timeout) == 0) return false; return time_after_eq64(tstamp, READ_ONCE(nft_set_ext_timeout(ext)->expiration)); } static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) { return __nft_set_elem_expired(ext, get_jiffies_64()); } static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, const struct nft_elem_priv *elem_priv) { return (void *)elem_priv + set->ops->elemsize; } static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_OBJREF); } struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set, const struct nlattr *attr); struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *key_end, const u32 *data, u64 timeout, u64 expiration, gfp_t gfp); int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]); void nft_set_elem_destroy(const struct nft_set *set, const struct nft_elem_priv *elem_priv, bool destroy_expr); void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, const struct nft_set *set, const struct nft_elem_priv *elem_priv); struct nft_expr_ops; /** * struct nft_expr_type - nf_tables expression type * * @select_ops: function to select nft_expr_ops * @release_ops: release nft_expr_ops * @ops: default ops, used when no select_ops functions is present * @inner_ops: inner ops, used for inner packet operation * @list: used internally * @name: Identifier * @owner: module reference * @policy: netlink attribute policy * @maxattr: highest netlink attribute number * @family: address family for AF-specific types * @flags: expression type flags */ struct nft_expr_type { const struct nft_expr_ops *(*select_ops)(const struct nft_ctx *, const struct nlattr * const tb[]); void (*release_ops)(const struct nft_expr_ops *ops); const struct nft_expr_ops *ops; const struct nft_expr_ops *inner_ops; struct list_head list; const char *name; struct module *owner; const struct nla_policy *policy; unsigned int maxattr; u8 family; u8 flags; }; #define NFT_EXPR_STATEFUL 0x1 #define NFT_EXPR_GC 0x2 enum nft_trans_phase { NFT_TRANS_PREPARE, NFT_TRANS_PREPARE_ERROR, NFT_TRANS_ABORT, NFT_TRANS_COMMIT, NFT_TRANS_RELEASE }; struct nft_flow_rule; struct nft_offload_ctx; /** * struct nft_expr_ops - nf_tables expression operations * * @eval: Expression evaluation function * @clone: Expression clone function * @size: full expression size, including private data size * @init: initialization function * @activate: activate expression in the next generation * @deactivate: deactivate expression in next generation * @destroy: destruction function, called after synchronize_rcu * @destroy_clone: destruction clone function * @dump: function to dump parameters * @validate: validate expression, called during loop detection * @reduce: reduce expression * @gc: garbage collection expression * @offload: hardware offload expression * @offload_action: function to report true/false to allocate one slot or not in the flow * offload array * @offload_stats: function to synchronize hardware stats via updating the counter expression * @type: expression type * @data: extra data to attach to this expression operation */ struct nft_expr_ops { void (*eval)(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); int (*clone)(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp); unsigned int size; int (*init)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); void (*activate)(const struct nft_ctx *ctx, const struct nft_expr *expr); void (*deactivate)(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase); void (*destroy)(const struct nft_ctx *ctx, const struct nft_expr *expr); void (*destroy_clone)(const struct nft_ctx *ctx, const struct nft_expr *expr); int (*dump)(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int (*validate)(const struct nft_ctx *ctx, const struct nft_expr *expr); bool (*reduce)(struct nft_regs_track *track, const struct nft_expr *expr); bool (*gc)(struct net *net, const struct nft_expr *expr); int (*offload)(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr); bool (*offload_action)(const struct nft_expr *expr); void (*offload_stats)(struct nft_expr *expr, const struct flow_stats *stats); const struct nft_expr_type *type; void *data; }; /** * struct nft_rule - nf_tables rule * * @list: used internally * @handle: rule handle * @genmask: generation mask * @dlen: length of expression data * @udata: user data is appended to the rule * @data: expression data */ struct nft_rule { struct list_head list; u64 handle:42, genmask:2, dlen:12, udata:1; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; static inline struct nft_expr *nft_expr_first(const struct nft_rule *rule) { return (struct nft_expr *)&rule->data[0]; } static inline struct nft_expr *nft_expr_next(const struct nft_expr *expr) { return ((void *)expr) + expr->ops->size; } static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule) { return (struct nft_expr *)&rule->data[rule->dlen]; } static inline bool nft_expr_more(const struct nft_rule *rule, const struct nft_expr *expr) { return expr != nft_expr_last(rule) && expr->ops; } static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) { return (void *)&rule->data[rule->dlen]; } void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule); void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, enum nft_trans_phase phase); void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule); static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_set_elem_expr *elem_expr; struct nft_expr *expr; u32 size; if (__nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) { elem_expr = nft_set_ext_expr(ext); nft_setelem_expr_foreach(expr, elem_expr, size) { expr->ops->eval(expr, regs, pkt); if (regs->verdict.code == NFT_BREAK) return; } } } /* * The last pointer isn't really necessary, but the compiler isn't able to * determine that the result of nft_expr_last() is always the same since it * can't assume that the dlen value wasn't changed within calls in the loop. */ #define nft_rule_for_each_expr(expr, last, rule) \ for ((expr) = nft_expr_first(rule), (last) = nft_expr_last(rule); \ (expr) != (last); \ (expr) = nft_expr_next(expr)) #define NFT_CHAIN_POLICY_UNSET U8_MAX struct nft_rule_dp { u64 is_last:1, dlen:12, handle:42; /* for tracing */ unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; struct nft_rule_dp_last { struct nft_rule_dp end; /* end of nft_rule_blob marker */ struct rcu_head h; /* call_rcu head */ struct nft_rule_blob *blob; /* ptr to free via call_rcu */ const struct nft_chain *chain; /* for nftables tracing */ }; static inline const struct nft_rule_dp *nft_rule_next(const struct nft_rule_dp *rule) { return (void *)rule + sizeof(*rule) + rule->dlen; } struct nft_rule_blob { unsigned long size; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_rule_dp)))); }; /** * struct nft_chain - nf_tables chain * * @blob_gen_0: rule blob pointer to the current generation * @blob_gen_1: rule blob pointer to the future generation * @rules: list of rules in the chain * @list: used internally * @rhlhead: used internally * @table: table that this chain belongs to * @handle: chain handle * @use: number of jump references to this chain * @flags: bitmask of enum NFTA_CHAIN_FLAGS * @bound: bind or not * @genmask: generation mask * @name: name of the chain * @udlen: user data length * @udata: user data in the chain * @blob_next: rule blob pointer to the next in the chain */ struct nft_chain { struct nft_rule_blob __rcu *blob_gen_0; struct nft_rule_blob __rcu *blob_gen_1; struct list_head rules; struct list_head list; struct rhlist_head rhlhead; struct nft_table *table; u64 handle; u32 use; u8 flags:5, bound:1, genmask:2; char *name; u16 udlen; u8 *udata; /* Only used during control plane commit phase: */ struct nft_rule_blob *blob_next; }; int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain); int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv); int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, NFT_CHAIN_T_ROUTE, NFT_CHAIN_T_NAT, NFT_CHAIN_T_MAX }; /** * struct nft_chain_type - nf_tables chain type info * * @name: name of the type * @type: numeric identifier * @family: address family * @owner: module owner * @hook_mask: mask of valid hooks * @hooks: array of hook functions * @ops_register: base chain register function * @ops_unregister: base chain unregister function */ struct nft_chain_type { const char *name; enum nft_chain_types type; int family; struct module *owner; unsigned int hook_mask; nf_hookfn *hooks[NFT_MAX_HOOKS]; int (*ops_register)(struct net *net, const struct nf_hook_ops *ops); void (*ops_unregister)(struct net *net, const struct nf_hook_ops *ops); }; int nft_chain_validate_dependency(const struct nft_chain *chain, enum nft_chain_types type); int nft_chain_validate_hooks(const struct nft_chain *chain, unsigned int hook_flags); static inline bool nft_chain_binding(const struct nft_chain *chain) { return chain->flags & NFT_CHAIN_BINDING; } static inline bool nft_chain_is_bound(struct nft_chain *chain) { return (chain->flags & NFT_CHAIN_BINDING) && chain->bound; } int nft_chain_add(struct nft_table *table, struct nft_chain *chain); void nft_chain_del(struct nft_chain *chain); void nf_tables_chain_destroy(struct nft_chain *chain); struct nft_stats { u64 bytes; u64 pkts; struct u64_stats_sync syncp; }; struct nft_hook { struct list_head list; struct nf_hook_ops ops; struct rcu_head rcu; }; /** * struct nft_base_chain - nf_tables base chain * * @ops: netfilter hook ops * @hook_list: list of netfilter hooks (for NFPROTO_NETDEV family) * @type: chain type * @policy: default policy * @flags: indicate the base chain disabled or not * @stats: per-cpu chain stats * @chain: the chain * @flow_block: flow block (for hardware offload) */ struct nft_base_chain { struct nf_hook_ops ops; struct list_head hook_list; const struct nft_chain_type *type; u8 policy; u8 flags; struct nft_stats __percpu *stats; struct nft_chain chain; struct flow_block flow_block; }; static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chain) { return container_of(chain, struct nft_base_chain, chain); } static inline bool nft_is_base_chain(const struct nft_chain *chain) { return chain->flags & NFT_CHAIN_BASE; } int __nft_release_basechain(struct nft_ctx *ctx); unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); static inline bool nft_use_inc(u32 *use) { if (*use == UINT_MAX) return false; (*use)++; return true; } static inline void nft_use_dec(u32 *use) { WARN_ON_ONCE((*use)-- == 0); } /* For error and abort path: restore use counter to previous state. */ static inline void nft_use_inc_restore(u32 *use) { WARN_ON_ONCE(!nft_use_inc(use)); } #define nft_use_dec_restore nft_use_dec /** * struct nft_table - nf_tables table * * @list: used internally * @chains_ht: chains in the table * @chains: same, for stable walks * @sets: sets in the table * @objects: stateful objects in the table * @flowtables: flow tables in the table * @hgenerator: handle generator state * @handle: table handle * @use: number of chain references to this table * @family:address family * @flags: table flag (see enum nft_table_flags) * @genmask: generation mask * @nlpid: netlink port ID * @name: name of the table * @udlen: length of the user data * @udata: user data * @validate_state: internal, set when transaction adds jumps */ struct nft_table { struct list_head list; struct rhltable chains_ht; struct list_head chains; struct list_head sets; struct list_head objects; struct list_head flowtables; u64 hgenerator; u64 handle; u32 use; u16 family:6, flags:8, genmask:2; u32 nlpid; char *name; u16 udlen; u8 *udata; u8 validate_state; }; static inline bool nft_table_has_owner(const struct nft_table *table) { return table->flags & NFT_TABLE_F_OWNER; } static inline bool nft_table_is_orphan(const struct nft_table *table) { return (table->flags & (NFT_TABLE_F_OWNER | NFT_TABLE_F_PERSIST)) == NFT_TABLE_F_PERSIST; } static inline bool nft_base_chain_netdev(int family, u32 hooknum) { return family == NFPROTO_NETDEV || (family == NFPROTO_INET && hooknum == NF_INET_INGRESS); } void nft_register_chain_type(const struct nft_chain_type *); void nft_unregister_chain_type(const struct nft_chain_type *); int nft_register_expr(struct nft_expr_type *); void nft_unregister_expr(struct nft_expr_type *); int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v); /** * struct nft_object_hash_key - key to lookup nft_object * * @name: name of the stateful object to look up * @table: table the object belongs to */ struct nft_object_hash_key { const char *name; const struct nft_table *table; }; /** * struct nft_object - nf_tables stateful object * * @list: table stateful object list node * @rhlhead: nft_objname_ht node * @key: keys that identify this object * @genmask: generation mask * @use: number of references to this stateful object * @handle: unique object handle * @udlen: length of user data * @udata: user data * @ops: object operations * @data: object data, layout depends on type */ struct nft_object { struct list_head list; struct rhlist_head rhlhead; struct nft_object_hash_key key; u32 genmask:2; u32 use; u64 handle; u16 udlen; u8 *udata; /* runtime data below here */ const struct nft_object_ops *ops ____cacheline_aligned; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline void *nft_obj_data(const struct nft_object *obj) { return (void *)obj->data; } #define nft_expr_obj(expr) *((struct nft_object **)nft_expr_priv(expr)) struct nft_object *nft_obj_lookup(const struct net *net, const struct nft_table *table, const struct nlattr *nla, u32 objtype, u8 genmask); void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, u16 flags, int family, int report, gfp_t gfp); /** * struct nft_object_type - stateful object type * * @select_ops: function to select nft_object_ops * @ops: default ops, used when no select_ops functions is present * @list: list node in list of object types * @type: stateful object numeric type * @owner: module owner * @maxattr: maximum netlink attribute * @family: address family for AF-specific object types * @policy: netlink attribute policy */ struct nft_object_type { const struct nft_object_ops *(*select_ops)(const struct nft_ctx *, const struct nlattr * const tb[]); const struct nft_object_ops *ops; struct list_head list; u32 type; unsigned int maxattr; u8 family; struct module *owner; const struct nla_policy *policy; }; /** * struct nft_object_ops - stateful object operations * * @eval: stateful object evaluation function * @size: stateful object size * @init: initialize object from netlink attributes * @destroy: release existing stateful object * @dump: netlink dump stateful object * @update: update stateful object * @type: pointer to object type */ struct nft_object_ops { void (*eval)(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt); unsigned int size; int (*init)(const struct nft_ctx *ctx, const struct nlattr *const tb[], struct nft_object *obj); void (*destroy)(const struct nft_ctx *ctx, struct nft_object *obj); int (*dump)(struct sk_buff *skb, struct nft_object *obj, bool reset); void (*update)(struct nft_object *obj, struct nft_object *newobj); const struct nft_object_type *type; }; int nft_register_obj(struct nft_object_type *obj_type); void nft_unregister_obj(struct nft_object_type *obj_type); #define NFT_NETDEVICE_MAX 256 /** * struct nft_flowtable - nf_tables flow table * * @list: flow table list node in table list * @table: the table the flow table is contained in * @name: name of this flow table * @hooknum: hook number * @ops_len: number of hooks in array * @genmask: generation mask * @use: number of references to this flow table * @handle: unique object handle * @hook_list: hook list for hooks per net_device in flowtables * @data: rhashtable and garbage collector */ struct nft_flowtable { struct list_head list; struct nft_table *table; char *name; int hooknum; int ops_len; u32 genmask:2; u32 use; u64 handle; /* runtime data below here */ struct list_head hook_list ____cacheline_aligned; struct nf_flowtable data; }; struct nft_flowtable *nft_flowtable_lookup(const struct net *net, const struct nft_table *table, const struct nlattr *nla, u8 genmask); void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, struct nft_flowtable *flowtable, enum nft_trans_phase phase); void nft_register_flowtable_type(struct nf_flowtable_type *type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type); /** * struct nft_traceinfo - nft tracing information and state * * @trace: other struct members are initialised * @nf_trace: copy of skb->nf_trace before rule evaluation * @type: event type (enum nft_trace_types) * @skbid: hash of skb to be used as trace id * @packet_dumped: packet headers sent in a previous traceinfo message * @basechain: base chain currently processed */ struct nft_traceinfo { bool trace; bool nf_trace; bool packet_dumped; enum nft_trace_types type:8; u32 skbid; const struct nft_base_chain *basechain; }; void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, const struct nft_chain *basechain); void nft_trace_notify(const struct nft_pktinfo *pkt, const struct nft_verdict *verdict, const struct nft_rule_dp *rule, struct nft_traceinfo *info); #define MODULE_ALIAS_NFT_CHAIN(family, name) \ MODULE_ALIAS("nft-chain-" __stringify(family) "-" name) #define MODULE_ALIAS_NFT_AF_EXPR(family, name) \ MODULE_ALIAS("nft-expr-" __stringify(family) "-" name) #define MODULE_ALIAS_NFT_EXPR(name) \ MODULE_ALIAS("nft-expr-" name) #define MODULE_ALIAS_NFT_OBJ(type) \ MODULE_ALIAS("nft-obj-" __stringify(type)) #if IS_ENABLED(CONFIG_NF_TABLES) /* * The gencursor defines two generations, the currently active and the * next one. Objects contain a bitmask of 2 bits specifying the generations * they're active in. A set bit means they're inactive in the generation * represented by that bit. * * New objects start out as inactive in the current and active in the * next generation. When committing the ruleset the bitmask is cleared, * meaning they're active in all generations. When removing an object, * it is set inactive in the next generation. After committing the ruleset, * the objects are removed. */ static inline unsigned int nft_gencursor_next(const struct net *net) { return net->nft.gencursor + 1 == 1 ? 1 : 0; } static inline u8 nft_genmask_next(const struct net *net) { return 1 << nft_gencursor_next(net); } static inline u8 nft_genmask_cur(const struct net *net) { /* Use READ_ONCE() to prevent refetching the value for atomicity */ return 1 << READ_ONCE(net->nft.gencursor); } #define NFT_GENMASK_ANY ((1 << 0) | (1 << 1)) /* * Generic transaction helpers */ /* Check if this object is currently active. */ #define nft_is_active(__net, __obj) \ (((__obj)->genmask & nft_genmask_cur(__net)) == 0) /* Check if this object is active in the next generation. */ #define nft_is_active_next(__net, __obj) \ (((__obj)->genmask & nft_genmask_next(__net)) == 0) /* This object becomes active in the next generation. */ #define nft_activate_next(__net, __obj) \ (__obj)->genmask = nft_genmask_cur(__net) /* This object becomes inactive in the next generation. */ #define nft_deactivate_next(__net, __obj) \ (__obj)->genmask = nft_genmask_next(__net) /* After committing the ruleset, clear the stale generation bit. */ #define nft_clear(__net, __obj) \ (__obj)->genmask &= ~nft_genmask_next(__net) #define nft_active_genmask(__obj, __genmask) \ !((__obj)->genmask & __genmask) /* * Set element transaction helpers */ static inline bool nft_set_elem_active(const struct nft_set_ext *ext, u8 genmask) { return !(ext->genmask & genmask); } static inline void nft_set_elem_change_active(const struct net *net, const struct nft_set *set, struct nft_set_ext *ext) { ext->genmask ^= nft_genmask_next(net); } #endif /* IS_ENABLED(CONFIG_NF_TABLES) */ #define NFT_SET_ELEM_DEAD_MASK (1 << 2) #if defined(__LITTLE_ENDIAN_BITFIELD) #define NFT_SET_ELEM_DEAD_BIT 2 #elif defined(__BIG_ENDIAN_BITFIELD) #define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) #else #error #endif static inline void nft_set_elem_dead(struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); set_bit(NFT_SET_ELEM_DEAD_BIT, word); } static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); return test_bit(NFT_SET_ELEM_DEAD_BIT, word); } /** * struct nft_trans - nf_tables object update in transaction * * @list: used internally * @net: struct net * @table: struct nft_table the object resides in * @msg_type: message type * @seq: netlink sequence number * @flags: modifiers to new request * @report: notify via unicast netlink message * @put_net: net needs to be put * * This is the information common to all objects in the transaction, * this must always be the first member of derived sub-types. */ struct nft_trans { struct list_head list; struct net *net; struct nft_table *table; int msg_type; u32 seq; u16 flags; u8 report:1; u8 put_net:1; }; /** * struct nft_trans_binding - nf_tables object with binding support in transaction * @nft_trans: base structure, MUST be first member * @binding_list: list of objects with possible bindings * * This is the base type used by objects that can be bound to a chain. */ struct nft_trans_binding { struct nft_trans nft_trans; struct list_head binding_list; }; struct nft_trans_rule { struct nft_trans nft_trans; struct nft_rule *rule; struct nft_chain *chain; struct nft_flow_rule *flow; u32 rule_id; bool bound; }; #define nft_trans_container_rule(trans) \ container_of(trans, struct nft_trans_rule, nft_trans) #define nft_trans_rule(trans) \ nft_trans_container_rule(trans)->rule #define nft_trans_flow_rule(trans) \ nft_trans_container_rule(trans)->flow #define nft_trans_rule_id(trans) \ nft_trans_container_rule(trans)->rule_id #define nft_trans_rule_bound(trans) \ nft_trans_container_rule(trans)->bound #define nft_trans_rule_chain(trans) \ nft_trans_container_rule(trans)->chain struct nft_trans_set { struct nft_trans_binding nft_trans_binding; struct list_head list_trans_newset; struct nft_set *set; u32 set_id; u32 gc_int; u64 timeout; bool update; bool bound; u32 size; }; #define nft_trans_container_set(t) \ container_of(t, struct nft_trans_set, nft_trans_binding.nft_trans) #define nft_trans_set(trans) \ nft_trans_container_set(trans)->set #define nft_trans_set_id(trans) \ nft_trans_container_set(trans)->set_id #define nft_trans_set_bound(trans) \ nft_trans_container_set(trans)->bound #define nft_trans_set_update(trans) \ nft_trans_container_set(trans)->update #define nft_trans_set_timeout(trans) \ nft_trans_container_set(trans)->timeout #define nft_trans_set_gc_int(trans) \ nft_trans_container_set(trans)->gc_int #define nft_trans_set_size(trans) \ nft_trans_container_set(trans)->size struct nft_trans_chain { struct nft_trans_binding nft_trans_binding; struct nft_chain *chain; char *name; struct nft_stats __percpu *stats; u8 policy; bool update; bool bound; u32 chain_id; struct nft_base_chain *basechain; struct list_head hook_list; }; #define nft_trans_container_chain(t) \ container_of(t, struct nft_trans_chain, nft_trans_binding.nft_trans) #define nft_trans_chain(trans) \ nft_trans_container_chain(trans)->chain #define nft_trans_chain_update(trans) \ nft_trans_container_chain(trans)->update #define nft_trans_chain_name(trans) \ nft_trans_container_chain(trans)->name #define nft_trans_chain_stats(trans) \ nft_trans_container_chain(trans)->stats #define nft_trans_chain_policy(trans) \ nft_trans_container_chain(trans)->policy #define nft_trans_chain_bound(trans) \ nft_trans_container_chain(trans)->bound #define nft_trans_chain_id(trans) \ nft_trans_container_chain(trans)->chain_id #define nft_trans_basechain(trans) \ nft_trans_container_chain(trans)->basechain #define nft_trans_chain_hooks(trans) \ nft_trans_container_chain(trans)->hook_list struct nft_trans_table { struct nft_trans nft_trans; bool update; }; #define nft_trans_container_table(trans) \ container_of(trans, struct nft_trans_table, nft_trans) #define nft_trans_table_update(trans) \ nft_trans_container_table(trans)->update enum nft_trans_elem_flags { NFT_TRANS_UPD_TIMEOUT = (1 << 0), NFT_TRANS_UPD_EXPIRATION = (1 << 1), }; struct nft_elem_update { u64 timeout; u64 expiration; u8 flags; }; struct nft_trans_one_elem { struct nft_elem_priv *priv; struct nft_elem_update *update; }; struct nft_trans_elem { struct nft_trans nft_trans; struct nft_set *set; bool bound; unsigned int nelems; struct nft_trans_one_elem elems[] __counted_by(nelems); }; #define nft_trans_container_elem(t) \ container_of(t, struct nft_trans_elem, nft_trans) #define nft_trans_elem_set(trans) \ nft_trans_container_elem(trans)->set #define nft_trans_elem_set_bound(trans) \ nft_trans_container_elem(trans)->bound struct nft_trans_obj { struct nft_trans nft_trans; struct nft_object *obj; struct nft_object *newobj; bool update; }; #define nft_trans_container_obj(t) \ container_of(t, struct nft_trans_obj, nft_trans) #define nft_trans_obj(trans) \ nft_trans_container_obj(trans)->obj #define nft_trans_obj_newobj(trans) \ nft_trans_container_obj(trans)->newobj #define nft_trans_obj_update(trans) \ nft_trans_container_obj(trans)->update struct nft_trans_flowtable { struct nft_trans nft_trans; struct nft_flowtable *flowtable; struct list_head hook_list; u32 flags; bool update; }; #define nft_trans_container_flowtable(t) \ container_of(t, struct nft_trans_flowtable, nft_trans) #define nft_trans_flowtable(trans) \ nft_trans_container_flowtable(trans)->flowtable #define nft_trans_flowtable_update(trans) \ nft_trans_container_flowtable(trans)->update #define nft_trans_flowtable_hooks(trans) \ nft_trans_container_flowtable(trans)->hook_list #define nft_trans_flowtable_flags(trans) \ nft_trans_container_flowtable(trans)->flags #define NFT_TRANS_GC_BATCHCOUNT 256 struct nft_trans_gc { struct list_head list; struct net *net; struct nft_set *set; u32 seq; u16 count; struct nft_elem_priv *priv[NFT_TRANS_GC_BATCHCOUNT]; struct rcu_head rcu; }; static inline void nft_ctx_update(struct nft_ctx *ctx, const struct nft_trans *trans) { switch (trans->msg_type) { case NFT_MSG_NEWRULE: case NFT_MSG_DELRULE: case NFT_MSG_DESTROYRULE: ctx->chain = nft_trans_rule_chain(trans); break; case NFT_MSG_NEWCHAIN: case NFT_MSG_DELCHAIN: case NFT_MSG_DESTROYCHAIN: ctx->chain = nft_trans_chain(trans); break; default: ctx->chain = NULL; break; } ctx->net = trans->net; ctx->table = trans->table; ctx->family = trans->table->family; ctx->report = trans->report; ctx->flags = trans->flags; ctx->seq = trans->seq; } struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, unsigned int gc_seq, gfp_t gfp); void nft_trans_gc_destroy(struct nft_trans_gc *trans); struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, unsigned int gc_seq, gfp_t gfp); void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc); struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp); void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, unsigned int gc_seq); struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc); void nft_setelem_data_deactivate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv); int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); void __init nft_chain_route_init(void); void nft_chain_route_fini(void); void nf_tables_trans_destroy_flush_work(void); int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result); __be64 nf_jiffies64_to_msecs(u64 input); #ifdef CONFIG_MODULES __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, ...); #else static inline int nft_request_module(struct net *net, const char *fmt, ...) { return -ENOENT; } #endif struct nftables_pernet { struct list_head tables; struct list_head commit_list; struct list_head commit_set_list; struct list_head binding_list; struct list_head module_list; struct list_head notify_list; struct mutex commit_mutex; u64 table_handle; u64 tstamp; unsigned int base_seq; unsigned int gc_seq; u8 validate_state; }; extern unsigned int nf_tables_net_id; static inline struct nftables_pernet *nft_pernet(const struct net *net) { return net_generic(net, nf_tables_net_id); } static inline u64 nft_net_tstamp(const struct net *net) { return nft_pernet(net)->tstamp; } #define __NFT_REDUCE_READONLY 1UL #define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY static inline bool nft_reduce_is_readonly(const struct nft_expr *expr) { return expr->ops->reduce == NFT_REDUCE_READONLY; } void nft_reg_track_update(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg, u8 len); void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len); void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg); static inline bool nft_reg_track_cmp(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg) { return track->regs[dreg].selector && track->regs[dreg].selector->ops == expr->ops && track->regs[dreg].num_reg == 0; } #endif /* _NET_NF_TABLES_H */
11 11 11 11 9 3 3 3 2 2 3 3 3 1 5 11 7 3 3 3 3 1 3 2 2 11 8 3 8 11 2 1 1 1 4 4 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 // SPDX-License-Identifier: GPL-2.0-or-later /* * The AEGIS-128 Authenticated-Encryption Algorithm * * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. */ #include <crypto/algapi.h> #include <crypto/internal/aead.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/err.h> #include <linux/init.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/scatterlist.h> #include <asm/simd.h> #include "aegis.h" #define AEGIS128_NONCE_SIZE 16 #define AEGIS128_STATE_BLOCKS 5 #define AEGIS128_KEY_SIZE 16 #define AEGIS128_MIN_AUTH_SIZE 8 #define AEGIS128_MAX_AUTH_SIZE 16 struct aegis_state { union aegis_block blocks[AEGIS128_STATE_BLOCKS]; }; struct aegis_ctx { union aegis_block key; }; static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_simd); static const union aegis_block crypto_aegis_const[2] = { { .words64 = { cpu_to_le64(U64_C(0x0d08050302010100)), cpu_to_le64(U64_C(0x6279e99059372215)), } }, { .words64 = { cpu_to_le64(U64_C(0xf12fc26d55183ddb)), cpu_to_le64(U64_C(0xdd28b57342311120)), } }, }; static bool aegis128_do_simd(void) { #ifdef CONFIG_CRYPTO_AEGIS128_SIMD if (static_branch_likely(&have_simd)) return crypto_simd_usable(); #endif return false; } static void crypto_aegis128_update(struct aegis_state *state) { union aegis_block tmp; unsigned int i; tmp = state->blocks[AEGIS128_STATE_BLOCKS - 1]; for (i = AEGIS128_STATE_BLOCKS - 1; i > 0; i--) crypto_aegis_aesenc(&state->blocks[i], &state->blocks[i - 1], &state->blocks[i]); crypto_aegis_aesenc(&state->blocks[0], &tmp, &state->blocks[0]); } static void crypto_aegis128_update_a(struct aegis_state *state, const union aegis_block *msg, bool do_simd) { if (IS_ENABLED(CONFIG_CRYPTO_AEGIS128_SIMD) && do_simd) { crypto_aegis128_update_simd(state, msg); return; } crypto_aegis128_update(state); crypto_aegis_block_xor(&state->blocks[0], msg); } static void crypto_aegis128_update_u(struct aegis_state *state, const void *msg, bool do_simd) { if (IS_ENABLED(CONFIG_CRYPTO_AEGIS128_SIMD) && do_simd) { crypto_aegis128_update_simd(state, msg); return; } crypto_aegis128_update(state); crypto_xor(state->blocks[0].bytes, msg, AEGIS_BLOCK_SIZE); } static void crypto_aegis128_init(struct aegis_state *state, const union aegis_block *key, const u8 *iv) { union aegis_block key_iv; unsigned int i; key_iv = *key; crypto_xor(key_iv.bytes, iv, AEGIS_BLOCK_SIZE); state->blocks[0] = key_iv; state->blocks[1] = crypto_aegis_const[1]; state->blocks[2] = crypto_aegis_const[0]; state->blocks[3] = *key; state->blocks[4] = *key; crypto_aegis_block_xor(&state->blocks[3], &crypto_aegis_const[0]); crypto_aegis_block_xor(&state->blocks[4], &crypto_aegis_const[1]); for (i = 0; i < 5; i++) { crypto_aegis128_update_a(state, key, false); crypto_aegis128_update_a(state, &key_iv, false); } } static void crypto_aegis128_ad(struct aegis_state *state, const u8 *src, unsigned int size, bool do_simd) { if (AEGIS_ALIGNED(src)) { const union aegis_block *src_blk = (const union aegis_block *)src; while (size >= AEGIS_BLOCK_SIZE) { crypto_aegis128_update_a(state, src_blk, do_simd); size -= AEGIS_BLOCK_SIZE; src_blk++; } } else { while (size >= AEGIS_BLOCK_SIZE) { crypto_aegis128_update_u(state, src, do_simd); size -= AEGIS_BLOCK_SIZE; src += AEGIS_BLOCK_SIZE; } } } static void crypto_aegis128_wipe_chunk(struct aegis_state *state, u8 *dst, const u8 *src, unsigned int size) { memzero_explicit(dst, size); } static void crypto_aegis128_encrypt_chunk(struct aegis_state *state, u8 *dst, const u8 *src, unsigned int size) { union aegis_block tmp; if (AEGIS_ALIGNED(src) && AEGIS_ALIGNED(dst)) { while (size >= AEGIS_BLOCK_SIZE) { union aegis_block *dst_blk = (union aegis_block *)dst; const union aegis_block *src_blk = (const union aegis_block *)src; tmp = state->blocks[2]; crypto_aegis_block_and(&tmp, &state->blocks[3]); crypto_aegis_block_xor(&tmp, &state->blocks[4]); crypto_aegis_block_xor(&tmp, &state->blocks[1]); crypto_aegis_block_xor(&tmp, src_blk); crypto_aegis128_update_a(state, src_blk, false); *dst_blk = tmp; size -= AEGIS_BLOCK_SIZE; src += AEGIS_BLOCK_SIZE; dst += AEGIS_BLOCK_SIZE; } } else { while (size >= AEGIS_BLOCK_SIZE) { tmp = state->blocks[2]; crypto_aegis_block_and(&tmp, &state->blocks[3]); crypto_aegis_block_xor(&tmp, &state->blocks[4]); crypto_aegis_block_xor(&tmp, &state->blocks[1]); crypto_xor(tmp.bytes, src, AEGIS_BLOCK_SIZE); crypto_aegis128_update_u(state, src, false); memcpy(dst, tmp.bytes, AEGIS_BLOCK_SIZE); size -= AEGIS_BLOCK_SIZE; src += AEGIS_BLOCK_SIZE; dst += AEGIS_BLOCK_SIZE; } } if (size > 0) { union aegis_block msg = {}; memcpy(msg.bytes, src, size); tmp = state->blocks[2]; crypto_aegis_block_and(&tmp, &state->blocks[3]); crypto_aegis_block_xor(&tmp, &state->blocks[4]); crypto_aegis_block_xor(&tmp, &state->blocks[1]); crypto_aegis128_update_a(state, &msg, false); crypto_aegis_block_xor(&msg, &tmp); memcpy(dst, msg.bytes, size); } } static void crypto_aegis128_decrypt_chunk(struct aegis_state *state, u8 *dst, const u8 *src, unsigned int size) { union aegis_block tmp; if (AEGIS_ALIGNED(src) && AEGIS_ALIGNED(dst)) { while (size >= AEGIS_BLOCK_SIZE) { union aegis_block *dst_blk = (union aegis_block *)dst; const union aegis_block *src_blk = (const union aegis_block *)src; tmp = state->blocks[2]; crypto_aegis_block_and(&tmp, &state->blocks[3]); crypto_aegis_block_xor(&tmp, &state->blocks[4]); crypto_aegis_block_xor(&tmp, &state->blocks[1]); crypto_aegis_block_xor(&tmp, src_blk); crypto_aegis128_update_a(state, &tmp, false); *dst_blk = tmp; size -= AEGIS_BLOCK_SIZE; src += AEGIS_BLOCK_SIZE; dst += AEGIS_BLOCK_SIZE; } } else { while (size >= AEGIS_BLOCK_SIZE) { tmp = state->blocks[2]; crypto_aegis_block_and(&tmp, &state->blocks[3]); crypto_aegis_block_xor(&tmp, &state->blocks[4]); crypto_aegis_block_xor(&tmp, &state->blocks[1]); crypto_xor(tmp.bytes, src, AEGIS_BLOCK_SIZE); crypto_aegis128_update_a(state, &tmp, false); memcpy(dst, tmp.bytes, AEGIS_BLOCK_SIZE); size -= AEGIS_BLOCK_SIZE; src += AEGIS_BLOCK_SIZE; dst += AEGIS_BLOCK_SIZE; } } if (size > 0) { union aegis_block msg = {}; memcpy(msg.bytes, src, size); tmp = state->blocks[2]; crypto_aegis_block_and(&tmp, &state->blocks[3]); crypto_aegis_block_xor(&tmp, &state->blocks[4]); crypto_aegis_block_xor(&tmp, &state->blocks[1]); crypto_aegis_block_xor(&msg, &tmp); memset(msg.bytes + size, 0, AEGIS_BLOCK_SIZE - size); crypto_aegis128_update_a(state, &msg, false); memcpy(dst, msg.bytes, size); } } static void crypto_aegis128_process_ad(struct aegis_state *state, struct scatterlist *sg_src, unsigned int assoclen, bool do_simd) { struct scatter_walk walk; union aegis_block buf; unsigned int pos = 0; scatterwalk_start(&walk, sg_src); while (assoclen != 0) { unsigned int size = scatterwalk_clamp(&walk, assoclen); unsigned int left = size; void *mapped = scatterwalk_map(&walk); const u8 *src = (const u8 *)mapped; if (pos + size >= AEGIS_BLOCK_SIZE) { if (pos > 0) { unsigned int fill = AEGIS_BLOCK_SIZE - pos; memcpy(buf.bytes + pos, src, fill); crypto_aegis128_update_a(state, &buf, do_simd); pos = 0; left -= fill; src += fill; } crypto_aegis128_ad(state, src, left, do_simd); src += left & ~(AEGIS_BLOCK_SIZE - 1); left &= AEGIS_BLOCK_SIZE - 1; } memcpy(buf.bytes + pos, src, left); pos += left; assoclen -= size; scatterwalk_unmap(mapped); scatterwalk_advance(&walk, size); scatterwalk_done(&walk, 0, assoclen); } if (pos > 0) { memset(buf.bytes + pos, 0, AEGIS_BLOCK_SIZE - pos); crypto_aegis128_update_a(state, &buf, do_simd); } } static __always_inline int crypto_aegis128_process_crypt(struct aegis_state *state, struct skcipher_walk *walk, void (*crypt)(struct aegis_state *state, u8 *dst, const u8 *src, unsigned int size)) { int err = 0; while (walk->nbytes) { unsigned int nbytes = walk->nbytes; if (nbytes < walk->total) nbytes = round_down(nbytes, walk->stride); crypt(state, walk->dst.virt.addr, walk->src.virt.addr, nbytes); err = skcipher_walk_done(walk, walk->nbytes - nbytes); } return err; } static void crypto_aegis128_final(struct aegis_state *state, union aegis_block *tag_xor, u64 assoclen, u64 cryptlen) { u64 assocbits = assoclen * 8; u64 cryptbits = cryptlen * 8; union aegis_block tmp; unsigned int i; tmp.words64[0] = cpu_to_le64(assocbits); tmp.words64[1] = cpu_to_le64(cryptbits); crypto_aegis_block_xor(&tmp, &state->blocks[3]); for (i = 0; i < 7; i++) crypto_aegis128_update_a(state, &tmp, false); for (i = 0; i < AEGIS128_STATE_BLOCKS; i++) crypto_aegis_block_xor(tag_xor, &state->blocks[i]); } static int crypto_aegis128_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { struct aegis_ctx *ctx = crypto_aead_ctx(aead); if (keylen != AEGIS128_KEY_SIZE) return -EINVAL; memcpy(ctx->key.bytes, key, AEGIS128_KEY_SIZE); return 0; } static int crypto_aegis128_setauthsize(struct crypto_aead *tfm, unsigned int authsize) { if (authsize > AEGIS128_MAX_AUTH_SIZE) return -EINVAL; if (authsize < AEGIS128_MIN_AUTH_SIZE) return -EINVAL; return 0; } static int crypto_aegis128_encrypt_generic(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); union aegis_block tag = {}; unsigned int authsize = crypto_aead_authsize(tfm); struct aegis_ctx *ctx = crypto_aead_ctx(tfm); unsigned int cryptlen = req->cryptlen; struct skcipher_walk walk; struct aegis_state state; skcipher_walk_aead_encrypt(&walk, req, false); crypto_aegis128_init(&state, &ctx->key, req->iv); crypto_aegis128_process_ad(&state, req->src, req->assoclen, false); crypto_aegis128_process_crypt(&state, &walk, crypto_aegis128_encrypt_chunk); crypto_aegis128_final(&state, &tag, req->assoclen, cryptlen); scatterwalk_map_and_copy(tag.bytes, req->dst, req->assoclen + cryptlen, authsize, 1); return 0; } static int crypto_aegis128_decrypt_generic(struct aead_request *req) { static const u8 zeros[AEGIS128_MAX_AUTH_SIZE] = {}; struct crypto_aead *tfm = crypto_aead_reqtfm(req); union aegis_block tag; unsigned int authsize = crypto_aead_authsize(tfm); unsigned int cryptlen = req->cryptlen - authsize; struct aegis_ctx *ctx = crypto_aead_ctx(tfm); struct skcipher_walk walk; struct aegis_state state; scatterwalk_map_and_copy(tag.bytes, req->src, req->assoclen + cryptlen, authsize, 0); skcipher_walk_aead_decrypt(&walk, req, false); crypto_aegis128_init(&state, &ctx->key, req->iv); crypto_aegis128_process_ad(&state, req->src, req->assoclen, false); crypto_aegis128_process_crypt(&state, &walk, crypto_aegis128_decrypt_chunk); crypto_aegis128_final(&state, &tag, req->assoclen, cryptlen); if (unlikely(crypto_memneq(tag.bytes, zeros, authsize))) { /* * From Chapter 4. 'Security Analysis' of the AEGIS spec [0] * * "3. If verification fails, the decrypted plaintext and the * wrong authentication tag should not be given as output." * * [0] https://competitions.cr.yp.to/round3/aegisv11.pdf */ skcipher_walk_aead_decrypt(&walk, req, false); crypto_aegis128_process_crypt(NULL, &walk, crypto_aegis128_wipe_chunk); memzero_explicit(&tag, sizeof(tag)); return -EBADMSG; } return 0; } static int crypto_aegis128_encrypt_simd(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); union aegis_block tag = {}; unsigned int authsize = crypto_aead_authsize(tfm); struct aegis_ctx *ctx = crypto_aead_ctx(tfm); unsigned int cryptlen = req->cryptlen; struct skcipher_walk walk; struct aegis_state state; if (!aegis128_do_simd()) return crypto_aegis128_encrypt_generic(req); skcipher_walk_aead_encrypt(&walk, req, false); crypto_aegis128_init_simd(&state, &ctx->key, req->iv); crypto_aegis128_process_ad(&state, req->src, req->assoclen, true); crypto_aegis128_process_crypt(&state, &walk, crypto_aegis128_encrypt_chunk_simd); crypto_aegis128_final_simd(&state, &tag, req->assoclen, cryptlen, 0); scatterwalk_map_and_copy(tag.bytes, req->dst, req->assoclen + cryptlen, authsize, 1); return 0; } static int crypto_aegis128_decrypt_simd(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); union aegis_block tag; unsigned int authsize = crypto_aead_authsize(tfm); unsigned int cryptlen = req->cryptlen - authsize; struct aegis_ctx *ctx = crypto_aead_ctx(tfm); struct skcipher_walk walk; struct aegis_state state; if (!aegis128_do_simd()) return crypto_aegis128_decrypt_generic(req); scatterwalk_map_and_copy(tag.bytes, req->src, req->assoclen + cryptlen, authsize, 0); skcipher_walk_aead_decrypt(&walk, req, false); crypto_aegis128_init_simd(&state, &ctx->key, req->iv); crypto_aegis128_process_ad(&state, req->src, req->assoclen, true); crypto_aegis128_process_crypt(&state, &walk, crypto_aegis128_decrypt_chunk_simd); if (unlikely(crypto_aegis128_final_simd(&state, &tag, req->assoclen, cryptlen, authsize))) { skcipher_walk_aead_decrypt(&walk, req, false); crypto_aegis128_process_crypt(NULL, &walk, crypto_aegis128_wipe_chunk); return -EBADMSG; } return 0; } static struct aead_alg crypto_aegis128_alg_generic = { .setkey = crypto_aegis128_setkey, .setauthsize = crypto_aegis128_setauthsize, .encrypt = crypto_aegis128_encrypt_generic, .decrypt = crypto_aegis128_decrypt_generic, .ivsize = AEGIS128_NONCE_SIZE, .maxauthsize = AEGIS128_MAX_AUTH_SIZE, .chunksize = AEGIS_BLOCK_SIZE, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aegis_ctx), .base.cra_alignmask = 0, .base.cra_priority = 100, .base.cra_name = "aegis128", .base.cra_driver_name = "aegis128-generic", .base.cra_module = THIS_MODULE, }; static struct aead_alg crypto_aegis128_alg_simd = { .setkey = crypto_aegis128_setkey, .setauthsize = crypto_aegis128_setauthsize, .encrypt = crypto_aegis128_encrypt_simd, .decrypt = crypto_aegis128_decrypt_simd, .ivsize = AEGIS128_NONCE_SIZE, .maxauthsize = AEGIS128_MAX_AUTH_SIZE, .chunksize = AEGIS_BLOCK_SIZE, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aegis_ctx), .base.cra_alignmask = 0, .base.cra_priority = 200, .base.cra_name = "aegis128", .base.cra_driver_name = "aegis128-simd", .base.cra_module = THIS_MODULE, }; static int __init crypto_aegis128_module_init(void) { int ret; ret = crypto_register_aead(&crypto_aegis128_alg_generic); if (ret) return ret; if (IS_ENABLED(CONFIG_CRYPTO_AEGIS128_SIMD) && crypto_aegis128_have_simd()) { ret = crypto_register_aead(&crypto_aegis128_alg_simd); if (ret) { crypto_unregister_aead(&crypto_aegis128_alg_generic); return ret; } static_branch_enable(&have_simd); } return 0; } static void __exit crypto_aegis128_module_exit(void) { if (IS_ENABLED(CONFIG_CRYPTO_AEGIS128_SIMD) && crypto_aegis128_have_simd()) crypto_unregister_aead(&crypto_aegis128_alg_simd); crypto_unregister_aead(&crypto_aegis128_alg_generic); } subsys_initcall(crypto_aegis128_module_init); module_exit(crypto_aegis128_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm"); MODULE_ALIAS_CRYPTO("aegis128"); MODULE_ALIAS_CRYPTO("aegis128-generic"); MODULE_ALIAS_CRYPTO("aegis128-simd");
29 27 166 95 18 15 1 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netfilter.h> #include <linux/rhashtable.h> #include <linux/netdevice.h> #include <linux/tc_act/tc_csum.h> #include <net/flow_offload.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_tuple.h> static struct workqueue_struct *nf_flow_offload_add_wq; static struct workqueue_struct *nf_flow_offload_del_wq; static struct workqueue_struct *nf_flow_offload_stats_wq; struct flow_offload_work { struct list_head list; enum flow_cls_command cmd; struct nf_flowtable *flowtable; struct flow_offload *flow; struct work_struct work; }; #define NF_FLOW_DISSECTOR(__match, __type, __field) \ (__match)->dissector.offset[__type] = \ offsetof(struct nf_flow_key, __field) static void nf_flow_rule_lwt_match(struct nf_flow_match *match, struct ip_tunnel_info *tun_info) { struct nf_flow_key *mask = &match->mask; struct nf_flow_key *key = &match->key; unsigned long long enc_keys; if (!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)) return; NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_CONTROL, enc_control); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); key->enc_key_id.keyid = tunnel_id_to_key32(tun_info->key.tun_id); mask->enc_key_id.keyid = 0xffffffff; enc_keys = BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) | BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL); if (ip_tunnel_info_af(tun_info) == AF_INET) { NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, enc_ipv4); key->enc_ipv4.src = tun_info->key.u.ipv4.dst; key->enc_ipv4.dst = tun_info->key.u.ipv4.src; if (key->enc_ipv4.src) mask->enc_ipv4.src = 0xffffffff; if (key->enc_ipv4.dst) mask->enc_ipv4.dst = 0xffffffff; enc_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS); key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } else { memcpy(&key->enc_ipv6.src, &tun_info->key.u.ipv6.dst, sizeof(struct in6_addr)); memcpy(&key->enc_ipv6.dst, &tun_info->key.u.ipv6.src, sizeof(struct in6_addr)); if (memcmp(&key->enc_ipv6.src, &in6addr_any, sizeof(struct in6_addr))) memset(&mask->enc_ipv6.src, 0xff, sizeof(struct in6_addr)); if (memcmp(&key->enc_ipv6.dst, &in6addr_any, sizeof(struct in6_addr))) memset(&mask->enc_ipv6.dst, 0xff, sizeof(struct in6_addr)); enc_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS); key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } match->dissector.used_keys |= enc_keys; } static void nf_flow_rule_vlan_match(struct flow_dissector_key_vlan *key, struct flow_dissector_key_vlan *mask, u16 vlan_id, __be16 proto) { key->vlan_id = vlan_id; mask->vlan_id = VLAN_VID_MASK; key->vlan_tpid = proto; mask->vlan_tpid = 0xffff; } static int nf_flow_rule_match(struct nf_flow_match *match, const struct flow_offload_tuple *tuple, struct dst_entry *other_dst) { struct nf_flow_key *mask = &match->mask; struct nf_flow_key *key = &match->key; struct ip_tunnel_info *tun_info; bool vlan_encap = false; NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_META, meta); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_BASIC, basic); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp); if (other_dst && other_dst->lwtstate) { tun_info = lwt_tun_info(other_dst->lwtstate); nf_flow_rule_lwt_match(match, tun_info); } if (tuple->xmit_type == FLOW_OFFLOAD_XMIT_TC) key->meta.ingress_ifindex = tuple->tc.iifidx; else key->meta.ingress_ifindex = tuple->iifidx; mask->meta.ingress_ifindex = 0xffffffff; if (tuple->encap_num > 0 && !(tuple->in_vlan_ingress & BIT(0)) && tuple->encap[0].proto == htons(ETH_P_8021Q)) { NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_VLAN, vlan); nf_flow_rule_vlan_match(&key->vlan, &mask->vlan, tuple->encap[0].id, tuple->encap[0].proto); vlan_encap = true; } if (tuple->encap_num > 1 && !(tuple->in_vlan_ingress & BIT(1)) && tuple->encap[1].proto == htons(ETH_P_8021Q)) { if (vlan_encap) { NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CVLAN, cvlan); nf_flow_rule_vlan_match(&key->cvlan, &mask->cvlan, tuple->encap[1].id, tuple->encap[1].proto); } else { NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_VLAN, vlan); nf_flow_rule_vlan_match(&key->vlan, &mask->vlan, tuple->encap[1].id, tuple->encap[1].proto); } } switch (tuple->l3proto) { case AF_INET: key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; key->basic.n_proto = htons(ETH_P_IP); key->ipv4.src = tuple->src_v4.s_addr; mask->ipv4.src = 0xffffffff; key->ipv4.dst = tuple->dst_v4.s_addr; mask->ipv4.dst = 0xffffffff; break; case AF_INET6: key->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; key->basic.n_proto = htons(ETH_P_IPV6); key->ipv6.src = tuple->src_v6; memset(&mask->ipv6.src, 0xff, sizeof(mask->ipv6.src)); key->ipv6.dst = tuple->dst_v6; memset(&mask->ipv6.dst, 0xff, sizeof(mask->ipv6.dst)); break; default: return -EOPNOTSUPP; } mask->control.addr_type = 0xffff; match->dissector.used_keys |= BIT_ULL(key->control.addr_type); mask->basic.n_proto = 0xffff; switch (tuple->l4proto) { case IPPROTO_TCP: key->tcp.flags = 0; mask->tcp.flags = cpu_to_be16(be32_to_cpu(TCP_FLAG_RST | TCP_FLAG_FIN) >> 16); match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_TCP); break; case IPPROTO_UDP: case IPPROTO_GRE: break; default: return -EOPNOTSUPP; } key->basic.ip_proto = tuple->l4proto; mask->basic.ip_proto = 0xff; match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_META) | BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | BIT_ULL(FLOW_DISSECTOR_KEY_BASIC); switch (tuple->l4proto) { case IPPROTO_TCP: case IPPROTO_UDP: key->tp.src = tuple->src_port; mask->tp.src = 0xffff; key->tp.dst = tuple->dst_port; mask->tp.dst = 0xffff; match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); break; } return 0; } static void flow_offload_mangle(struct flow_action_entry *entry, enum flow_action_mangle_base htype, u32 offset, const __be32 *value, const __be32 *mask) { entry->id = FLOW_ACTION_MANGLE; entry->mangle.htype = htype; entry->mangle.offset = offset; memcpy(&entry->mangle.mask, mask, sizeof(u32)); memcpy(&entry->mangle.val, value, sizeof(u32)); } static inline struct flow_action_entry * flow_action_entry_next(struct nf_flow_rule *flow_rule) { int i = flow_rule->rule->action.num_entries++; return &flow_rule->rule->action.entries[i]; } static int flow_offload_eth_src(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule); struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule); const struct flow_offload_tuple *other_tuple, *this_tuple; struct net_device *dev = NULL; const unsigned char *addr; u32 mask, val; u16 val16; this_tuple = &flow->tuplehash[dir].tuple; switch (this_tuple->xmit_type) { case FLOW_OFFLOAD_XMIT_DIRECT: addr = this_tuple->out.h_source; break; case FLOW_OFFLOAD_XMIT_NEIGH: other_tuple = &flow->tuplehash[!dir].tuple; dev = dev_get_by_index(net, other_tuple->iifidx); if (!dev) return -ENOENT; addr = dev->dev_addr; break; default: return -EOPNOTSUPP; } mask = ~0xffff0000; memcpy(&val16, addr, 2); val = val16 << 16; flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4, &val, &mask); mask = ~0xffffffff; memcpy(&val, addr + 2, 4); flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 8, &val, &mask); dev_put(dev); return 0; } static int flow_offload_eth_dst(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule); struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule); const struct flow_offload_tuple *other_tuple, *this_tuple; const struct dst_entry *dst_cache; unsigned char ha[ETH_ALEN]; struct neighbour *n; const void *daddr; u32 mask, val; u8 nud_state; u16 val16; this_tuple = &flow->tuplehash[dir].tuple; switch (this_tuple->xmit_type) { case FLOW_OFFLOAD_XMIT_DIRECT: ether_addr_copy(ha, this_tuple->out.h_dest); break; case FLOW_OFFLOAD_XMIT_NEIGH: other_tuple = &flow->tuplehash[!dir].tuple; daddr = &other_tuple->src_v4; dst_cache = this_tuple->dst_cache; n = dst_neigh_lookup(dst_cache, daddr); if (!n) return -ENOENT; read_lock_bh(&n->lock); nud_state = n->nud_state; ether_addr_copy(ha, n->ha); read_unlock_bh(&n->lock); neigh_release(n); if (!(nud_state & NUD_VALID)) return -ENOENT; break; default: return -EOPNOTSUPP; } mask = ~0xffffffff; memcpy(&val, ha, 4); flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 0, &val, &mask); mask = ~0x0000ffff; memcpy(&val16, ha + 4, 2); val = val16; flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4, &val, &mask); return 0; } static void flow_offload_ipv4_snat(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { struct flow_action_entry *entry = flow_action_entry_next(flow_rule); u32 mask = ~htonl(0xffffffff); __be32 addr; u32 offset; switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr; offset = offsetof(struct iphdr, saddr); break; case FLOW_OFFLOAD_DIR_REPLY: addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr; offset = offsetof(struct iphdr, daddr); break; default: return; } flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset, &addr, &mask); } static void flow_offload_ipv4_dnat(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { struct flow_action_entry *entry = flow_action_entry_next(flow_rule); u32 mask = ~htonl(0xffffffff); __be32 addr; u32 offset; switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr; offset = offsetof(struct iphdr, daddr); break; case FLOW_OFFLOAD_DIR_REPLY: addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr; offset = offsetof(struct iphdr, saddr); break; default: return; } flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset, &addr, &mask); } static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule, unsigned int offset, const __be32 *addr, const __be32 *mask) { struct flow_action_entry *entry; int i; for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++) { entry = flow_action_entry_next(flow_rule); flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6, offset + i * sizeof(u32), &addr[i], mask); } } static void flow_offload_ipv6_snat(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { u32 mask = ~htonl(0xffffffff); const __be32 *addr; u32 offset; switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6.s6_addr32; offset = offsetof(struct ipv6hdr, saddr); break; case FLOW_OFFLOAD_DIR_REPLY: addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6.s6_addr32; offset = offsetof(struct ipv6hdr, daddr); break; default: return; } flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask); } static void flow_offload_ipv6_dnat(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { u32 mask = ~htonl(0xffffffff); const __be32 *addr; u32 offset; switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6.s6_addr32; offset = offsetof(struct ipv6hdr, daddr); break; case FLOW_OFFLOAD_DIR_REPLY: addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6.s6_addr32; offset = offsetof(struct ipv6hdr, saddr); break; default: return; } flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask); } static int flow_offload_l4proto(const struct flow_offload *flow) { u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto; u8 type = 0; switch (protonum) { case IPPROTO_TCP: type = FLOW_ACT_MANGLE_HDR_TYPE_TCP; break; case IPPROTO_UDP: type = FLOW_ACT_MANGLE_HDR_TYPE_UDP; break; default: break; } return type; } static void flow_offload_port_snat(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { struct flow_action_entry *entry = flow_action_entry_next(flow_rule); u32 mask, port; u32 offset; switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port); offset = 0; /* offsetof(struct tcphdr, source); */ port = htonl(port << 16); mask = ~htonl(0xffff0000); break; case FLOW_OFFLOAD_DIR_REPLY: port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port); offset = 0; /* offsetof(struct tcphdr, dest); */ port = htonl(port); mask = ~htonl(0xffff); break; default: return; } flow_offload_mangle(entry, flow_offload_l4proto(flow), offset, &port, &mask); } static void flow_offload_port_dnat(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { struct flow_action_entry *entry = flow_action_entry_next(flow_rule); u32 mask, port; u32 offset; switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port); offset = 0; /* offsetof(struct tcphdr, dest); */ port = htonl(port); mask = ~htonl(0xffff); break; case FLOW_OFFLOAD_DIR_REPLY: port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port); offset = 0; /* offsetof(struct tcphdr, source); */ port = htonl(port << 16); mask = ~htonl(0xffff0000); break; default: return; } flow_offload_mangle(entry, flow_offload_l4proto(flow), offset, &port, &mask); } static void flow_offload_ipv4_checksum(struct net *net, const struct flow_offload *flow, struct nf_flow_rule *flow_rule) { u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto; struct flow_action_entry *entry = flow_action_entry_next(flow_rule); entry->id = FLOW_ACTION_CSUM; entry->csum_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR; switch (protonum) { case IPPROTO_TCP: entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_TCP; break; case IPPROTO_UDP: entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_UDP; break; } } static void flow_offload_redirect(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { const struct flow_offload_tuple *this_tuple, *other_tuple; struct flow_action_entry *entry; struct net_device *dev; int ifindex; this_tuple = &flow->tuplehash[dir].tuple; switch (this_tuple->xmit_type) { case FLOW_OFFLOAD_XMIT_DIRECT: this_tuple = &flow->tuplehash[dir].tuple; ifindex = this_tuple->out.hw_ifidx; break; case FLOW_OFFLOAD_XMIT_NEIGH: other_tuple = &flow->tuplehash[!dir].tuple; ifindex = other_tuple->iifidx; break; default: return; } dev = dev_get_by_index(net, ifindex); if (!dev) return; entry = flow_action_entry_next(flow_rule); entry->id = FLOW_ACTION_REDIRECT; entry->dev = dev; } static void flow_offload_encap_tunnel(const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { const struct flow_offload_tuple *this_tuple; struct flow_action_entry *entry; struct dst_entry *dst; this_tuple = &flow->tuplehash[dir].tuple; if (this_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) return; dst = this_tuple->dst_cache; if (dst && dst->lwtstate) { struct ip_tunnel_info *tun_info; tun_info = lwt_tun_info(dst->lwtstate); if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) { entry = flow_action_entry_next(flow_rule); entry->id = FLOW_ACTION_TUNNEL_ENCAP; entry->tunnel = tun_info; } } } static void flow_offload_decap_tunnel(const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { const struct flow_offload_tuple *other_tuple; struct flow_action_entry *entry; struct dst_entry *dst; other_tuple = &flow->tuplehash[!dir].tuple; if (other_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) return; dst = other_tuple->dst_cache; if (dst && dst->lwtstate) { struct ip_tunnel_info *tun_info; tun_info = lwt_tun_info(dst->lwtstate); if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) { entry = flow_action_entry_next(flow_rule); entry->id = FLOW_ACTION_TUNNEL_DECAP; } } } static int nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { const struct flow_offload_tuple *other_tuple; const struct flow_offload_tuple *tuple; int i; flow_offload_decap_tunnel(flow, dir, flow_rule); flow_offload_encap_tunnel(flow, dir, flow_rule); if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) return -1; tuple = &flow->tuplehash[dir].tuple; for (i = 0; i < tuple->encap_num; i++) { struct flow_action_entry *entry; if (tuple->in_vlan_ingress & BIT(i)) continue; if (tuple->encap[i].proto == htons(ETH_P_8021Q)) { entry = flow_action_entry_next(flow_rule); entry->id = FLOW_ACTION_VLAN_POP; } } other_tuple = &flow->tuplehash[!dir].tuple; for (i = 0; i < other_tuple->encap_num; i++) { struct flow_action_entry *entry; if (other_tuple->in_vlan_ingress & BIT(i)) continue; entry = flow_action_entry_next(flow_rule); switch (other_tuple->encap[i].proto) { case htons(ETH_P_PPP_SES): entry->id = FLOW_ACTION_PPPOE_PUSH; entry->pppoe.sid = other_tuple->encap[i].id; break; case htons(ETH_P_8021Q): entry->id = FLOW_ACTION_VLAN_PUSH; entry->vlan.vid = other_tuple->encap[i].id; entry->vlan.proto = other_tuple->encap[i].proto; break; } } return 0; } int nf_flow_rule_route_ipv4(struct net *net, struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { if (nf_flow_rule_route_common(net, flow, dir, flow_rule) < 0) return -1; if (test_bit(NF_FLOW_SNAT, &flow->flags)) { flow_offload_ipv4_snat(net, flow, dir, flow_rule); flow_offload_port_snat(net, flow, dir, flow_rule); } if (test_bit(NF_FLOW_DNAT, &flow->flags)) { flow_offload_ipv4_dnat(net, flow, dir, flow_rule); flow_offload_port_dnat(net, flow, dir, flow_rule); } if (test_bit(NF_FLOW_SNAT, &flow->flags) || test_bit(NF_FLOW_DNAT, &flow->flags)) flow_offload_ipv4_checksum(net, flow, flow_rule); flow_offload_redirect(net, flow, dir, flow_rule); return 0; } EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4); int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { if (nf_flow_rule_route_common(net, flow, dir, flow_rule) < 0) return -1; if (test_bit(NF_FLOW_SNAT, &flow->flags)) { flow_offload_ipv6_snat(net, flow, dir, flow_rule); flow_offload_port_snat(net, flow, dir, flow_rule); } if (test_bit(NF_FLOW_DNAT, &flow->flags)) { flow_offload_ipv6_dnat(net, flow, dir, flow_rule); flow_offload_port_dnat(net, flow, dir, flow_rule); } flow_offload_redirect(net, flow, dir, flow_rule); return 0; } EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv6); #define NF_FLOW_RULE_ACTION_MAX 16 static struct nf_flow_rule * nf_flow_offload_rule_alloc(struct net *net, const struct flow_offload_work *offload, enum flow_offload_tuple_dir dir) { const struct nf_flowtable *flowtable = offload->flowtable; const struct flow_offload_tuple *tuple, *other_tuple; struct flow_offload *flow = offload->flow; struct dst_entry *other_dst = NULL; struct nf_flow_rule *flow_rule; int err = -ENOMEM; flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL); if (!flow_rule) goto err_flow; flow_rule->rule = flow_rule_alloc(NF_FLOW_RULE_ACTION_MAX); if (!flow_rule->rule) goto err_flow_rule; flow_rule->rule->match.dissector = &flow_rule->match.dissector; flow_rule->rule->match.mask = &flow_rule->match.mask; flow_rule->rule->match.key = &flow_rule->match.key; tuple = &flow->tuplehash[dir].tuple; other_tuple = &flow->tuplehash[!dir].tuple; if (other_tuple->xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) other_dst = other_tuple->dst_cache; err = nf_flow_rule_match(&flow_rule->match, tuple, other_dst); if (err < 0) goto err_flow_match; flow_rule->rule->action.num_entries = 0; if (flowtable->type->action(net, flow, dir, flow_rule) < 0) goto err_flow_match; return flow_rule; err_flow_match: kfree(flow_rule->rule); err_flow_rule: kfree(flow_rule); err_flow: return NULL; } static void __nf_flow_offload_destroy(struct nf_flow_rule *flow_rule) { struct flow_action_entry *entry; int i; for (i = 0; i < flow_rule->rule->action.num_entries; i++) { entry = &flow_rule->rule->action.entries[i]; if (entry->id != FLOW_ACTION_REDIRECT) continue; dev_put(entry->dev); } kfree(flow_rule->rule); kfree(flow_rule); } static void nf_flow_offload_destroy(struct nf_flow_rule *flow_rule[]) { int i; for (i = 0; i < FLOW_OFFLOAD_DIR_MAX; i++) __nf_flow_offload_destroy(flow_rule[i]); } static int nf_flow_offload_alloc(const struct flow_offload_work *offload, struct nf_flow_rule *flow_rule[]) { struct net *net = read_pnet(&offload->flowtable->net); flow_rule[0] = nf_flow_offload_rule_alloc(net, offload, FLOW_OFFLOAD_DIR_ORIGINAL); if (!flow_rule[0]) return -ENOMEM; flow_rule[1] = nf_flow_offload_rule_alloc(net, offload, FLOW_OFFLOAD_DIR_REPLY); if (!flow_rule[1]) { __nf_flow_offload_destroy(flow_rule[0]); return -ENOMEM; } return 0; } static void nf_flow_offload_init(struct flow_cls_offload *cls_flow, __be16 proto, int priority, enum flow_cls_command cmd, const struct flow_offload_tuple *tuple, struct netlink_ext_ack *extack) { cls_flow->common.protocol = proto; cls_flow->common.prio = priority; cls_flow->common.extack = extack; cls_flow->command = cmd; cls_flow->cookie = (unsigned long)tuple; } static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, struct flow_offload *flow, struct nf_flow_rule *flow_rule, enum flow_offload_tuple_dir dir, int priority, int cmd, struct flow_stats *stats, struct list_head *block_cb_list) { struct flow_cls_offload cls_flow = {}; struct netlink_ext_ack extack = {}; struct flow_block_cb *block_cb; __be16 proto = ETH_P_ALL; int err, i = 0; nf_flow_offload_init(&cls_flow, proto, priority, cmd, &flow->tuplehash[dir].tuple, &extack); if (cmd == FLOW_CLS_REPLACE) cls_flow.rule = flow_rule->rule; down_read(&flowtable->flow_block_lock); list_for_each_entry(block_cb, block_cb_list, list) { err = block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv); if (err < 0) continue; i++; } up_read(&flowtable->flow_block_lock); if (cmd == FLOW_CLS_STATS) memcpy(stats, &cls_flow.stats, sizeof(*stats)); return i; } static int flow_offload_tuple_add(struct flow_offload_work *offload, struct nf_flow_rule *flow_rule, enum flow_offload_tuple_dir dir) { return nf_flow_offload_tuple(offload->flowtable, offload->flow, flow_rule, dir, offload->flowtable->priority, FLOW_CLS_REPLACE, NULL, &offload->flowtable->flow_block.cb_list); } static void flow_offload_tuple_del(struct flow_offload_work *offload, enum flow_offload_tuple_dir dir) { nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir, offload->flowtable->priority, FLOW_CLS_DESTROY, NULL, &offload->flowtable->flow_block.cb_list); } static int flow_offload_rule_add(struct flow_offload_work *offload, struct nf_flow_rule *flow_rule[]) { int ok_count = 0; ok_count += flow_offload_tuple_add(offload, flow_rule[0], FLOW_OFFLOAD_DIR_ORIGINAL); if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) ok_count += flow_offload_tuple_add(offload, flow_rule[1], FLOW_OFFLOAD_DIR_REPLY); if (ok_count == 0) return -ENOENT; return 0; } static void flow_offload_work_add(struct flow_offload_work *offload) { struct nf_flow_rule *flow_rule[FLOW_OFFLOAD_DIR_MAX]; int err; err = nf_flow_offload_alloc(offload, flow_rule); if (err < 0) return; err = flow_offload_rule_add(offload, flow_rule); if (err < 0) goto out; set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); out: nf_flow_offload_destroy(flow_rule); } static void flow_offload_work_del(struct flow_offload_work *offload) { clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); } static void flow_offload_tuple_stats(struct flow_offload_work *offload, enum flow_offload_tuple_dir dir, struct flow_stats *stats) { nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir, offload->flowtable->priority, FLOW_CLS_STATS, stats, &offload->flowtable->flow_block.cb_list); } static void flow_offload_work_stats(struct flow_offload_work *offload) { struct flow_stats stats[FLOW_OFFLOAD_DIR_MAX] = {}; u64 lastused; flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]); if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, &stats[1]); lastused = max_t(u64, stats[0].lastused, stats[1].lastused); offload->flow->timeout = max_t(u64, offload->flow->timeout, lastused + flow_offload_get_timeout(offload->flow)); if (offload->flowtable->flags & NF_FLOWTABLE_COUNTER) { if (stats[0].pkts) nf_ct_acct_add(offload->flow->ct, FLOW_OFFLOAD_DIR_ORIGINAL, stats[0].pkts, stats[0].bytes); if (stats[1].pkts) nf_ct_acct_add(offload->flow->ct, FLOW_OFFLOAD_DIR_REPLY, stats[1].pkts, stats[1].bytes); } } static void flow_offload_work_handler(struct work_struct *work) { struct flow_offload_work *offload; struct net *net; offload = container_of(work, struct flow_offload_work, work); net = read_pnet(&offload->flowtable->net); switch (offload->cmd) { case FLOW_CLS_REPLACE: flow_offload_work_add(offload); NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_add); break; case FLOW_CLS_DESTROY: flow_offload_work_del(offload); NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_del); break; case FLOW_CLS_STATS: flow_offload_work_stats(offload); NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_stats); break; default: WARN_ON_ONCE(1); } clear_bit(NF_FLOW_HW_PENDING, &offload->flow->flags); kfree(offload); } static void flow_offload_queue_work(struct flow_offload_work *offload) { struct net *net = read_pnet(&offload->flowtable->net); if (offload->cmd == FLOW_CLS_REPLACE) { NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count_wq_add); queue_work(nf_flow_offload_add_wq, &offload->work); } else if (offload->cmd == FLOW_CLS_DESTROY) { NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count_wq_del); queue_work(nf_flow_offload_del_wq, &offload->work); } else { NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count_wq_stats); queue_work(nf_flow_offload_stats_wq, &offload->work); } } static struct flow_offload_work * nf_flow_offload_work_alloc(struct nf_flowtable *flowtable, struct flow_offload *flow, unsigned int cmd) { struct flow_offload_work *offload; if (test_and_set_bit(NF_FLOW_HW_PENDING, &flow->flags)) return NULL; offload = kmalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); if (!offload) { clear_bit(NF_FLOW_HW_PENDING, &flow->flags); return NULL; } offload->cmd = cmd; offload->flow = flow; offload->flowtable = flowtable; INIT_WORK(&offload->work, flow_offload_work_handler); return offload; } void nf_flow_offload_add(struct nf_flowtable *flowtable, struct flow_offload *flow) { struct flow_offload_work *offload; offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_REPLACE); if (!offload) return; flow_offload_queue_work(offload); } void nf_flow_offload_del(struct nf_flowtable *flowtable, struct flow_offload *flow) { struct flow_offload_work *offload; offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_DESTROY); if (!offload) return; set_bit(NF_FLOW_HW_DYING, &flow->flags); flow_offload_queue_work(offload); } void nf_flow_offload_stats(struct nf_flowtable *flowtable, struct flow_offload *flow) { struct flow_offload_work *offload; __s32 delta; delta = nf_flow_timeout_delta(flow->timeout); if ((delta >= (9 * flow_offload_get_timeout(flow)) / 10)) return; offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_STATS); if (!offload) return; flow_offload_queue_work(offload); } void nf_flow_table_offload_flush_cleanup(struct nf_flowtable *flowtable) { if (nf_flowtable_hw_offload(flowtable)) { flush_workqueue(nf_flow_offload_del_wq); nf_flow_table_gc_run(flowtable); } } void nf_flow_table_offload_flush(struct nf_flowtable *flowtable) { if (nf_flowtable_hw_offload(flowtable)) { flush_workqueue(nf_flow_offload_add_wq); flush_workqueue(nf_flow_offload_del_wq); flush_workqueue(nf_flow_offload_stats_wq); } } static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, struct flow_block_offload *bo, enum flow_block_command cmd) { struct flow_block_cb *block_cb, *next; int err = 0; down_write(&flowtable->flow_block_lock); switch (cmd) { case FLOW_BLOCK_BIND: list_splice(&bo->cb_list, &flowtable->flow_block.cb_list); break; case FLOW_BLOCK_UNBIND: list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { list_del(&block_cb->list); flow_block_cb_free(block_cb); } break; default: WARN_ON_ONCE(1); err = -EOPNOTSUPP; } up_write(&flowtable->flow_block_lock); return err; } static void nf_flow_table_block_offload_init(struct flow_block_offload *bo, struct net *net, enum flow_block_command cmd, struct nf_flowtable *flowtable, struct netlink_ext_ack *extack) { memset(bo, 0, sizeof(*bo)); bo->net = net; bo->block = &flowtable->flow_block; bo->command = cmd; bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; bo->extack = extack; bo->cb_list_head = &flowtable->flow_block.cb_list; INIT_LIST_HEAD(&bo->cb_list); } static void nf_flow_table_indr_cleanup(struct flow_block_cb *block_cb) { struct nf_flowtable *flowtable = block_cb->indr.data; struct net_device *dev = block_cb->indr.dev; nf_flow_table_gc_cleanup(flowtable, dev); down_write(&flowtable->flow_block_lock); list_del(&block_cb->list); list_del(&block_cb->driver_list); flow_block_cb_free(block_cb); up_write(&flowtable->flow_block_lock); } static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo, struct nf_flowtable *flowtable, struct net_device *dev, enum flow_block_command cmd, struct netlink_ext_ack *extack) { nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable, extack); return flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_FT, flowtable, bo, nf_flow_table_indr_cleanup); } static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, struct nf_flowtable *flowtable, struct net_device *dev, enum flow_block_command cmd, struct netlink_ext_ack *extack) { int err; nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable, extack); down_write(&flowtable->flow_block_lock); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo); up_write(&flowtable->flow_block_lock); if (err < 0) return err; return 0; } int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; int err; if (!nf_flowtable_hw_offload(flowtable)) return nf_flow_offload_xdp_setup(flowtable, dev, cmd); if (dev->netdev_ops->ndo_setup_tc) err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, &extack); else err = nf_flow_table_indr_offload_cmd(&bo, flowtable, dev, cmd, &extack); if (err < 0) return err; return nf_flow_table_block_setup(flowtable, &bo, cmd); } EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup); int nf_flow_table_offload_init(void) { nf_flow_offload_add_wq = alloc_workqueue("nf_ft_offload_add", WQ_UNBOUND | WQ_SYSFS, 0); if (!nf_flow_offload_add_wq) return -ENOMEM; nf_flow_offload_del_wq = alloc_workqueue("nf_ft_offload_del", WQ_UNBOUND | WQ_SYSFS, 0); if (!nf_flow_offload_del_wq) goto err_del_wq; nf_flow_offload_stats_wq = alloc_workqueue("nf_ft_offload_stats", WQ_UNBOUND | WQ_SYSFS, 0); if (!nf_flow_offload_stats_wq) goto err_stats_wq; return 0; err_stats_wq: destroy_workqueue(nf_flow_offload_del_wq); err_del_wq: destroy_workqueue(nf_flow_offload_add_wq); return -ENOMEM; } void nf_flow_table_offload_exit(void) { destroy_workqueue(nf_flow_offload_add_wq); destroy_workqueue(nf_flow_offload_del_wq); destroy_workqueue(nf_flow_offload_stats_wq); }
9 9 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 // SPDX-License-Identifier: GPL-2.0 /* * Neil Brown <neilb@cse.unsw.edu.au> * J. Bruce Fields <bfields@umich.edu> * Andy Adamson <andros@umich.edu> * Dug Song <dugsong@monkey.org> * * RPCSEC_GSS server authentication. * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078 * (gssapi) * * The RPCSEC_GSS involves three stages: * 1/ context creation * 2/ data exchange * 3/ context destruction * * Context creation is handled largely by upcalls to user-space. * In particular, GSS_Accept_sec_context is handled by an upcall * Data exchange is handled entirely within the kernel * In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel. * Context destruction is handled in-kernel * GSS_Delete_sec_context is in-kernel * * Context creation is initiated by a RPCSEC_GSS_INIT request arriving. * The context handle and gss_token are used as a key into the rpcsec_init cache. * The content of this cache includes some of the outputs of GSS_Accept_sec_context, * being major_status, minor_status, context_handle, reply_token. * These are sent back to the client. * Sequence window management is handled by the kernel. The window size if currently * a compile time constant. * * When user-space is happy that a context is established, it places an entry * in the rpcsec_context cache. The key for this cache is the context_handle. * The content includes: * uid/gidlist - for determining access rights * mechanism type * mechanism specific information, such as a key * */ #include <linux/slab.h> #include <linux/types.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/user_namespace.h> #include <linux/sunrpc/auth_gss.h> #include <linux/sunrpc/gss_err.h> #include <linux/sunrpc/svcauth.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/cache.h> #include <linux/sunrpc/gss_krb5.h> #include <trace/events/rpcgss.h> #include "gss_rpc_upcall.h" /* * Unfortunately there isn't a maximum checksum size exported via the * GSS API. Manufacture one based on GSS mechanisms supported by this * implementation. */ #define GSS_MAX_CKSUMSIZE (GSS_KRB5_TOK_HDR_LEN + GSS_KRB5_MAX_CKSUM_LEN) /* * This value may be increased in the future to accommodate other * usage of the scratch buffer. */ #define GSS_SCRATCH_SIZE GSS_MAX_CKSUMSIZE struct gss_svc_data { /* decoded gss client cred: */ struct rpc_gss_wire_cred clcred; u32 gsd_databody_offset; struct rsc *rsci; /* for temporary results */ __be32 gsd_seq_num; u8 gsd_scratch[GSS_SCRATCH_SIZE]; }; /* The rpcsec_init cache is used for mapping RPCSEC_GSS_{,CONT_}INIT requests * into replies. * * Key is context handle (\x if empty) and gss_token. * Content is major_status minor_status (integers) context_handle, reply_token. * */ static int netobj_equal(struct xdr_netobj *a, struct xdr_netobj *b) { return a->len == b->len && 0 == memcmp(a->data, b->data, a->len); } #define RSI_HASHBITS 6 #define RSI_HASHMAX (1<<RSI_HASHBITS) struct rsi { struct cache_head h; struct xdr_netobj in_handle, in_token; struct xdr_netobj out_handle, out_token; int major_status, minor_status; struct rcu_head rcu_head; }; static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct rsi *old); static struct rsi *rsi_lookup(struct cache_detail *cd, struct rsi *item); static void rsi_free(struct rsi *rsii) { kfree(rsii->in_handle.data); kfree(rsii->in_token.data); kfree(rsii->out_handle.data); kfree(rsii->out_token.data); } static void rsi_free_rcu(struct rcu_head *head) { struct rsi *rsii = container_of(head, struct rsi, rcu_head); rsi_free(rsii); kfree(rsii); } static void rsi_put(struct kref *ref) { struct rsi *rsii = container_of(ref, struct rsi, h.ref); call_rcu(&rsii->rcu_head, rsi_free_rcu); } static inline int rsi_hash(struct rsi *item) { return hash_mem(item->in_handle.data, item->in_handle.len, RSI_HASHBITS) ^ hash_mem(item->in_token.data, item->in_token.len, RSI_HASHBITS); } static int rsi_match(struct cache_head *a, struct cache_head *b) { struct rsi *item = container_of(a, struct rsi, h); struct rsi *tmp = container_of(b, struct rsi, h); return netobj_equal(&item->in_handle, &tmp->in_handle) && netobj_equal(&item->in_token, &tmp->in_token); } static int dup_to_netobj(struct xdr_netobj *dst, char *src, int len) { dst->len = len; dst->data = (len ? kmemdup(src, len, GFP_KERNEL) : NULL); if (len && !dst->data) return -ENOMEM; return 0; } static inline int dup_netobj(struct xdr_netobj *dst, struct xdr_netobj *src) { return dup_to_netobj(dst, src->data, src->len); } static void rsi_init(struct cache_head *cnew, struct cache_head *citem) { struct rsi *new = container_of(cnew, struct rsi, h); struct rsi *item = container_of(citem, struct rsi, h); new->out_handle.data = NULL; new->out_handle.len = 0; new->out_token.data = NULL; new->out_token.len = 0; new->in_handle.len = item->in_handle.len; item->in_handle.len = 0; new->in_token.len = item->in_token.len; item->in_token.len = 0; new->in_handle.data = item->in_handle.data; item->in_handle.data = NULL; new->in_token.data = item->in_token.data; item->in_token.data = NULL; } static void update_rsi(struct cache_head *cnew, struct cache_head *citem) { struct rsi *new = container_of(cnew, struct rsi, h); struct rsi *item = container_of(citem, struct rsi, h); BUG_ON(new->out_handle.data || new->out_token.data); new->out_handle.len = item->out_handle.len; item->out_handle.len = 0; new->out_token.len = item->out_token.len; item->out_token.len = 0; new->out_handle.data = item->out_handle.data; item->out_handle.data = NULL; new->out_token.data = item->out_token.data; item->out_token.data = NULL; new->major_status = item->major_status; new->minor_status = item->minor_status; } static struct cache_head *rsi_alloc(void) { struct rsi *rsii = kmalloc(sizeof(*rsii), GFP_KERNEL); if (rsii) return &rsii->h; else return NULL; } static int rsi_upcall(struct cache_detail *cd, struct cache_head *h) { return sunrpc_cache_pipe_upcall_timeout(cd, h); } static void rsi_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) { struct rsi *rsii = container_of(h, struct rsi, h); qword_addhex(bpp, blen, rsii->in_handle.data, rsii->in_handle.len); qword_addhex(bpp, blen, rsii->in_token.data, rsii->in_token.len); (*bpp)[-1] = '\n'; WARN_ONCE(*blen < 0, "RPCSEC/GSS credential too large - please use gssproxy\n"); } static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen) { /* context token expiry major minor context token */ char *buf = mesg; char *ep; int len; struct rsi rsii, *rsip = NULL; time64_t expiry; int status = -EINVAL; memset(&rsii, 0, sizeof(rsii)); /* handle */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsii.in_handle, buf, len)) goto out; /* token */ len = qword_get(&mesg, buf, mlen); status = -EINVAL; if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsii.in_token, buf, len)) goto out; rsip = rsi_lookup(cd, &rsii); if (!rsip) goto out; rsii.h.flags = 0; /* expiry */ status = get_expiry(&mesg, &expiry); if (status) goto out; status = -EINVAL; /* major/minor */ len = qword_get(&mesg, buf, mlen); if (len <= 0) goto out; rsii.major_status = simple_strtoul(buf, &ep, 10); if (*ep) goto out; len = qword_get(&mesg, buf, mlen); if (len <= 0) goto out; rsii.minor_status = simple_strtoul(buf, &ep, 10); if (*ep) goto out; /* out_handle */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsii.out_handle, buf, len)) goto out; /* out_token */ len = qword_get(&mesg, buf, mlen); status = -EINVAL; if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsii.out_token, buf, len)) goto out; rsii.h.expiry_time = expiry; rsip = rsi_update(cd, &rsii, rsip); status = 0; out: rsi_free(&rsii); if (rsip) cache_put(&rsip->h, cd); else status = -ENOMEM; return status; } static const struct cache_detail rsi_cache_template = { .owner = THIS_MODULE, .hash_size = RSI_HASHMAX, .name = "auth.rpcsec.init", .cache_put = rsi_put, .cache_upcall = rsi_upcall, .cache_request = rsi_request, .cache_parse = rsi_parse, .match = rsi_match, .init = rsi_init, .update = update_rsi, .alloc = rsi_alloc, }; static struct rsi *rsi_lookup(struct cache_detail *cd, struct rsi *item) { struct cache_head *ch; int hash = rsi_hash(item); ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); if (ch) return container_of(ch, struct rsi, h); else return NULL; } static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct rsi *old) { struct cache_head *ch; int hash = rsi_hash(new); ch = sunrpc_cache_update(cd, &new->h, &old->h, hash); if (ch) return container_of(ch, struct rsi, h); else return NULL; } /* * The rpcsec_context cache is used to store a context that is * used in data exchange. * The key is a context handle. The content is: * uid, gidlist, mechanism, service-set, mech-specific-data */ #define RSC_HASHBITS 10 #define RSC_HASHMAX (1<<RSC_HASHBITS) #define GSS_SEQ_WIN 128 struct gss_svc_seq_data { /* highest seq number seen so far: */ u32 sd_max; /* for i such that sd_max-GSS_SEQ_WIN < i <= sd_max, the i-th bit of * sd_win is nonzero iff sequence number i has been seen already: */ unsigned long sd_win[GSS_SEQ_WIN/BITS_PER_LONG]; spinlock_t sd_lock; }; struct rsc { struct cache_head h; struct xdr_netobj handle; struct svc_cred cred; struct gss_svc_seq_data seqdata; struct gss_ctx *mechctx; struct rcu_head rcu_head; }; static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old); static struct rsc *rsc_lookup(struct cache_detail *cd, struct rsc *item); static void rsc_free(struct rsc *rsci) { kfree(rsci->handle.data); if (rsci->mechctx) gss_delete_sec_context(&rsci->mechctx); free_svc_cred(&rsci->cred); } static void rsc_free_rcu(struct rcu_head *head) { struct rsc *rsci = container_of(head, struct rsc, rcu_head); kfree(rsci->handle.data); kfree(rsci); } static void rsc_put(struct kref *ref) { struct rsc *rsci = container_of(ref, struct rsc, h.ref); if (rsci->mechctx) gss_delete_sec_context(&rsci->mechctx); free_svc_cred(&rsci->cred); call_rcu(&rsci->rcu_head, rsc_free_rcu); } static inline int rsc_hash(struct rsc *rsci) { return hash_mem(rsci->handle.data, rsci->handle.len, RSC_HASHBITS); } static int rsc_match(struct cache_head *a, struct cache_head *b) { struct rsc *new = container_of(a, struct rsc, h); struct rsc *tmp = container_of(b, struct rsc, h); return netobj_equal(&new->handle, &tmp->handle); } static void rsc_init(struct cache_head *cnew, struct cache_head *ctmp) { struct rsc *new = container_of(cnew, struct rsc, h); struct rsc *tmp = container_of(ctmp, struct rsc, h); new->handle.len = tmp->handle.len; tmp->handle.len = 0; new->handle.data = tmp->handle.data; tmp->handle.data = NULL; new->mechctx = NULL; init_svc_cred(&new->cred); } static void update_rsc(struct cache_head *cnew, struct cache_head *ctmp) { struct rsc *new = container_of(cnew, struct rsc, h); struct rsc *tmp = container_of(ctmp, struct rsc, h); new->mechctx = tmp->mechctx; tmp->mechctx = NULL; memset(&new->seqdata, 0, sizeof(new->seqdata)); spin_lock_init(&new->seqdata.sd_lock); new->cred = tmp->cred; init_svc_cred(&tmp->cred); } static struct cache_head * rsc_alloc(void) { struct rsc *rsci = kmalloc(sizeof(*rsci), GFP_KERNEL); if (rsci) return &rsci->h; else return NULL; } static int rsc_upcall(struct cache_detail *cd, struct cache_head *h) { return -EINVAL; } static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) { /* contexthandle expiry [ uid gid N <n gids> mechname ...mechdata... ] */ char *buf = mesg; int id; int len, rv; struct rsc rsci, *rscp = NULL; time64_t expiry; int status = -EINVAL; struct gss_api_mech *gm = NULL; memset(&rsci, 0, sizeof(rsci)); /* context handle */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsci.handle, buf, len)) goto out; rsci.h.flags = 0; /* expiry */ status = get_expiry(&mesg, &expiry); if (status) goto out; status = -EINVAL; rscp = rsc_lookup(cd, &rsci); if (!rscp) goto out; /* uid, or NEGATIVE */ rv = get_int(&mesg, &id); if (rv == -EINVAL) goto out; if (rv == -ENOENT) set_bit(CACHE_NEGATIVE, &rsci.h.flags); else { int N, i; /* * NOTE: we skip uid_valid()/gid_valid() checks here: * instead, * -1 id's are later mapped to the * (export-specific) anonymous id by nfsd_setuser. * * (But supplementary gid's get no such special * treatment so are checked for validity here.) */ /* uid */ rsci.cred.cr_uid = make_kuid(current_user_ns(), id); /* gid */ if (get_int(&mesg, &id)) goto out; rsci.cred.cr_gid = make_kgid(current_user_ns(), id); /* number of additional gid's */ if (get_int(&mesg, &N)) goto out; if (N < 0 || N > NGROUPS_MAX) goto out; status = -ENOMEM; rsci.cred.cr_group_info = groups_alloc(N); if (rsci.cred.cr_group_info == NULL) goto out; /* gid's */ status = -EINVAL; for (i=0; i<N; i++) { kgid_t kgid; if (get_int(&mesg, &id)) goto out; kgid = make_kgid(current_user_ns(), id); if (!gid_valid(kgid)) goto out; rsci.cred.cr_group_info->gid[i] = kgid; } groups_sort(rsci.cred.cr_group_info); /* mech name */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; gm = rsci.cred.cr_gss_mech = gss_mech_get_by_name(buf); status = -EOPNOTSUPP; if (!gm) goto out; status = -EINVAL; /* mech-specific data: */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; status = gss_import_sec_context(buf, len, gm, &rsci.mechctx, NULL, GFP_KERNEL); if (status) goto out; /* get client name */ len = qword_get(&mesg, buf, mlen); if (len > 0) { rsci.cred.cr_principal = kstrdup(buf, GFP_KERNEL); if (!rsci.cred.cr_principal) { status = -ENOMEM; goto out; } } } rsci.h.expiry_time = expiry; rscp = rsc_update(cd, &rsci, rscp); status = 0; out: rsc_free(&rsci); if (rscp) cache_put(&rscp->h, cd); else status = -ENOMEM; return status; } static const struct cache_detail rsc_cache_template = { .owner = THIS_MODULE, .hash_size = RSC_HASHMAX, .name = "auth.rpcsec.context", .cache_put = rsc_put, .cache_upcall = rsc_upcall, .cache_parse = rsc_parse, .match = rsc_match, .init = rsc_init, .update = update_rsc, .alloc = rsc_alloc, }; static struct rsc *rsc_lookup(struct cache_detail *cd, struct rsc *item) { struct cache_head *ch; int hash = rsc_hash(item); ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); if (ch) return container_of(ch, struct rsc, h); else return NULL; } static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old) { struct cache_head *ch; int hash = rsc_hash(new); ch = sunrpc_cache_update(cd, &new->h, &old->h, hash); if (ch) return container_of(ch, struct rsc, h); else return NULL; } static struct rsc * gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle) { struct rsc rsci; struct rsc *found; memset(&rsci, 0, sizeof(rsci)); if (dup_to_netobj(&rsci.handle, handle->data, handle->len)) return NULL; found = rsc_lookup(cd, &rsci); rsc_free(&rsci); if (!found) return NULL; if (cache_check(cd, &found->h, NULL)) return NULL; return found; } /** * gss_check_seq_num - GSS sequence number window check * @rqstp: RPC Call to use when reporting errors * @rsci: cached GSS context state (updated on return) * @seq_num: sequence number to check * * Implements sequence number algorithm as specified in * RFC 2203, Section 5.3.3.1. "Context Management". * * Return values: * %true: @rqstp's GSS sequence number is inside the window * %false: @rqstp's GSS sequence number is outside the window */ static bool gss_check_seq_num(const struct svc_rqst *rqstp, struct rsc *rsci, u32 seq_num) { struct gss_svc_seq_data *sd = &rsci->seqdata; bool result = false; spin_lock(&sd->sd_lock); if (seq_num > sd->sd_max) { if (seq_num >= sd->sd_max + GSS_SEQ_WIN) { memset(sd->sd_win, 0, sizeof(sd->sd_win)); sd->sd_max = seq_num; } else while (sd->sd_max < seq_num) { sd->sd_max++; __clear_bit(sd->sd_max % GSS_SEQ_WIN, sd->sd_win); } __set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win); goto ok; } else if (seq_num + GSS_SEQ_WIN <= sd->sd_max) { goto toolow; } if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win)) goto alreadyseen; ok: result = true; out: spin_unlock(&sd->sd_lock); return result; toolow: trace_rpcgss_svc_seqno_low(rqstp, seq_num, sd->sd_max - GSS_SEQ_WIN, sd->sd_max); goto out; alreadyseen: trace_rpcgss_svc_seqno_seen(rqstp, seq_num); goto out; } /* * Decode and verify a Call's verifier field. For RPC_AUTH_GSS Calls, * the body of this field contains a variable length checksum. * * GSS-specific auth_stat values are mandated by RFC 2203 Section * 5.3.3.3. */ static int svcauth_gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, __be32 *rpcstart, struct rpc_gss_wire_cred *gc) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct gss_ctx *ctx_id = rsci->mechctx; u32 flavor, maj_stat; struct xdr_buf rpchdr; struct xdr_netobj checksum; struct kvec iov; /* * Compute the checksum of the incoming Call from the * XID field to credential field: */ iov.iov_base = rpcstart; iov.iov_len = (u8 *)xdr->p - (u8 *)rpcstart; xdr_buf_from_iov(&iov, &rpchdr); /* Call's verf field: */ if (xdr_stream_decode_opaque_auth(xdr, &flavor, (void **)&checksum.data, &checksum.len) < 0) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } if (flavor != RPC_AUTH_GSS) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } if (rqstp->rq_deferred) return SVC_OK; maj_stat = gss_verify_mic(ctx_id, &rpchdr, &checksum); if (maj_stat != GSS_S_COMPLETE) { trace_rpcgss_svc_mic(rqstp, maj_stat); rqstp->rq_auth_stat = rpcsec_gsserr_credproblem; return SVC_DENIED; } if (gc->gc_seq > MAXSEQ) { trace_rpcgss_svc_seqno_large(rqstp, gc->gc_seq); rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem; return SVC_DENIED; } if (!gss_check_seq_num(rqstp, rsci, gc->gc_seq)) return SVC_DROP; return SVC_OK; } /* * Construct and encode a Reply's verifier field. The verifier's body * field contains a variable-length checksum of the GSS sequence * number. */ static bool svcauth_gss_encode_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq) { struct gss_svc_data *gsd = rqstp->rq_auth_data; u32 maj_stat; struct xdr_buf verf_data; struct xdr_netobj checksum; struct kvec iov; gsd->gsd_seq_num = cpu_to_be32(seq); iov.iov_base = &gsd->gsd_seq_num; iov.iov_len = XDR_UNIT; xdr_buf_from_iov(&iov, &verf_data); checksum.data = gsd->gsd_scratch; maj_stat = gss_get_mic(ctx_id, &verf_data, &checksum); if (maj_stat != GSS_S_COMPLETE) goto bad_mic; return xdr_stream_encode_opaque_auth(&rqstp->rq_res_stream, RPC_AUTH_GSS, checksum.data, checksum.len) > 0; bad_mic: trace_rpcgss_svc_get_mic(rqstp, maj_stat); return false; } struct gss_domain { struct auth_domain h; u32 pseudoflavor; }; static struct auth_domain * find_gss_auth_domain(struct gss_ctx *ctx, u32 svc) { char *name; name = gss_service_to_auth_domain_name(ctx->mech_type, svc); if (!name) return NULL; return auth_domain_find(name); } static struct auth_ops svcauthops_gss; u32 svcauth_gss_flavor(struct auth_domain *dom) { struct gss_domain *gd = container_of(dom, struct gss_domain, h); return gd->pseudoflavor; } EXPORT_SYMBOL_GPL(svcauth_gss_flavor); struct auth_domain * svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) { struct gss_domain *new; struct auth_domain *test; int stat = -ENOMEM; new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) goto out; kref_init(&new->h.ref); new->h.name = kstrdup(name, GFP_KERNEL); if (!new->h.name) goto out_free_dom; new->h.flavour = &svcauthops_gss; new->pseudoflavor = pseudoflavor; test = auth_domain_lookup(name, &new->h); if (test != &new->h) { pr_warn("svc: duplicate registration of gss pseudo flavour %s.\n", name); stat = -EADDRINUSE; auth_domain_put(test); goto out_free_name; } return test; out_free_name: kfree(new->h.name); out_free_dom: kfree(new); out: return ERR_PTR(stat); } EXPORT_SYMBOL_GPL(svcauth_gss_register_pseudoflavor); /* * RFC 2203, Section 5.3.2.2 * * struct rpc_gss_integ_data { * opaque databody_integ<>; * opaque checksum<>; * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; */ static noinline_for_stack int svcauth_gss_unwrap_integ(struct svc_rqst *rqstp, u32 seq, struct gss_ctx *ctx) { struct gss_svc_data *gsd = rqstp->rq_auth_data; struct xdr_stream *xdr = &rqstp->rq_arg_stream; u32 len, offset, seq_num, maj_stat; struct xdr_buf *buf = xdr->buf; struct xdr_buf databody_integ; struct xdr_netobj checksum; /* Did we already verify the signature on the original pass through? */ if (rqstp->rq_deferred) return 0; if (xdr_stream_decode_u32(xdr, &len) < 0) goto unwrap_failed; if (len & 3) goto unwrap_failed; offset = xdr_stream_pos(xdr); if (xdr_buf_subsegment(buf, &databody_integ, offset, len)) goto unwrap_failed; /* * The xdr_stream now points to the @seq_num field. The next * XDR data item is the @arg field, which contains the clear * text RPC program payload. The checksum, which follows the * @arg field, is located and decoded without updating the * xdr_stream. */ offset += len; if (xdr_decode_word(buf, offset, &checksum.len)) goto unwrap_failed; if (checksum.len > sizeof(gsd->gsd_scratch)) goto unwrap_failed; checksum.data = gsd->gsd_scratch; if (read_bytes_from_xdr_buf(buf, offset + XDR_UNIT, checksum.data, checksum.len)) goto unwrap_failed; maj_stat = gss_verify_mic(ctx, &databody_integ, &checksum); if (maj_stat != GSS_S_COMPLETE) goto bad_mic; /* The received seqno is protected by the checksum. */ if (xdr_stream_decode_u32(xdr, &seq_num) < 0) goto unwrap_failed; if (seq_num != seq) goto bad_seqno; xdr_truncate_decode(xdr, XDR_UNIT + checksum.len); return 0; unwrap_failed: trace_rpcgss_svc_unwrap_failed(rqstp); return -EINVAL; bad_seqno: trace_rpcgss_svc_seqno_bad(rqstp, seq, seq_num); return -EINVAL; bad_mic: trace_rpcgss_svc_mic(rqstp, maj_stat); return -EINVAL; } /* * RFC 2203, Section 5.3.2.3 * * struct rpc_gss_priv_data { * opaque databody_priv<> * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; */ static noinline_for_stack int svcauth_gss_unwrap_priv(struct svc_rqst *rqstp, u32 seq, struct gss_ctx *ctx) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; u32 len, maj_stat, seq_num, offset; struct xdr_buf *buf = xdr->buf; unsigned int saved_len; if (xdr_stream_decode_u32(xdr, &len) < 0) goto unwrap_failed; if (rqstp->rq_deferred) { /* Already decrypted last time through! The sequence number * check at out_seq is unnecessary but harmless: */ goto out_seq; } if (len > xdr_stream_remaining(xdr)) goto unwrap_failed; offset = xdr_stream_pos(xdr); saved_len = buf->len; maj_stat = gss_unwrap(ctx, offset, offset + len, buf); if (maj_stat != GSS_S_COMPLETE) goto bad_unwrap; xdr->nwords -= XDR_QUADLEN(saved_len - buf->len); out_seq: /* gss_unwrap() decrypted the sequence number. */ if (xdr_stream_decode_u32(xdr, &seq_num) < 0) goto unwrap_failed; if (seq_num != seq) goto bad_seqno; return 0; unwrap_failed: trace_rpcgss_svc_unwrap_failed(rqstp); return -EINVAL; bad_seqno: trace_rpcgss_svc_seqno_bad(rqstp, seq, seq_num); return -EINVAL; bad_unwrap: trace_rpcgss_svc_unwrap(rqstp, maj_stat); return -EINVAL; } static enum svc_auth_status svcauth_gss_set_client(struct svc_rqst *rqstp) { struct gss_svc_data *svcdata = rqstp->rq_auth_data; struct rsc *rsci = svcdata->rsci; struct rpc_gss_wire_cred *gc = &svcdata->clcred; int stat; rqstp->rq_auth_stat = rpc_autherr_badcred; /* * A gss export can be specified either by: * export *(sec=krb5,rw) * or by * export gss/krb5(rw) * The latter is deprecated; but for backwards compatibility reasons * the nfsd code will still fall back on trying it if the former * doesn't work; so we try to make both available to nfsd, below. */ rqstp->rq_gssclient = find_gss_auth_domain(rsci->mechctx, gc->gc_svc); if (rqstp->rq_gssclient == NULL) return SVC_DENIED; stat = svcauth_unix_set_client(rqstp); if (stat == SVC_DROP || stat == SVC_CLOSE) return stat; rqstp->rq_auth_stat = rpc_auth_ok; return SVC_OK; } static bool svcauth_gss_proc_init_verf(struct cache_detail *cd, struct svc_rqst *rqstp, struct xdr_netobj *out_handle, int *major_status, u32 seq_num) { struct xdr_stream *xdr = &rqstp->rq_res_stream; struct rsc *rsci; bool rc; if (*major_status != GSS_S_COMPLETE) goto null_verifier; rsci = gss_svc_searchbyctx(cd, out_handle); if (rsci == NULL) { *major_status = GSS_S_NO_CONTEXT; goto null_verifier; } rc = svcauth_gss_encode_verf(rqstp, rsci->mechctx, seq_num); cache_put(&rsci->h, cd); return rc; null_verifier: return xdr_stream_encode_opaque_auth(xdr, RPC_AUTH_NULL, NULL, 0) > 0; } static void gss_free_in_token_pages(struct gssp_in_token *in_token) { int i; i = 0; while (in_token->pages[i]) put_page(in_token->pages[i++]); kfree(in_token->pages); in_token->pages = NULL; } static int gss_read_proxy_verf(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc, struct xdr_netobj *in_handle, struct gssp_in_token *in_token) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; unsigned int length, pgto_offs, pgfrom_offs; int pages, i, pgto, pgfrom; size_t to_offs, from_offs; u32 inlen; if (dup_netobj(in_handle, &gc->gc_ctx)) return SVC_CLOSE; /* * RFC 2203 Section 5.2.2 * * struct rpc_gss_init_arg { * opaque gss_token<>; * }; */ if (xdr_stream_decode_u32(xdr, &inlen) < 0) goto out_denied_free; if (inlen > xdr_stream_remaining(xdr)) goto out_denied_free; pages = DIV_ROUND_UP(inlen, PAGE_SIZE); in_token->pages = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL); if (!in_token->pages) goto out_denied_free; in_token->page_base = 0; in_token->page_len = inlen; for (i = 0; i < pages; i++) { in_token->pages[i] = alloc_page(GFP_KERNEL); if (!in_token->pages[i]) { gss_free_in_token_pages(in_token); goto out_denied_free; } } length = min_t(unsigned int, inlen, (char *)xdr->end - (char *)xdr->p); memcpy(page_address(in_token->pages[0]), xdr->p, length); inlen -= length; to_offs = length; from_offs = rqstp->rq_arg.page_base; while (inlen) { pgto = to_offs >> PAGE_SHIFT; pgfrom = from_offs >> PAGE_SHIFT; pgto_offs = to_offs & ~PAGE_MASK; pgfrom_offs = from_offs & ~PAGE_MASK; length = min_t(unsigned int, inlen, min_t(unsigned int, PAGE_SIZE - pgto_offs, PAGE_SIZE - pgfrom_offs)); memcpy(page_address(in_token->pages[pgto]) + pgto_offs, page_address(rqstp->rq_arg.pages[pgfrom]) + pgfrom_offs, length); to_offs += length; from_offs += length; inlen -= length; } return 0; out_denied_free: kfree(in_handle->data); return SVC_DENIED; } /* * RFC 2203, Section 5.2.3.1. * * struct rpc_gss_init_res { * opaque handle<>; * unsigned int gss_major; * unsigned int gss_minor; * unsigned int seq_window; * opaque gss_token<>; * }; */ static bool svcxdr_encode_gss_init_res(struct xdr_stream *xdr, struct xdr_netobj *handle, struct xdr_netobj *gss_token, unsigned int major_status, unsigned int minor_status, u32 seq_num) { if (xdr_stream_encode_opaque(xdr, handle->data, handle->len) < 0) return false; if (xdr_stream_encode_u32(xdr, major_status) < 0) return false; if (xdr_stream_encode_u32(xdr, minor_status) < 0) return false; if (xdr_stream_encode_u32(xdr, seq_num) < 0) return false; if (xdr_stream_encode_opaque(xdr, gss_token->data, gss_token->len) < 0) return false; return true; } /* * Having read the cred already and found we're in the context * initiation case, read the verifier and initiate (or check the results * of) upcalls to userspace for help with context initiation. If * the upcall results are available, write the verifier and result. * Otherwise, drop the request pending an answer to the upcall. */ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct rsi *rsip, rsikey; __be32 *p; u32 len; int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); memset(&rsikey, 0, sizeof(rsikey)); if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx)) return SVC_CLOSE; /* * RFC 2203 Section 5.2.2 * * struct rpc_gss_init_arg { * opaque gss_token<>; * }; */ if (xdr_stream_decode_u32(xdr, &len) < 0) { kfree(rsikey.in_handle.data); return SVC_DENIED; } p = xdr_inline_decode(xdr, len); if (!p) { kfree(rsikey.in_handle.data); return SVC_DENIED; } rsikey.in_token.data = kmalloc(len, GFP_KERNEL); if (ZERO_OR_NULL_PTR(rsikey.in_token.data)) { kfree(rsikey.in_handle.data); return SVC_CLOSE; } memcpy(rsikey.in_token.data, p, len); rsikey.in_token.len = len; /* Perform upcall, or find upcall result: */ rsip = rsi_lookup(sn->rsi_cache, &rsikey); rsi_free(&rsikey); if (!rsip) return SVC_CLOSE; if (cache_check(sn->rsi_cache, &rsip->h, &rqstp->rq_chandle) < 0) /* No upcall result: */ return SVC_CLOSE; ret = SVC_CLOSE; if (!svcauth_gss_proc_init_verf(sn->rsc_cache, rqstp, &rsip->out_handle, &rsip->major_status, GSS_SEQ_WIN)) goto out; if (!svcxdr_set_accept_stat(rqstp)) goto out; if (!svcxdr_encode_gss_init_res(&rqstp->rq_res_stream, &rsip->out_handle, &rsip->out_token, rsip->major_status, rsip->minor_status, GSS_SEQ_WIN)) goto out; ret = SVC_COMPLETE; out: cache_put(&rsip->h, sn->rsi_cache); return ret; } static int gss_proxy_save_rsc(struct cache_detail *cd, struct gssp_upcall_data *ud, uint64_t *handle) { struct rsc rsci, *rscp = NULL; static atomic64_t ctxhctr; long long ctxh; struct gss_api_mech *gm = NULL; time64_t expiry; int status; memset(&rsci, 0, sizeof(rsci)); /* context handle */ status = -ENOMEM; /* the handle needs to be just a unique id, * use a static counter */ ctxh = atomic64_inc_return(&ctxhctr); /* make a copy for the caller */ *handle = ctxh; /* make a copy for the rsc cache */ if (dup_to_netobj(&rsci.handle, (char *)handle, sizeof(uint64_t))) goto out; rscp = rsc_lookup(cd, &rsci); if (!rscp) goto out; /* creds */ if (!ud->found_creds) { /* userspace seem buggy, we should always get at least a * mapping to nobody */ goto out; } else { struct timespec64 boot; /* steal creds */ rsci.cred = ud->creds; memset(&ud->creds, 0, sizeof(struct svc_cred)); status = -EOPNOTSUPP; /* get mech handle from OID */ gm = gss_mech_get_by_OID(&ud->mech_oid); if (!gm) goto out; rsci.cred.cr_gss_mech = gm; status = -EINVAL; /* mech-specific data: */ status = gss_import_sec_context(ud->out_handle.data, ud->out_handle.len, gm, &rsci.mechctx, &expiry, GFP_KERNEL); if (status) goto out; getboottime64(&boot); expiry -= boot.tv_sec; } rsci.h.expiry_time = expiry; rscp = rsc_update(cd, &rsci, rscp); status = 0; out: rsc_free(&rsci); if (rscp) cache_put(&rscp->h, cd); else status = -ENOMEM; return status; } static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc) { struct xdr_netobj cli_handle; struct gssp_upcall_data ud; uint64_t handle; int status; int ret; struct net *net = SVC_NET(rqstp); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); memset(&ud, 0, sizeof(ud)); ret = gss_read_proxy_verf(rqstp, gc, &ud.in_handle, &ud.in_token); if (ret) return ret; ret = SVC_CLOSE; /* Perform synchronous upcall to gss-proxy */ status = gssp_accept_sec_context_upcall(net, &ud); if (status) goto out; trace_rpcgss_svc_accept_upcall(rqstp, ud.major_status, ud.minor_status); switch (ud.major_status) { case GSS_S_CONTINUE_NEEDED: cli_handle = ud.out_handle; break; case GSS_S_COMPLETE: status = gss_proxy_save_rsc(sn->rsc_cache, &ud, &handle); if (status) goto out; cli_handle.data = (u8 *)&handle; cli_handle.len = sizeof(handle); break; default: goto out; } if (!svcauth_gss_proc_init_verf(sn->rsc_cache, rqstp, &cli_handle, &ud.major_status, GSS_SEQ_WIN)) goto out; if (!svcxdr_set_accept_stat(rqstp)) goto out; if (!svcxdr_encode_gss_init_res(&rqstp->rq_res_stream, &cli_handle, &ud.out_token, ud.major_status, ud.minor_status, GSS_SEQ_WIN)) goto out; ret = SVC_COMPLETE; out: gss_free_in_token_pages(&ud.in_token); gssp_free_upcall_data(&ud); return ret; } /* * Try to set the sn->use_gss_proxy variable to a new value. We only allow * it to be changed if it's currently undefined (-1). If it's any other value * then return -EBUSY unless the type wouldn't have changed anyway. */ static int set_gss_proxy(struct net *net, int type) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int ret; WARN_ON_ONCE(type != 0 && type != 1); ret = cmpxchg(&sn->use_gss_proxy, -1, type); if (ret != -1 && ret != type) return -EBUSY; return 0; } static bool use_gss_proxy(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); /* If use_gss_proxy is still undefined, then try to disable it */ if (sn->use_gss_proxy == -1) set_gss_proxy(net, 0); return sn->use_gss_proxy; } static noinline_for_stack int svcauth_gss_proc_init(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; u32 flavor, len; void *body; /* Call's verf field: */ if (xdr_stream_decode_opaque_auth(xdr, &flavor, &body, &len) < 0) return SVC_GARBAGE; if (flavor != RPC_AUTH_NULL || len != 0) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0) { rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } if (!use_gss_proxy(SVC_NET(rqstp))) return svcauth_gss_legacy_init(rqstp, gc); return svcauth_gss_proxy_init(rqstp, gc); } #ifdef CONFIG_PROC_FS static ssize_t write_gssp(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct net *net = pde_data(file_inode(file)); char tbuf[20]; unsigned long i; int res; if (*ppos || count > sizeof(tbuf)-1) return -EINVAL; if (copy_from_user(tbuf, buf, count)) return -EFAULT; tbuf[count] = 0; res = kstrtoul(tbuf, 0, &i); if (res) return res; if (i != 1) return -EINVAL; res = set_gssp_clnt(net); if (res) return res; res = set_gss_proxy(net, 1); if (res) return res; return count; } static ssize_t read_gssp(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct net *net = pde_data(file_inode(file)); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); unsigned long p = *ppos; char tbuf[10]; size_t len; snprintf(tbuf, sizeof(tbuf), "%d\n", sn->use_gss_proxy); len = strlen(tbuf); if (p >= len) return 0; len -= p; if (len > count) len = count; if (copy_to_user(buf, (void *)(tbuf+p), len)) return -EFAULT; *ppos += len; return len; } static const struct proc_ops use_gss_proxy_proc_ops = { .proc_open = nonseekable_open, .proc_write = write_gssp, .proc_read = read_gssp, }; static int create_use_gss_proxy_proc_entry(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct proc_dir_entry **p = &sn->use_gssp_proc; sn->use_gss_proxy = -1; *p = proc_create_data("use-gss-proxy", S_IFREG | 0600, sn->proc_net_rpc, &use_gss_proxy_proc_ops, net); if (!*p) return -ENOMEM; init_gssp_clnt(sn); return 0; } static void destroy_use_gss_proxy_proc_entry(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); if (sn->use_gssp_proc) { remove_proc_entry("use-gss-proxy", sn->proc_net_rpc); clear_gssp_clnt(sn); } } static ssize_t read_gss_krb5_enctypes(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct rpcsec_gss_oid oid = { .len = 9, .data = "\x2a\x86\x48\x86\xf7\x12\x01\x02\x02", }; struct gss_api_mech *mech; ssize_t ret; mech = gss_mech_get_by_OID(&oid); if (!mech) return 0; if (!mech->gm_upcall_enctypes) { gss_mech_put(mech); return 0; } ret = simple_read_from_buffer(buf, count, ppos, mech->gm_upcall_enctypes, strlen(mech->gm_upcall_enctypes)); gss_mech_put(mech); return ret; } static const struct proc_ops gss_krb5_enctypes_proc_ops = { .proc_open = nonseekable_open, .proc_read = read_gss_krb5_enctypes, }; static int create_krb5_enctypes_proc_entry(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); sn->gss_krb5_enctypes = proc_create_data("gss_krb5_enctypes", S_IFREG | 0444, sn->proc_net_rpc, &gss_krb5_enctypes_proc_ops, net); return sn->gss_krb5_enctypes ? 0 : -ENOMEM; } static void destroy_krb5_enctypes_proc_entry(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); if (sn->gss_krb5_enctypes) remove_proc_entry("gss_krb5_enctypes", sn->proc_net_rpc); } #else /* CONFIG_PROC_FS */ static int create_use_gss_proxy_proc_entry(struct net *net) { return 0; } static void destroy_use_gss_proxy_proc_entry(struct net *net) {} static int create_krb5_enctypes_proc_entry(struct net *net) { return 0; } static void destroy_krb5_enctypes_proc_entry(struct net *net) {} #endif /* CONFIG_PROC_FS */ /* * The Call's credential body should contain a struct rpc_gss_cred_t. * * RFC 2203 Section 5 * * struct rpc_gss_cred_t { * union switch (unsigned int version) { * case RPCSEC_GSS_VERS_1: * struct { * rpc_gss_proc_t gss_proc; * unsigned int seq_num; * rpc_gss_service_t service; * opaque handle<>; * } rpc_gss_cred_vers_1_t; * } * }; */ static bool svcauth_gss_decode_credbody(struct xdr_stream *xdr, struct rpc_gss_wire_cred *gc, __be32 **rpcstart) { ssize_t handle_len; u32 body_len; __be32 *p; p = xdr_inline_decode(xdr, XDR_UNIT); if (!p) return false; /* * start of rpc packet is 7 u32's back from here: * xid direction rpcversion prog vers proc flavour */ *rpcstart = p - 7; body_len = be32_to_cpup(p); if (body_len > RPC_MAX_AUTH_SIZE) return false; /* struct rpc_gss_cred_t */ if (xdr_stream_decode_u32(xdr, &gc->gc_v) < 0) return false; if (xdr_stream_decode_u32(xdr, &gc->gc_proc) < 0) return false; if (xdr_stream_decode_u32(xdr, &gc->gc_seq) < 0) return false; if (xdr_stream_decode_u32(xdr, &gc->gc_svc) < 0) return false; handle_len = xdr_stream_decode_opaque_inline(xdr, (void **)&gc->gc_ctx.data, body_len); if (handle_len < 0) return false; if (body_len != XDR_UNIT * 5 + xdr_align_size(handle_len)) return false; gc->gc_ctx.len = handle_len; return true; } /** * svcauth_gss_accept - Decode and validate incoming RPC_AUTH_GSS credential * @rqstp: RPC transaction * * Return values: * %SVC_OK: Success * %SVC_COMPLETE: GSS context lifetime event * %SVC_DENIED: Credential or verifier is not valid * %SVC_GARBAGE: Failed to decode credential or verifier * %SVC_CLOSE: Temporary failure * * The rqstp->rq_auth_stat field is also set (see RFCs 2203 and 5531). */ static enum svc_auth_status svcauth_gss_accept(struct svc_rqst *rqstp) { struct gss_svc_data *svcdata = rqstp->rq_auth_data; __be32 *rpcstart; struct rpc_gss_wire_cred *gc; struct rsc *rsci = NULL; int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); rqstp->rq_auth_stat = rpc_autherr_badcred; if (!svcdata) svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL); if (!svcdata) goto auth_err; rqstp->rq_auth_data = svcdata; svcdata->gsd_databody_offset = 0; svcdata->rsci = NULL; gc = &svcdata->clcred; if (!svcauth_gss_decode_credbody(&rqstp->rq_arg_stream, gc, &rpcstart)) goto auth_err; if (gc->gc_v != RPC_GSS_VERSION) goto auth_err; switch (gc->gc_proc) { case RPC_GSS_PROC_INIT: case RPC_GSS_PROC_CONTINUE_INIT: if (rqstp->rq_proc != 0) goto auth_err; return svcauth_gss_proc_init(rqstp, gc); case RPC_GSS_PROC_DESTROY: if (rqstp->rq_proc != 0) goto auth_err; fallthrough; case RPC_GSS_PROC_DATA: rqstp->rq_auth_stat = rpcsec_gsserr_credproblem; rsci = gss_svc_searchbyctx(sn->rsc_cache, &gc->gc_ctx); if (!rsci) goto auth_err; switch (svcauth_gss_verify_header(rqstp, rsci, rpcstart, gc)) { case SVC_OK: break; case SVC_DENIED: goto auth_err; case SVC_DROP: goto drop; } break; default: if (rqstp->rq_proc != 0) goto auth_err; rqstp->rq_auth_stat = rpc_autherr_rejectedcred; goto auth_err; } /* now act upon the command: */ switch (gc->gc_proc) { case RPC_GSS_PROC_DESTROY: if (!svcauth_gss_encode_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; if (!svcxdr_set_accept_stat(rqstp)) goto auth_err; /* Delete the entry from the cache_list and call cache_put */ sunrpc_cache_unhash(sn->rsc_cache, &rsci->h); goto complete; case RPC_GSS_PROC_DATA: rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem; if (!svcauth_gss_encode_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; if (!svcxdr_set_accept_stat(rqstp)) goto auth_err; svcdata->gsd_databody_offset = xdr_stream_pos(&rqstp->rq_res_stream); rqstp->rq_cred = rsci->cred; get_group_info(rsci->cred.cr_group_info); rqstp->rq_auth_stat = rpc_autherr_badcred; switch (gc->gc_svc) { case RPC_GSS_SVC_NONE: break; case RPC_GSS_SVC_INTEGRITY: /* placeholders for body length and seq. number: */ xdr_reserve_space(&rqstp->rq_res_stream, XDR_UNIT * 2); if (svcauth_gss_unwrap_integ(rqstp, gc->gc_seq, rsci->mechctx)) goto garbage_args; svcxdr_set_auth_slack(rqstp, RPC_MAX_AUTH_SIZE); break; case RPC_GSS_SVC_PRIVACY: /* placeholders for body length and seq. number: */ xdr_reserve_space(&rqstp->rq_res_stream, XDR_UNIT * 2); if (svcauth_gss_unwrap_priv(rqstp, gc->gc_seq, rsci->mechctx)) goto garbage_args; svcxdr_set_auth_slack(rqstp, RPC_MAX_AUTH_SIZE * 2); break; default: goto auth_err; } svcdata->rsci = rsci; cache_get(&rsci->h); rqstp->rq_cred.cr_flavor = gss_svc_to_pseudoflavor( rsci->mechctx->mech_type, GSS_C_QOP_DEFAULT, gc->gc_svc); ret = SVC_OK; trace_rpcgss_svc_authenticate(rqstp, gc); goto out; } garbage_args: ret = SVC_GARBAGE; goto out; auth_err: xdr_truncate_encode(&rqstp->rq_res_stream, XDR_UNIT * 2); ret = SVC_DENIED; goto out; complete: ret = SVC_COMPLETE; goto out; drop: ret = SVC_CLOSE; out: if (rsci) cache_put(&rsci->h, sn->rsc_cache); return ret; } static u32 svcauth_gss_prepare_to_wrap(struct svc_rqst *rqstp, struct gss_svc_data *gsd) { u32 offset; /* Release can be called twice, but we only wrap once. */ offset = gsd->gsd_databody_offset; gsd->gsd_databody_offset = 0; /* AUTH_ERROR replies are not wrapped. */ if (rqstp->rq_auth_stat != rpc_auth_ok) return 0; /* Also don't wrap if the accept_stat is nonzero: */ if (*rqstp->rq_accept_statp != rpc_success) return 0; return offset; } /* * RFC 2203, Section 5.3.2.2 * * struct rpc_gss_integ_data { * opaque databody_integ<>; * opaque checksum<>; * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; * * The RPC Reply message has already been XDR-encoded. rq_res_stream * is now positioned so that the checksum can be written just past * the RPC Reply message. */ static int svcauth_gss_wrap_integ(struct svc_rqst *rqstp) { struct gss_svc_data *gsd = rqstp->rq_auth_data; struct xdr_stream *xdr = &rqstp->rq_res_stream; struct rpc_gss_wire_cred *gc = &gsd->clcred; struct xdr_buf *buf = xdr->buf; struct xdr_buf databody_integ; struct xdr_netobj checksum; u32 offset, maj_stat; offset = svcauth_gss_prepare_to_wrap(rqstp, gsd); if (!offset) goto out; if (xdr_buf_subsegment(buf, &databody_integ, offset + XDR_UNIT, buf->len - offset - XDR_UNIT)) goto wrap_failed; /* Buffer space for these has already been reserved in * svcauth_gss_accept(). */ if (xdr_encode_word(buf, offset, databody_integ.len)) goto wrap_failed; if (xdr_encode_word(buf, offset + XDR_UNIT, gc->gc_seq)) goto wrap_failed; checksum.data = gsd->gsd_scratch; maj_stat = gss_get_mic(gsd->rsci->mechctx, &databody_integ, &checksum); if (maj_stat != GSS_S_COMPLETE) goto bad_mic; if (xdr_stream_encode_opaque(xdr, checksum.data, checksum.len) < 0) goto wrap_failed; xdr_commit_encode(xdr); out: return 0; bad_mic: trace_rpcgss_svc_get_mic(rqstp, maj_stat); return -EINVAL; wrap_failed: trace_rpcgss_svc_wrap_failed(rqstp); return -EINVAL; } /* * RFC 2203, Section 5.3.2.3 * * struct rpc_gss_priv_data { * opaque databody_priv<> * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; * * gss_wrap() expands the size of the RPC message payload in the * response buffer. The main purpose of svcauth_gss_wrap_priv() * is to ensure there is adequate space in the response buffer to * avoid overflow during the wrap. */ static int svcauth_gss_wrap_priv(struct svc_rqst *rqstp) { struct gss_svc_data *gsd = rqstp->rq_auth_data; struct rpc_gss_wire_cred *gc = &gsd->clcred; struct xdr_buf *buf = &rqstp->rq_res; struct kvec *head = buf->head; struct kvec *tail = buf->tail; u32 offset, pad, maj_stat; __be32 *p; offset = svcauth_gss_prepare_to_wrap(rqstp, gsd); if (!offset) return 0; /* * Buffer space for this field has already been reserved * in svcauth_gss_accept(). Note that the GSS sequence * number is encrypted along with the RPC reply payload. */ if (xdr_encode_word(buf, offset + XDR_UNIT, gc->gc_seq)) goto wrap_failed; /* * If there is currently tail data, make sure there is * room for the head, tail, and 2 * RPC_MAX_AUTH_SIZE in * the page, and move the current tail data such that * there is RPC_MAX_AUTH_SIZE slack space available in * both the head and tail. */ if (tail->iov_base) { if (tail->iov_base >= head->iov_base + PAGE_SIZE) goto wrap_failed; if (tail->iov_base < head->iov_base) goto wrap_failed; if (tail->iov_len + head->iov_len + 2 * RPC_MAX_AUTH_SIZE > PAGE_SIZE) goto wrap_failed; memmove(tail->iov_base + RPC_MAX_AUTH_SIZE, tail->iov_base, tail->iov_len); tail->iov_base += RPC_MAX_AUTH_SIZE; } /* * If there is no current tail data, make sure there is * room for the head data, and 2 * RPC_MAX_AUTH_SIZE in the * allotted page, and set up tail information such that there * is RPC_MAX_AUTH_SIZE slack space available in both the * head and tail. */ if (!tail->iov_base) { if (head->iov_len + 2 * RPC_MAX_AUTH_SIZE > PAGE_SIZE) goto wrap_failed; tail->iov_base = head->iov_base + head->iov_len + RPC_MAX_AUTH_SIZE; tail->iov_len = 0; } maj_stat = gss_wrap(gsd->rsci->mechctx, offset + XDR_UNIT, buf, buf->pages); if (maj_stat != GSS_S_COMPLETE) goto bad_wrap; /* Wrapping can change the size of databody_priv. */ if (xdr_encode_word(buf, offset, buf->len - offset - XDR_UNIT)) goto wrap_failed; pad = xdr_pad_size(buf->len - offset - XDR_UNIT); p = (__be32 *)(tail->iov_base + tail->iov_len); memset(p, 0, pad); tail->iov_len += pad; buf->len += pad; return 0; wrap_failed: trace_rpcgss_svc_wrap_failed(rqstp); return -EINVAL; bad_wrap: trace_rpcgss_svc_wrap(rqstp, maj_stat); return -ENOMEM; } /** * svcauth_gss_release - Wrap payload and release resources * @rqstp: RPC transaction context * * Return values: * %0: the Reply is ready to be sent * %-ENOMEM: failed to allocate memory * %-EINVAL: encoding error */ static int svcauth_gss_release(struct svc_rqst *rqstp) { struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); struct gss_svc_data *gsd = rqstp->rq_auth_data; struct rpc_gss_wire_cred *gc; int stat; if (!gsd) goto out; gc = &gsd->clcred; if (gc->gc_proc != RPC_GSS_PROC_DATA) goto out; switch (gc->gc_svc) { case RPC_GSS_SVC_NONE: break; case RPC_GSS_SVC_INTEGRITY: stat = svcauth_gss_wrap_integ(rqstp); if (stat) goto out_err; break; case RPC_GSS_SVC_PRIVACY: stat = svcauth_gss_wrap_priv(rqstp); if (stat) goto out_err; break; /* * For any other gc_svc value, svcauth_gss_accept() already set * the auth_error appropriately; just fall through: */ } out: stat = 0; out_err: if (rqstp->rq_client) auth_domain_put(rqstp->rq_client); rqstp->rq_client = NULL; if (rqstp->rq_gssclient) auth_domain_put(rqstp->rq_gssclient); rqstp->rq_gssclient = NULL; if (rqstp->rq_cred.cr_group_info) put_group_info(rqstp->rq_cred.cr_group_info); rqstp->rq_cred.cr_group_info = NULL; if (gsd && gsd->rsci) { cache_put(&gsd->rsci->h, sn->rsc_cache); gsd->rsci = NULL; } return stat; } static void svcauth_gss_domain_release_rcu(struct rcu_head *head) { struct auth_domain *dom = container_of(head, struct auth_domain, rcu_head); struct gss_domain *gd = container_of(dom, struct gss_domain, h); kfree(dom->name); kfree(gd); } static void svcauth_gss_domain_release(struct auth_domain *dom) { call_rcu(&dom->rcu_head, svcauth_gss_domain_release_rcu); } static rpc_authflavor_t svcauth_gss_pseudoflavor(struct svc_rqst *rqstp) { return svcauth_gss_flavor(rqstp->rq_gssclient); } static struct auth_ops svcauthops_gss = { .name = "rpcsec_gss", .owner = THIS_MODULE, .flavour = RPC_AUTH_GSS, .accept = svcauth_gss_accept, .release = svcauth_gss_release, .domain_release = svcauth_gss_domain_release, .set_client = svcauth_gss_set_client, .pseudoflavor = svcauth_gss_pseudoflavor, }; static int rsi_cache_create_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct cache_detail *cd; int err; cd = cache_create_net(&rsi_cache_template, net); if (IS_ERR(cd)) return PTR_ERR(cd); err = cache_register_net(cd, net); if (err) { cache_destroy_net(cd, net); return err; } sn->rsi_cache = cd; return 0; } static void rsi_cache_destroy_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct cache_detail *cd = sn->rsi_cache; sn->rsi_cache = NULL; cache_purge(cd); cache_unregister_net(cd, net); cache_destroy_net(cd, net); } static int rsc_cache_create_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct cache_detail *cd; int err; cd = cache_create_net(&rsc_cache_template, net); if (IS_ERR(cd)) return PTR_ERR(cd); err = cache_register_net(cd, net); if (err) { cache_destroy_net(cd, net); return err; } sn->rsc_cache = cd; return 0; } static void rsc_cache_destroy_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct cache_detail *cd = sn->rsc_cache; sn->rsc_cache = NULL; cache_purge(cd); cache_unregister_net(cd, net); cache_destroy_net(cd, net); } int gss_svc_init_net(struct net *net) { int rv; rv = rsc_cache_create_net(net); if (rv) return rv; rv = rsi_cache_create_net(net); if (rv) goto out1; rv = create_use_gss_proxy_proc_entry(net); if (rv) goto out2; rv = create_krb5_enctypes_proc_entry(net); if (rv) goto out3; return 0; out3: destroy_use_gss_proxy_proc_entry(net); out2: rsi_cache_destroy_net(net); out1: rsc_cache_destroy_net(net); return rv; } void gss_svc_shutdown_net(struct net *net) { destroy_krb5_enctypes_proc_entry(net); destroy_use_gss_proxy_proc_entry(net); rsi_cache_destroy_net(net); rsc_cache_destroy_net(net); } int gss_svc_init(void) { return svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss); } void gss_svc_shutdown(void) { svc_auth_unregister(RPC_AUTH_GSS); }
164 166 1 168 1 166 10 10 129 128 1 126 128 125 2 3 129 1 128 122 1 128 126 127 3 2 23 22 23 19 23 22 2 2 23 22 23 20 1 22 21 21 2 2 23 23 23 22 22 22 1 1 1 1 1 94 96 1 23 23 23 23 3 23 23 3 3 3 3 3 3 4 4 4 8 1 11 20 12 8 8 5 2 6 5 1 1 4 4 4 12 12 6 6 6 1 1 12 175 12 12 37 175 2 173 175 174 1 1 168 177 175 165 69 105 104 177 78 177 96 93 94 25 12 23 1 13 23 25 3 25 22 23 22 22 21 1 22 22 22 6 4 5 3 2 25 6 23 1 2 2 25 25 25 25 24 24 24 24 24 24 24 24 1 23 1 1 23 24 24 3 5 5 19 4 23 23 20 23 24 1 5 23 2 2 2 146 146 147 2 2 2 9 5 5 2 1 2 1 2 2 1 1 3 3 1 7 7 1 1 9 9 3 1 2 9 1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2002 Intel Corp. * * This file is part of the SCTP kernel implementation * * These functions work with the state functions in sctp_sm_statefuns.c * to implement the state operations. These functions implement the * steps which require modifying existing data structures. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * C. Robin <chris@hundredacre.ac.uk> * Jon Grimm <jgrimm@us.ibm.com> * Xingang Guo <xingang.guo@intel.com> * Dajiang Zhang <dajiang.zhang@nokia.com> * Sridhar Samudrala <sri@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * Kevin Gao <kevin.gao@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/hash.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/skbuff.h> #include <linux/random.h> /* for get_random_bytes */ #include <net/sctp/sctp.h> #include <net/sctp/sm.h> static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp); static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp); static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp); static struct sctp_cookie_param *sctp_pack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, const struct sctp_chunk *init_chunk, int *cookie_len, const __u8 *raw_addrs, int addrs_len); static int sctp_process_param(struct sctp_association *asoc, union sctp_params param, const union sctp_addr *peer_addr, gfp_t gfp); static void *sctp_addto_param(struct sctp_chunk *chunk, int len, const void *data); /* Control chunk destructor */ static void sctp_control_release_owner(struct sk_buff *skb) { struct sctp_chunk *chunk = skb_shinfo(skb)->destructor_arg; if (chunk->shkey) { struct sctp_shared_key *shkey = chunk->shkey; struct sctp_association *asoc = chunk->asoc; /* refcnt == 2 and !list_empty mean after this release, it's * not being used anywhere, and it's time to notify userland * that this shkey can be freed if it's been deactivated. */ if (shkey->deactivated && !list_empty(&shkey->key_list) && refcount_read(&shkey->refcnt) == 2) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_authkey(asoc, shkey->key_id, SCTP_AUTH_FREE_KEY, GFP_KERNEL); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } sctp_auth_shkey_release(chunk->shkey); } } static void sctp_control_set_owner_w(struct sctp_chunk *chunk) { struct sctp_association *asoc = chunk->asoc; struct sk_buff *skb = chunk->skb; /* TODO: properly account for control chunks. * To do it right we'll need: * 1) endpoint if association isn't known. * 2) proper memory accounting. * * For now don't do anything for now. */ if (chunk->auth) { chunk->shkey = asoc->shkey; sctp_auth_shkey_hold(chunk->shkey); } skb->sk = asoc ? asoc->base.sk : NULL; skb_shinfo(skb)->destructor_arg = chunk; skb->destructor = sctp_control_release_owner; } /* What was the inbound interface for this chunk? */ int sctp_chunk_iif(const struct sctp_chunk *chunk) { struct sk_buff *skb = chunk->skb; return SCTP_INPUT_CB(skb)->af->skb_iif(skb); } /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 2: The ECN capable field is reserved for future use of * Explicit Congestion Notification. */ static const struct sctp_paramhdr ecap_param = { SCTP_PARAM_ECN_CAPABLE, cpu_to_be16(sizeof(struct sctp_paramhdr)), }; static const struct sctp_paramhdr prsctp_param = { SCTP_PARAM_FWD_TSN_SUPPORT, cpu_to_be16(sizeof(struct sctp_paramhdr)), }; /* A helper to initialize an op error inside a provided chunk, as most * cause codes will be embedded inside an abort chunk. */ int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code, size_t paylen) { struct sctp_errhdr err; __u16 len; /* Cause code constants are now defined in network order. */ err.cause = cause_code; len = sizeof(err) + paylen; err.length = htons(len); if (skb_tailroom(chunk->skb) < len) return -ENOSPC; chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(err), &err); return 0; } /* 3.3.2 Initiation (INIT) (1) * * This chunk is used to initiate a SCTP association between two * endpoints. The format of the INIT chunk is shown below: * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 1 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Initiate Tag | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Advertised Receiver Window Credit (a_rwnd) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Number of Outbound Streams | Number of Inbound Streams | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Initial TSN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / Optional/Variable-Length Parameters / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * * The INIT chunk contains the following parameters. Unless otherwise * noted, each parameter MUST only be included once in the INIT chunk. * * Fixed Parameters Status * ---------------------------------------------- * Initiate Tag Mandatory * Advertised Receiver Window Credit Mandatory * Number of Outbound Streams Mandatory * Number of Inbound Streams Mandatory * Initial TSN Mandatory * * Variable Parameters Status Type Value * ------------------------------------------------------------- * IPv4 Address (Note 1) Optional 5 * IPv6 Address (Note 1) Optional 6 * Cookie Preservative Optional 9 * Reserved for ECN Capable (Note 2) Optional 32768 (0x8000) * Host Name Address (Note 3) Optional 11 * Supported Address Types (Note 4) Optional 12 */ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, const struct sctp_bind_addr *bp, gfp_t gfp, int vparam_len) { struct sctp_supported_ext_param ext_param; struct sctp_adaptation_ind_param aiparam; struct sctp_paramhdr *auth_chunks = NULL; struct sctp_paramhdr *auth_hmacs = NULL; struct sctp_supported_addrs_param sat; struct sctp_endpoint *ep = asoc->ep; struct sctp_chunk *retval = NULL; int num_types, addrs_len = 0; struct sctp_inithdr init; union sctp_params addrs; struct sctp_sock *sp; __u8 extensions[5]; size_t chunksize; __be16 types[2]; int num_ext = 0; /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 1: The INIT chunks can contain multiple addresses that * can be IPv4 and/or IPv6 in any combination. */ /* Convert the provided bind address list to raw format. */ addrs = sctp_bind_addrs_to_raw(bp, &addrs_len, gfp); init.init_tag = htonl(asoc->c.my_vtag); init.a_rwnd = htonl(asoc->rwnd); init.num_outbound_streams = htons(asoc->c.sinit_num_ostreams); init.num_inbound_streams = htons(asoc->c.sinit_max_instreams); init.initial_tsn = htonl(asoc->c.initial_tsn); /* How many address types are needed? */ sp = sctp_sk(asoc->base.sk); num_types = sp->pf->supported_addrs(sp, types); chunksize = sizeof(init) + addrs_len; chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types)); if (asoc->ep->ecn_enable) chunksize += sizeof(ecap_param); if (asoc->ep->prsctp_enable) chunksize += sizeof(prsctp_param); /* ADDIP: Section 4.2.7: * An implementation supporting this extension [ADDIP] MUST list * the ASCONF,the ASCONF-ACK, and the AUTH chunks in its INIT and * INIT-ACK parameters. */ if (asoc->ep->asconf_enable) { extensions[num_ext] = SCTP_CID_ASCONF; extensions[num_ext+1] = SCTP_CID_ASCONF_ACK; num_ext += 2; } if (asoc->ep->reconf_enable) { extensions[num_ext] = SCTP_CID_RECONF; num_ext += 1; } if (sp->adaptation_ind) chunksize += sizeof(aiparam); if (asoc->ep->intl_enable) { extensions[num_ext] = SCTP_CID_I_DATA; num_ext += 1; } chunksize += vparam_len; /* Account for AUTH related parameters */ if (ep->auth_enable) { /* Add random parameter length*/ chunksize += sizeof(asoc->c.auth_random); /* Add HMACS parameter length if any were defined */ auth_hmacs = (struct sctp_paramhdr *)asoc->c.auth_hmacs; if (auth_hmacs->length) chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; /* Add CHUNKS parameter length */ auth_chunks = (struct sctp_paramhdr *)asoc->c.auth_chunks; if (auth_chunks->length) chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; extensions[num_ext] = SCTP_CID_AUTH; num_ext += 1; } /* If we have any extensions to report, account for that */ if (num_ext) chunksize += SCTP_PAD4(sizeof(ext_param) + num_ext); /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 3: An INIT chunk MUST NOT contain more than one Host * Name address parameter. Moreover, the sender of the INIT * MUST NOT combine any other address types with the Host Name * address in the INIT. The receiver of INIT MUST ignore any * other address types if the Host Name address parameter is * present in the received INIT chunk. * * PLEASE DO NOT FIXME [This version does not support Host Name.] */ retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize, gfp); if (!retval) goto nodata; retval->subh.init_hdr = sctp_addto_chunk(retval, sizeof(init), &init); retval->param_hdr.v = sctp_addto_chunk(retval, addrs_len, addrs.v); /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 4: This parameter, when present, specifies all the * address types the sending endpoint can support. The absence * of this parameter indicates that the sending endpoint can * support any address type. */ sat.param_hdr.type = SCTP_PARAM_SUPPORTED_ADDRESS_TYPES; sat.param_hdr.length = htons(SCTP_SAT_LEN(num_types)); sctp_addto_chunk(retval, sizeof(sat), &sat); sctp_addto_chunk(retval, num_types * sizeof(__u16), &types); if (asoc->ep->ecn_enable) sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param); /* Add the supported extensions parameter. Be nice and add this * fist before addiding the parameters for the extensions themselves */ if (num_ext) { ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT; ext_param.param_hdr.length = htons(sizeof(ext_param) + num_ext); sctp_addto_chunk(retval, sizeof(ext_param), &ext_param); sctp_addto_param(retval, num_ext, extensions); } if (asoc->ep->prsctp_enable) sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); if (sp->adaptation_ind) { aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; aiparam.param_hdr.length = htons(sizeof(aiparam)); aiparam.adaptation_ind = htonl(sp->adaptation_ind); sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); } /* Add SCTP-AUTH chunks to the parameter list */ if (ep->auth_enable) { sctp_addto_chunk(retval, sizeof(asoc->c.auth_random), asoc->c.auth_random); if (auth_hmacs) sctp_addto_chunk(retval, ntohs(auth_hmacs->length), auth_hmacs); if (auth_chunks) sctp_addto_chunk(retval, ntohs(auth_chunks->length), auth_chunks); } nodata: kfree(addrs.v); return retval; } struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, gfp_t gfp, int unkparam_len) { struct sctp_supported_ext_param ext_param; struct sctp_adaptation_ind_param aiparam; struct sctp_paramhdr *auth_chunks = NULL; struct sctp_paramhdr *auth_random = NULL; struct sctp_paramhdr *auth_hmacs = NULL; struct sctp_chunk *retval = NULL; struct sctp_cookie_param *cookie; struct sctp_inithdr initack; union sctp_params addrs; struct sctp_sock *sp; __u8 extensions[5]; size_t chunksize; int num_ext = 0; int cookie_len; int addrs_len; /* Note: there may be no addresses to embed. */ addrs = sctp_bind_addrs_to_raw(&asoc->base.bind_addr, &addrs_len, gfp); initack.init_tag = htonl(asoc->c.my_vtag); initack.a_rwnd = htonl(asoc->rwnd); initack.num_outbound_streams = htons(asoc->c.sinit_num_ostreams); initack.num_inbound_streams = htons(asoc->c.sinit_max_instreams); initack.initial_tsn = htonl(asoc->c.initial_tsn); /* FIXME: We really ought to build the cookie right * into the packet instead of allocating more fresh memory. */ cookie = sctp_pack_cookie(asoc->ep, asoc, chunk, &cookie_len, addrs.v, addrs_len); if (!cookie) goto nomem_cookie; /* Calculate the total size of allocation, include the reserved * space for reporting unknown parameters if it is specified. */ sp = sctp_sk(asoc->base.sk); chunksize = sizeof(initack) + addrs_len + cookie_len + unkparam_len; /* Tell peer that we'll do ECN only if peer advertised such cap. */ if (asoc->peer.ecn_capable) chunksize += sizeof(ecap_param); if (asoc->peer.prsctp_capable) chunksize += sizeof(prsctp_param); if (asoc->peer.asconf_capable) { extensions[num_ext] = SCTP_CID_ASCONF; extensions[num_ext+1] = SCTP_CID_ASCONF_ACK; num_ext += 2; } if (asoc->peer.reconf_capable) { extensions[num_ext] = SCTP_CID_RECONF; num_ext += 1; } if (sp->adaptation_ind) chunksize += sizeof(aiparam); if (asoc->peer.intl_capable) { extensions[num_ext] = SCTP_CID_I_DATA; num_ext += 1; } if (asoc->peer.auth_capable) { auth_random = (struct sctp_paramhdr *)asoc->c.auth_random; chunksize += ntohs(auth_random->length); auth_hmacs = (struct sctp_paramhdr *)asoc->c.auth_hmacs; if (auth_hmacs->length) chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; auth_chunks = (struct sctp_paramhdr *)asoc->c.auth_chunks; if (auth_chunks->length) chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; extensions[num_ext] = SCTP_CID_AUTH; num_ext += 1; } if (num_ext) chunksize += SCTP_PAD4(sizeof(ext_param) + num_ext); /* Now allocate and fill out the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp); if (!retval) goto nomem_chunk; /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it received the DATA or control chunk * to which it is replying. * * [INIT ACK back to where the INIT came from.] */ if (chunk->transport) retval->transport = sctp_assoc_lookup_paddr(asoc, &chunk->transport->ipaddr); retval->subh.init_hdr = sctp_addto_chunk(retval, sizeof(initack), &initack); retval->param_hdr.v = sctp_addto_chunk(retval, addrs_len, addrs.v); sctp_addto_chunk(retval, cookie_len, cookie); if (asoc->peer.ecn_capable) sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param); if (num_ext) { ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT; ext_param.param_hdr.length = htons(sizeof(ext_param) + num_ext); sctp_addto_chunk(retval, sizeof(ext_param), &ext_param); sctp_addto_param(retval, num_ext, extensions); } if (asoc->peer.prsctp_capable) sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); if (sp->adaptation_ind) { aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; aiparam.param_hdr.length = htons(sizeof(aiparam)); aiparam.adaptation_ind = htonl(sp->adaptation_ind); sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); } if (asoc->peer.auth_capable) { sctp_addto_chunk(retval, ntohs(auth_random->length), auth_random); if (auth_hmacs) sctp_addto_chunk(retval, ntohs(auth_hmacs->length), auth_hmacs); if (auth_chunks) sctp_addto_chunk(retval, ntohs(auth_chunks->length), auth_chunks); } /* We need to remove the const qualifier at this point. */ retval->asoc = (struct sctp_association *) asoc; nomem_chunk: kfree(cookie); nomem_cookie: kfree(addrs.v); return retval; } /* 3.3.11 Cookie Echo (COOKIE ECHO) (10): * * This chunk is used only during the initialization of an association. * It is sent by the initiator of an association to its peer to complete * the initialization process. This chunk MUST precede any DATA chunk * sent within the association, but MAY be bundled with one or more DATA * chunks in the same packet. * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 10 |Chunk Flags | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * / Cookie / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Chunk Flags: 8 bit * * Set to zero on transmit and ignored on receipt. * * Length: 16 bits (unsigned integer) * * Set to the size of the chunk in bytes, including the 4 bytes of * the chunk header and the size of the Cookie. * * Cookie: variable size * * This field must contain the exact cookie received in the * State Cookie parameter from the previous INIT ACK. * * An implementation SHOULD make the cookie as small as possible * to insure interoperability. */ struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; int cookie_len; void *cookie; cookie = asoc->peer.cookie; cookie_len = asoc->peer.cookie_len; /* Build a cookie echo chunk. */ retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0, cookie_len, GFP_ATOMIC); if (!retval) goto nodata; retval->subh.cookie_hdr = sctp_addto_chunk(retval, cookie_len, cookie); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [COOKIE ECHO back to where the INIT ACK came from.] */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* 3.3.12 Cookie Acknowledgement (COOKIE ACK) (11): * * This chunk is used only during the initialization of an * association. It is used to acknowledge the receipt of a COOKIE * ECHO chunk. This chunk MUST precede any DATA or SACK chunk sent * within the association, but MAY be bundled with one or more DATA * chunks or SACK chunk in the same SCTP packet. * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 11 |Chunk Flags | Length = 4 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Chunk Flags: 8 bits * * Set to zero on transmit and ignored on receipt. */ struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [COOKIE ACK back to where the COOKIE ECHO came from.] */ if (retval && chunk && chunk->transport) retval->transport = sctp_assoc_lookup_paddr(asoc, &chunk->transport->ipaddr); return retval; } /* * Appendix A: Explicit Congestion Notification: * CWR: * * RFC 2481 details a specific bit for a sender to send in the header of * its next outbound TCP segment to indicate to its peer that it has * reduced its congestion window. This is termed the CWR bit. For * SCTP the same indication is made by including the CWR chunk. * This chunk contains one data element, i.e. the TSN number that * was sent in the ECNE chunk. This element represents the lowest * TSN number in the datagram that was originally marked with the * CE bit. * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Chunk Type=13 | Flags=00000000| Chunk Length = 8 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Lowest TSN Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Note: The CWR is considered a Control chunk. */ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc, const __u32 lowest_tsn, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; struct sctp_cwrhdr cwr; cwr.lowest_tsn = htonl(lowest_tsn); retval = sctp_make_control(asoc, SCTP_CID_ECN_CWR, 0, sizeof(cwr), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.ecn_cwr_hdr = sctp_addto_chunk(retval, sizeof(cwr), &cwr); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [Report a reduced congestion window back to where the ECNE * came from.] */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* Make an ECNE chunk. This is a congestion experienced report. */ struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc, const __u32 lowest_tsn) { struct sctp_chunk *retval; struct sctp_ecnehdr ecne; ecne.lowest_tsn = htonl(lowest_tsn); retval = sctp_make_control(asoc, SCTP_CID_ECN_ECNE, 0, sizeof(ecne), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.ecne_hdr = sctp_addto_chunk(retval, sizeof(ecne), &ecne); nodata: return retval; } /* Make a DATA chunk for the given association from the provided * parameters. However, do not populate the data payload. */ struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, int len, __u8 flags, gfp_t gfp) { struct sctp_chunk *retval; struct sctp_datahdr dp; /* We assign the TSN as LATE as possible, not here when * creating the chunk. */ memset(&dp, 0, sizeof(dp)); dp.ppid = sinfo->sinfo_ppid; dp.stream = htons(sinfo->sinfo_stream); /* Set the flags for an unordered send. */ if (sinfo->sinfo_flags & SCTP_UNORDERED) flags |= SCTP_DATA_UNORDERED; retval = sctp_make_data(asoc, flags, sizeof(dp) + len, gfp); if (!retval) return NULL; retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); return retval; } /* Create a selective ackowledgement (SACK) for the given * association. This reports on which TSN's we've seen to date, * including duplicates and gaps. */ struct sctp_chunk *sctp_make_sack(struct sctp_association *asoc) { struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map; struct sctp_gap_ack_block gabs[SCTP_MAX_GABS]; __u16 num_gabs, num_dup_tsns; struct sctp_transport *trans; struct sctp_chunk *retval; struct sctp_sackhdr sack; __u32 ctsn; int len; memset(gabs, 0, sizeof(gabs)); ctsn = sctp_tsnmap_get_ctsn(map); pr_debug("%s: sackCTSNAck sent:0x%x\n", __func__, ctsn); /* How much room is needed in the chunk? */ num_gabs = sctp_tsnmap_num_gabs(map, gabs); num_dup_tsns = sctp_tsnmap_num_dups(map); /* Initialize the SACK header. */ sack.cum_tsn_ack = htonl(ctsn); sack.a_rwnd = htonl(asoc->a_rwnd); sack.num_gap_ack_blocks = htons(num_gabs); sack.num_dup_tsns = htons(num_dup_tsns); len = sizeof(sack) + sizeof(struct sctp_gap_ack_block) * num_gabs + sizeof(__u32) * num_dup_tsns; /* Create the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len, GFP_ATOMIC); if (!retval) goto nodata; /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, etc.) to the same destination transport * address from which it received the DATA or control chunk to * which it is replying. This rule should also be followed if * the endpoint is bundling DATA chunks together with the * reply chunk. * * However, when acknowledging multiple DATA chunks received * in packets from different source addresses in a single * SACK, the SACK chunk may be transmitted to one of the * destination transport addresses from which the DATA or * control chunks being acknowledged were received. * * [BUG: We do not implement the following paragraph. * Perhaps we should remember the last transport we used for a * SACK and avoid that (if possible) if we have seen any * duplicates. --piggy] * * When a receiver of a duplicate DATA chunk sends a SACK to a * multi- homed endpoint it MAY be beneficial to vary the * destination address and not use the source address of the * DATA chunk. The reason being that receiving a duplicate * from a multi-homed endpoint might indicate that the return * path (as specified in the source address of the DATA chunk) * for the SACK is broken. * * [Send to the address from which we last received a DATA chunk.] */ retval->transport = asoc->peer.last_data_from; retval->subh.sack_hdr = sctp_addto_chunk(retval, sizeof(sack), &sack); /* Add the gap ack block information. */ if (num_gabs) sctp_addto_chunk(retval, sizeof(__u32) * num_gabs, gabs); /* Add the duplicate TSN information. */ if (num_dup_tsns) { asoc->stats.idupchunks += num_dup_tsns; sctp_addto_chunk(retval, sizeof(__u32) * num_dup_tsns, sctp_tsnmap_get_dups(map)); } /* Once we have a sack generated, check to see what our sack * generation is, if its 0, reset the transports to 0, and reset * the association generation to 1 * * The idea is that zero is never used as a valid generation for the * association so no transport will match after a wrap event like this, * Until the next sack */ if (++asoc->peer.sack_generation == 0) { list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) trans->sack_generation = 0; asoc->peer.sack_generation = 1; } nodata: return retval; } /* Make a SHUTDOWN chunk. */ struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_shutdownhdr shut; struct sctp_chunk *retval; __u32 ctsn; ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); shut.cum_tsn_ack = htonl(ctsn); retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0, sizeof(shut), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.shutdown_hdr = sctp_addto_chunk(retval, sizeof(shut), &shut); if (chunk) retval->transport = chunk->transport; nodata: return retval; } struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [ACK back to where the SHUTDOWN came from.] */ if (retval && chunk) retval->transport = chunk->transport; return retval; } struct sctp_chunk *sctp_make_shutdown_complete( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; __u8 flags = 0; /* Set the T-bit if we have no association (vtag will be * reflected) */ flags |= asoc ? 0 : SCTP_CHUNK_FLAG_T; retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [Report SHUTDOWN COMPLETE back to where the SHUTDOWN ACK * came from.] */ if (retval && chunk) retval->transport = chunk->transport; return retval; } /* Create an ABORT. Note that we set the T bit if we have no * association, except when responding to an INIT (sctpimpguide 2.41). */ struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const size_t hint) { struct sctp_chunk *retval; __u8 flags = 0; /* Set the T-bit if we have no association and 'chunk' is not * an INIT (vtag will be reflected). */ if (!asoc) { if (chunk && chunk->chunk_hdr && chunk->chunk_hdr->type == SCTP_CID_INIT) flags = 0; else flags = SCTP_CHUNK_FLAG_T; } retval = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [ABORT back to where the offender came from.] */ if (retval && chunk) retval->transport = chunk->transport; return retval; } /* Helper to create ABORT with a NO_USER_DATA error. */ struct sctp_chunk *sctp_make_abort_no_data( const struct sctp_association *asoc, const struct sctp_chunk *chunk, __u32 tsn) { struct sctp_chunk *retval; __be32 payload; retval = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) + sizeof(tsn)); if (!retval) goto no_mem; /* Put the tsn back into network byte order. */ payload = htonl(tsn); sctp_init_cause(retval, SCTP_ERROR_NO_DATA, sizeof(payload)); sctp_addto_chunk(retval, sizeof(payload), (const void *)&payload); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [ABORT back to where the offender came from.] */ if (chunk) retval->transport = chunk->transport; no_mem: return retval; } /* Helper to create ABORT with a SCTP_ERROR_USER_ABORT error. */ struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc, struct msghdr *msg, size_t paylen) { struct sctp_chunk *retval; void *payload = NULL; int err; retval = sctp_make_abort(asoc, NULL, sizeof(struct sctp_errhdr) + paylen); if (!retval) goto err_chunk; if (paylen) { /* Put the msg_iov together into payload. */ payload = kmalloc(paylen, GFP_KERNEL); if (!payload) goto err_payload; err = memcpy_from_msg(payload, msg, paylen); if (err < 0) goto err_copy; } sctp_init_cause(retval, SCTP_ERROR_USER_ABORT, paylen); sctp_addto_chunk(retval, paylen, payload); if (paylen) kfree(payload); return retval; err_copy: kfree(payload); err_payload: sctp_chunk_free(retval); retval = NULL; err_chunk: return retval; } /* Append bytes to the end of a parameter. Will panic if chunk is not big * enough. */ static void *sctp_addto_param(struct sctp_chunk *chunk, int len, const void *data) { int chunklen = ntohs(chunk->chunk_hdr->length); void *target; target = skb_put(chunk->skb, len); if (data) memcpy(target, data, len); else memset(target, 0, len); /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(chunklen + len); chunk->chunk_end = skb_tail_pointer(chunk->skb); return target; } /* Make an ABORT chunk with a PROTOCOL VIOLATION cause code. */ struct sctp_chunk *sctp_make_abort_violation( const struct sctp_association *asoc, const struct sctp_chunk *chunk, const __u8 *payload, const size_t paylen) { struct sctp_chunk *retval; struct sctp_paramhdr phdr; retval = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) + paylen + sizeof(phdr)); if (!retval) goto end; sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, paylen + sizeof(phdr)); phdr.type = htons(chunk->chunk_hdr->type); phdr.length = chunk->chunk_hdr->length; sctp_addto_chunk(retval, paylen, payload); sctp_addto_param(retval, sizeof(phdr), &phdr); end: return retval; } struct sctp_chunk *sctp_make_violation_paramlen( const struct sctp_association *asoc, const struct sctp_chunk *chunk, struct sctp_paramhdr *param) { static const char error[] = "The following parameter had invalid length:"; size_t payload_len = sizeof(error) + sizeof(struct sctp_errhdr) + sizeof(*param); struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, payload_len); if (!retval) goto nodata; sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, sizeof(error) + sizeof(*param)); sctp_addto_chunk(retval, sizeof(error), error); sctp_addto_param(retval, sizeof(*param), param); nodata: return retval; } struct sctp_chunk *sctp_make_violation_max_retrans( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { static const char error[] = "Association exceeded its max_retrans count"; size_t payload_len = sizeof(error) + sizeof(struct sctp_errhdr); struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, payload_len); if (!retval) goto nodata; sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, sizeof(error)); sctp_addto_chunk(retval, sizeof(error), error); nodata: return retval; } struct sctp_chunk *sctp_make_new_encap_port(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_new_encap_port_hdr nep; struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) + sizeof(nep)); if (!retval) goto nodata; sctp_init_cause(retval, SCTP_ERROR_NEW_ENCAP_PORT, sizeof(nep)); nep.cur_port = SCTP_INPUT_CB(chunk->skb)->encap_port; nep.new_port = chunk->transport->encap_port; sctp_addto_chunk(retval, sizeof(nep), &nep); nodata: return retval; } /* Make a HEARTBEAT chunk. */ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, const struct sctp_transport *transport, __u32 probe_size) { struct sctp_sender_hb_info hbinfo = {}; struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0, sizeof(hbinfo), GFP_ATOMIC); if (!retval) goto nodata; hbinfo.param_hdr.type = SCTP_PARAM_HEARTBEAT_INFO; hbinfo.param_hdr.length = htons(sizeof(hbinfo)); hbinfo.daddr = transport->ipaddr; hbinfo.sent_at = jiffies; hbinfo.hb_nonce = transport->hb_nonce; hbinfo.probe_size = probe_size; /* Cast away the 'const', as this is just telling the chunk * what transport it belongs to. */ retval->transport = (struct sctp_transport *) transport; retval->subh.hbs_hdr = sctp_addto_chunk(retval, sizeof(hbinfo), &hbinfo); retval->pmtu_probe = !!probe_size; nodata: return retval; } struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const void *payload, const size_t paylen) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen, GFP_ATOMIC); if (!retval) goto nodata; retval->subh.hbs_hdr = sctp_addto_chunk(retval, paylen, payload); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [HBACK back to where the HEARTBEAT came from.] */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* RFC4820 3. Padding Chunk (PAD) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0x84 | Flags=0 | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | | * \ Padding Data / * / \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_pad(const struct sctp_association *asoc, int len) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_PAD, 0, len, GFP_ATOMIC); if (!retval) return NULL; skb_put_zero(retval->skb, len); retval->chunk_hdr->length = htons(ntohs(retval->chunk_hdr->length) + len); retval->chunk_end = skb_tail_pointer(retval->skb); return retval; } /* Create an Operation Error chunk with the specified space reserved. * This routine can be used for containing multiple causes in the chunk. */ static struct sctp_chunk *sctp_make_op_error_space( const struct sctp_association *asoc, const struct sctp_chunk *chunk, size_t size) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_ERROR, 0, sizeof(struct sctp_errhdr) + size, GFP_ATOMIC); if (!retval) goto nodata; /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, etc.) to the same destination transport * address from which it received the DATA or control chunk * to which it is replying. * */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* Create an Operation Error chunk of a fixed size, specifically, * min(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) - overheads. * This is a helper function to allocate an error chunk for those * invalid parameter codes in which we may not want to report all the * errors, if the incoming chunk is large. If it can't fit in a single * packet, we ignore it. */ static inline struct sctp_chunk *sctp_make_op_error_limited( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { size_t size = SCTP_DEFAULT_MAXSEGMENT; struct sctp_sock *sp = NULL; if (asoc) { size = min_t(size_t, size, asoc->pathmtu); sp = sctp_sk(asoc->base.sk); } size = sctp_mtu_payload(sp, size, sizeof(struct sctp_errhdr)); return sctp_make_op_error_space(asoc, chunk, size); } /* Create an Operation Error chunk. */ struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc, const struct sctp_chunk *chunk, __be16 cause_code, const void *payload, size_t paylen, size_t reserve_tail) { struct sctp_chunk *retval; retval = sctp_make_op_error_space(asoc, chunk, paylen + reserve_tail); if (!retval) goto nodata; sctp_init_cause(retval, cause_code, paylen + reserve_tail); sctp_addto_chunk(retval, paylen, payload); if (reserve_tail) sctp_addto_param(retval, reserve_tail, NULL); nodata: return retval; } struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc, __u16 key_id) { struct sctp_authhdr auth_hdr; struct sctp_hmac *hmac_desc; struct sctp_chunk *retval; /* Get the first hmac that the peer told us to use */ hmac_desc = sctp_auth_asoc_get_hmac(asoc); if (unlikely(!hmac_desc)) return NULL; retval = sctp_make_control(asoc, SCTP_CID_AUTH, 0, hmac_desc->hmac_len + sizeof(auth_hdr), GFP_ATOMIC); if (!retval) return NULL; auth_hdr.hmac_id = htons(hmac_desc->hmac_id); auth_hdr.shkey_id = htons(key_id); retval->subh.auth_hdr = sctp_addto_chunk(retval, sizeof(auth_hdr), &auth_hdr); skb_put_zero(retval->skb, hmac_desc->hmac_len); /* Adjust the chunk header to include the empty MAC */ retval->chunk_hdr->length = htons(ntohs(retval->chunk_hdr->length) + hmac_desc->hmac_len); retval->chunk_end = skb_tail_pointer(retval->skb); return retval; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* Turn an skb into a chunk. * FIXME: Eventually move the structure directly inside the skb->cb[]. * * sctpimpguide-05.txt Section 2.8.2 * M1) Each time a new DATA chunk is transmitted * set the 'TSN.Missing.Report' count for that TSN to 0. The * 'TSN.Missing.Report' count will be used to determine missing chunks * and when to fast retransmit. * */ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb, const struct sctp_association *asoc, struct sock *sk, gfp_t gfp) { struct sctp_chunk *retval; retval = kmem_cache_zalloc(sctp_chunk_cachep, gfp); if (!retval) goto nodata; if (!sk) pr_debug("%s: chunkifying skb:%p w/o an sk\n", __func__, skb); INIT_LIST_HEAD(&retval->list); retval->skb = skb; retval->asoc = (struct sctp_association *)asoc; retval->singleton = 1; retval->fast_retransmit = SCTP_CAN_FRTX; /* Polish the bead hole. */ INIT_LIST_HEAD(&retval->transmitted_list); INIT_LIST_HEAD(&retval->frag_list); SCTP_DBG_OBJCNT_INC(chunk); refcount_set(&retval->refcnt, 1); nodata: return retval; } /* Set chunk->source and dest based on the IP header in chunk->skb. */ void sctp_init_addrs(struct sctp_chunk *chunk, union sctp_addr *src, union sctp_addr *dest) { memcpy(&chunk->source, src, sizeof(union sctp_addr)); memcpy(&chunk->dest, dest, sizeof(union sctp_addr)); } /* Extract the source address from a chunk. */ const union sctp_addr *sctp_source(const struct sctp_chunk *chunk) { /* If we have a known transport, use that. */ if (chunk->transport) { return &chunk->transport->ipaddr; } else { /* Otherwise, extract it from the IP header. */ return &chunk->source; } } /* Create a new chunk, setting the type and flags headers from the * arguments, reserving enough space for a 'paylen' byte payload. */ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp) { struct sctp_chunkhdr *chunk_hdr; struct sctp_chunk *retval; struct sk_buff *skb; struct sock *sk; int chunklen; chunklen = SCTP_PAD4(sizeof(*chunk_hdr) + paylen); if (chunklen > SCTP_MAX_CHUNK_LEN) goto nodata; /* No need to allocate LL here, as this is only a chunk. */ skb = alloc_skb(chunklen, gfp); if (!skb) goto nodata; /* Make room for the chunk header. */ chunk_hdr = (struct sctp_chunkhdr *)skb_put(skb, sizeof(*chunk_hdr)); chunk_hdr->type = type; chunk_hdr->flags = flags; chunk_hdr->length = htons(sizeof(*chunk_hdr)); sk = asoc ? asoc->base.sk : NULL; retval = sctp_chunkify(skb, asoc, sk, gfp); if (!retval) { kfree_skb(skb); goto nodata; } retval->chunk_hdr = chunk_hdr; retval->chunk_end = ((__u8 *)chunk_hdr) + sizeof(*chunk_hdr); /* Determine if the chunk needs to be authenticated */ if (sctp_auth_send_cid(type, asoc)) retval->auth = 1; return retval; nodata: return NULL; } static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp) { return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen, gfp); } struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp) { return _sctp_make_chunk(asoc, SCTP_CID_I_DATA, flags, paylen, gfp); } static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp) { struct sctp_chunk *chunk; chunk = _sctp_make_chunk(asoc, type, flags, paylen, gfp); if (chunk) sctp_control_set_owner_w(chunk); return chunk; } /* Release the memory occupied by a chunk. */ static void sctp_chunk_destroy(struct sctp_chunk *chunk) { BUG_ON(!list_empty(&chunk->list)); list_del_init(&chunk->transmitted_list); consume_skb(chunk->skb); consume_skb(chunk->auth_chunk); SCTP_DBG_OBJCNT_DEC(chunk); kmem_cache_free(sctp_chunk_cachep, chunk); } /* Possibly, free the chunk. */ void sctp_chunk_free(struct sctp_chunk *chunk) { /* Release our reference on the message tracker. */ if (chunk->msg) sctp_datamsg_put(chunk->msg); sctp_chunk_put(chunk); } /* Grab a reference to the chunk. */ void sctp_chunk_hold(struct sctp_chunk *ch) { refcount_inc(&ch->refcnt); } /* Release a reference to the chunk. */ void sctp_chunk_put(struct sctp_chunk *ch) { if (refcount_dec_and_test(&ch->refcnt)) sctp_chunk_destroy(ch); } /* Append bytes to the end of a chunk. Will panic if chunk is not big * enough. */ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data) { int chunklen = ntohs(chunk->chunk_hdr->length); int padlen = SCTP_PAD4(chunklen) - chunklen; void *target; skb_put_zero(chunk->skb, padlen); target = skb_put_data(chunk->skb, data, len); /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(chunklen + padlen + len); chunk->chunk_end = skb_tail_pointer(chunk->skb); return target; } /* Append bytes from user space to the end of a chunk. Will panic if * chunk is not big enough. * Returns a kernel err value. */ int sctp_user_addto_chunk(struct sctp_chunk *chunk, int len, struct iov_iter *from) { void *target; /* Make room in chunk for data. */ target = skb_put(chunk->skb, len); /* Copy data (whole iovec) into chunk */ if (!copy_from_iter_full(target, len, from)) return -EFAULT; /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(ntohs(chunk->chunk_hdr->length) + len); chunk->chunk_end = skb_tail_pointer(chunk->skb); return 0; } /* Helper function to assign a TSN if needed. This assumes that both * the data_hdr and association have already been assigned. */ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk) { struct sctp_stream *stream; struct sctp_chunk *lchunk; struct sctp_datamsg *msg; __u16 ssn, sid; if (chunk->has_ssn) return; /* All fragments will be on the same stream */ sid = ntohs(chunk->subh.data_hdr->stream); stream = &chunk->asoc->stream; /* Now assign the sequence number to the entire message. * All fragments must have the same stream sequence number. */ msg = chunk->msg; list_for_each_entry(lchunk, &msg->chunks, frag_list) { if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { ssn = 0; } else { if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG) ssn = sctp_ssn_next(stream, out, sid); else ssn = sctp_ssn_peek(stream, out, sid); } lchunk->subh.data_hdr->ssn = htons(ssn); lchunk->has_ssn = 1; } } /* Helper function to assign a TSN if needed. This assumes that both * the data_hdr and association have already been assigned. */ void sctp_chunk_assign_tsn(struct sctp_chunk *chunk) { if (!chunk->has_tsn) { /* This is the last possible instant to * assign a TSN. */ chunk->subh.data_hdr->tsn = htonl(sctp_association_get_next_tsn(chunk->asoc)); chunk->has_tsn = 1; } } /* Create a CLOSED association to use with an incoming packet. */ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_association *asoc; enum sctp_scope scope; struct sk_buff *skb; /* Create the bare association. */ scope = sctp_scope(sctp_source(chunk)); asoc = sctp_association_new(ep, ep->base.sk, scope, gfp); if (!asoc) goto nodata; asoc->temp = 1; skb = chunk->skb; /* Create an entry for the source address of the packet. */ SCTP_INPUT_CB(skb)->af->from_skb(&asoc->c.peer_addr, skb, 1); nodata: return asoc; } /* Build a cookie representing asoc. * This INCLUDES the param header needed to put the cookie in the INIT ACK. */ static struct sctp_cookie_param *sctp_pack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, const struct sctp_chunk *init_chunk, int *cookie_len, const __u8 *raw_addrs, int addrs_len) { struct sctp_signed_cookie *cookie; struct sctp_cookie_param *retval; int headersize, bodysize; /* Header size is static data prior to the actual cookie, including * any padding. */ headersize = sizeof(struct sctp_paramhdr) + (sizeof(struct sctp_signed_cookie) - sizeof(struct sctp_cookie)); bodysize = sizeof(struct sctp_cookie) + ntohs(init_chunk->chunk_hdr->length) + addrs_len; /* Pad out the cookie to a multiple to make the signature * functions simpler to write. */ if (bodysize % SCTP_COOKIE_MULTIPLE) bodysize += SCTP_COOKIE_MULTIPLE - (bodysize % SCTP_COOKIE_MULTIPLE); *cookie_len = headersize + bodysize; /* Clear this memory since we are sending this data structure * out on the network. */ retval = kzalloc(*cookie_len, GFP_ATOMIC); if (!retval) goto nodata; cookie = (struct sctp_signed_cookie *) retval->body; /* Set up the parameter header. */ retval->p.type = SCTP_PARAM_STATE_COOKIE; retval->p.length = htons(*cookie_len); /* Copy the cookie part of the association itself. */ cookie->c = asoc->c; /* Save the raw address list length in the cookie. */ cookie->c.raw_addr_list_len = addrs_len; /* Remember PR-SCTP capability. */ cookie->c.prsctp_capable = asoc->peer.prsctp_capable; /* Save adaptation indication in the cookie. */ cookie->c.adaptation_ind = asoc->peer.adaptation_ind; /* Set an expiration time for the cookie. */ cookie->c.expiration = ktime_add(asoc->cookie_life, ktime_get_real()); /* Copy the peer's init packet. */ memcpy(cookie + 1, init_chunk->chunk_hdr, ntohs(init_chunk->chunk_hdr->length)); /* Copy the raw local address list of the association. */ memcpy((__u8 *)(cookie + 1) + ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len); if (sctp_sk(ep->base.sk)->hmac) { struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac; int err; /* Sign the message. */ err = crypto_shash_setkey(tfm, ep->secret_key, sizeof(ep->secret_key)) ?: crypto_shash_tfm_digest(tfm, (u8 *)&cookie->c, bodysize, cookie->signature); if (err) goto free_cookie; } return retval; free_cookie: kfree(retval); nodata: *cookie_len = 0; return NULL; } /* Unpack the cookie from COOKIE ECHO chunk, recreating the association. */ struct sctp_association *sctp_unpack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, struct sctp_chunk *chunk, gfp_t gfp, int *error, struct sctp_chunk **errp) { struct sctp_association *retval = NULL; int headersize, bodysize, fixed_size; struct sctp_signed_cookie *cookie; struct sk_buff *skb = chunk->skb; struct sctp_cookie *bear_cookie; __u8 *digest = ep->digest; enum sctp_scope scope; unsigned int len; ktime_t kt; /* Header size is static data prior to the actual cookie, including * any padding. */ headersize = sizeof(struct sctp_chunkhdr) + (sizeof(struct sctp_signed_cookie) - sizeof(struct sctp_cookie)); bodysize = ntohs(chunk->chunk_hdr->length) - headersize; fixed_size = headersize + sizeof(struct sctp_cookie); /* Verify that the chunk looks like it even has a cookie. * There must be enough room for our cookie and our peer's * INIT chunk. */ len = ntohs(chunk->chunk_hdr->length); if (len < fixed_size + sizeof(struct sctp_chunkhdr)) goto malformed; /* Verify that the cookie has been padded out. */ if (bodysize % SCTP_COOKIE_MULTIPLE) goto malformed; /* Process the cookie. */ cookie = chunk->subh.cookie_hdr; bear_cookie = &cookie->c; if (!sctp_sk(ep->base.sk)->hmac) goto no_hmac; /* Check the signature. */ { struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac; int err; err = crypto_shash_setkey(tfm, ep->secret_key, sizeof(ep->secret_key)) ?: crypto_shash_tfm_digest(tfm, (u8 *)bear_cookie, bodysize, digest); if (err) { *error = -SCTP_IERROR_NOMEM; goto fail; } } if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) { *error = -SCTP_IERROR_BAD_SIG; goto fail; } no_hmac: /* IG Section 2.35.2: * 3) Compare the port numbers and the verification tag contained * within the COOKIE ECHO chunk to the actual port numbers and the * verification tag within the SCTP common header of the received * packet. If these values do not match the packet MUST be silently * discarded, */ if (ntohl(chunk->sctp_hdr->vtag) != bear_cookie->my_vtag) { *error = -SCTP_IERROR_BAD_TAG; goto fail; } if (chunk->sctp_hdr->source != bear_cookie->peer_addr.v4.sin_port || ntohs(chunk->sctp_hdr->dest) != bear_cookie->my_port) { *error = -SCTP_IERROR_BAD_PORTS; goto fail; } /* Check to see if the cookie is stale. If there is already * an association, there is no need to check cookie's expiration * for init collision case of lost COOKIE ACK. * If skb has been timestamped, then use the stamp, otherwise * use current time. This introduces a small possibility that * a cookie may be considered expired, but this would only slow * down the new association establishment instead of every packet. */ if (sock_flag(ep->base.sk, SOCK_TIMESTAMP)) kt = skb_get_ktime(skb); else kt = ktime_get_real(); if (!asoc && ktime_before(bear_cookie->expiration, kt)) { suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration)); __be32 n = htonl(usecs); /* * Section 3.3.10.3 Stale Cookie Error (3) * * Cause of error * --------------- * Stale Cookie Error: Indicates the receipt of a valid State * Cookie that has expired. */ *errp = sctp_make_op_error(asoc, chunk, SCTP_ERROR_STALE_COOKIE, &n, sizeof(n), 0); if (*errp) *error = -SCTP_IERROR_STALE_COOKIE; else *error = -SCTP_IERROR_NOMEM; goto fail; } /* Make a new base association. */ scope = sctp_scope(sctp_source(chunk)); retval = sctp_association_new(ep, ep->base.sk, scope, gfp); if (!retval) { *error = -SCTP_IERROR_NOMEM; goto fail; } /* Set up our peer's port number. */ retval->peer.port = ntohs(chunk->sctp_hdr->source); /* Populate the association from the cookie. */ memcpy(&retval->c, bear_cookie, sizeof(*bear_cookie)); if (sctp_assoc_set_bind_addr_from_cookie(retval, bear_cookie, GFP_ATOMIC) < 0) { *error = -SCTP_IERROR_NOMEM; goto fail; } /* Also, add the destination address. */ if (list_empty(&retval->base.bind_addr.address_list)) { sctp_add_bind_addr(&retval->base.bind_addr, &chunk->dest, sizeof(chunk->dest), SCTP_ADDR_SRC, GFP_ATOMIC); } retval->next_tsn = retval->c.initial_tsn; retval->ctsn_ack_point = retval->next_tsn - 1; retval->addip_serial = retval->c.initial_tsn; retval->strreset_outseq = retval->c.initial_tsn; retval->adv_peer_ack_point = retval->ctsn_ack_point; retval->peer.prsctp_capable = retval->c.prsctp_capable; retval->peer.adaptation_ind = retval->c.adaptation_ind; /* The INIT stuff will be done by the side effects. */ return retval; fail: if (retval) sctp_association_free(retval); return NULL; malformed: /* Yikes! The packet is either corrupt or deliberately * malformed. */ *error = -SCTP_IERROR_MALFORMED; goto fail; } /******************************************************************** * 3rd Level Abstractions ********************************************************************/ struct __sctp_missing { __be32 num_missing; __be16 type; } __packed; /* * Report a missing mandatory parameter. */ static int sctp_process_missing_param(const struct sctp_association *asoc, enum sctp_param paramtype, struct sctp_chunk *chunk, struct sctp_chunk **errp) { struct __sctp_missing report; __u16 len; len = SCTP_PAD4(sizeof(report)); /* Make an ERROR chunk, preparing enough room for * returning multiple unknown parameters. */ if (!*errp) *errp = sctp_make_op_error_space(asoc, chunk, len); if (*errp) { report.num_missing = htonl(1); report.type = paramtype; sctp_init_cause(*errp, SCTP_ERROR_MISS_PARAM, sizeof(report)); sctp_addto_chunk(*errp, sizeof(report), &report); } /* Stop processing this chunk. */ return 0; } /* Report an Invalid Mandatory Parameter. */ static int sctp_process_inv_mandatory(const struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_chunk **errp) { /* Invalid Mandatory Parameter Error has no payload. */ if (!*errp) *errp = sctp_make_op_error_space(asoc, chunk, 0); if (*errp) sctp_init_cause(*errp, SCTP_ERROR_INV_PARAM, 0); /* Stop processing this chunk. */ return 0; } static int sctp_process_inv_paramlength(const struct sctp_association *asoc, struct sctp_paramhdr *param, const struct sctp_chunk *chunk, struct sctp_chunk **errp) { /* This is a fatal error. Any accumulated non-fatal errors are * not reported. */ if (*errp) sctp_chunk_free(*errp); /* Create an error chunk and fill it in with our payload. */ *errp = sctp_make_violation_paramlen(asoc, chunk, param); return 0; } /* Do not attempt to handle the HOST_NAME parm. However, do * send back an indicator to the peer. */ static int sctp_process_hn_param(const struct sctp_association *asoc, union sctp_params param, struct sctp_chunk *chunk, struct sctp_chunk **errp) { __u16 len = ntohs(param.p->length); /* Processing of the HOST_NAME parameter will generate an * ABORT. If we've accumulated any non-fatal errors, they * would be unrecognized parameters and we should not include * them in the ABORT. */ if (*errp) sctp_chunk_free(*errp); *errp = sctp_make_op_error(asoc, chunk, SCTP_ERROR_DNS_FAILED, param.v, len, 0); /* Stop processing this chunk. */ return 0; } static int sctp_verify_ext_param(struct net *net, const struct sctp_endpoint *ep, union sctp_params param) { __u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); int have_asconf = 0; int have_auth = 0; int i; for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { case SCTP_CID_AUTH: have_auth = 1; break; case SCTP_CID_ASCONF: case SCTP_CID_ASCONF_ACK: have_asconf = 1; break; } } /* ADD-IP Security: The draft requires us to ABORT or ignore the * INIT/INIT-ACK if ADD-IP is listed, but AUTH is not. Do this * only if ADD-IP is turned on and we are not backward-compatible * mode. */ if (net->sctp.addip_noauth) return 1; if (ep->asconf_enable && !have_auth && have_asconf) return 0; return 1; } static void sctp_process_ext_param(struct sctp_association *asoc, union sctp_params param) { __u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); int i; for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { case SCTP_CID_RECONF: if (asoc->ep->reconf_enable) asoc->peer.reconf_capable = 1; break; case SCTP_CID_FWD_TSN: if (asoc->ep->prsctp_enable) asoc->peer.prsctp_capable = 1; break; case SCTP_CID_AUTH: /* if the peer reports AUTH, assume that he * supports AUTH. */ if (asoc->ep->auth_enable) asoc->peer.auth_capable = 1; break; case SCTP_CID_ASCONF: case SCTP_CID_ASCONF_ACK: if (asoc->ep->asconf_enable) asoc->peer.asconf_capable = 1; break; case SCTP_CID_I_DATA: if (asoc->ep->intl_enable) asoc->peer.intl_capable = 1; break; default: break; } } } /* RFC 3.2.1 & the Implementers Guide 2.2. * * The Parameter Types are encoded such that the * highest-order two bits specify the action that must be * taken if the processing endpoint does not recognize the * Parameter Type. * * 00 - Stop processing this parameter; do not process any further * parameters within this chunk * * 01 - Stop processing this parameter, do not process any further * parameters within this chunk, and report the unrecognized * parameter in an 'Unrecognized Parameter' ERROR chunk. * * 10 - Skip this parameter and continue processing. * * 11 - Skip this parameter and continue processing but * report the unrecognized parameter in an * 'Unrecognized Parameter' ERROR chunk. * * Return value: * SCTP_IERROR_NO_ERROR - continue with the chunk * SCTP_IERROR_ERROR - stop and report an error. * SCTP_IERROR_NOMEME - out of memory. */ static enum sctp_ierror sctp_process_unk_param( const struct sctp_association *asoc, union sctp_params param, struct sctp_chunk *chunk, struct sctp_chunk **errp) { int retval = SCTP_IERROR_NO_ERROR; switch (param.p->type & SCTP_PARAM_ACTION_MASK) { case SCTP_PARAM_ACTION_DISCARD: retval = SCTP_IERROR_ERROR; break; case SCTP_PARAM_ACTION_SKIP: break; case SCTP_PARAM_ACTION_DISCARD_ERR: retval = SCTP_IERROR_ERROR; fallthrough; case SCTP_PARAM_ACTION_SKIP_ERR: /* Make an ERROR chunk, preparing enough room for * returning multiple unknown parameters. */ if (!*errp) { *errp = sctp_make_op_error_limited(asoc, chunk); if (!*errp) { /* If there is no memory for generating the * ERROR report as specified, an ABORT will be * triggered to the peer and the association * won't be established. */ retval = SCTP_IERROR_NOMEM; break; } } if (!sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM, ntohs(param.p->length))) sctp_addto_chunk(*errp, ntohs(param.p->length), param.v); break; default: break; } return retval; } /* Verify variable length parameters * Return values: * SCTP_IERROR_ABORT - trigger an ABORT * SCTP_IERROR_NOMEM - out of memory (abort) * SCTP_IERROR_ERROR - stop processing, trigger an ERROR * SCTP_IERROR_NO_ERROR - continue with the chunk */ static enum sctp_ierror sctp_verify_param(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, union sctp_params param, enum sctp_cid cid, struct sctp_chunk *chunk, struct sctp_chunk **err_chunk) { struct sctp_hmac_algo_param *hmacs; int retval = SCTP_IERROR_NO_ERROR; __u16 n_elt, id = 0; int i; /* FIXME - This routine is not looking at each parameter per the * chunk type, i.e., unrecognized parameters should be further * identified based on the chunk id. */ switch (param.p->type) { case SCTP_PARAM_IPV4_ADDRESS: case SCTP_PARAM_IPV6_ADDRESS: case SCTP_PARAM_COOKIE_PRESERVATIVE: case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES: case SCTP_PARAM_STATE_COOKIE: case SCTP_PARAM_HEARTBEAT_INFO: case SCTP_PARAM_UNRECOGNIZED_PARAMETERS: case SCTP_PARAM_ECN_CAPABLE: case SCTP_PARAM_ADAPTATION_LAYER_IND: break; case SCTP_PARAM_SUPPORTED_EXT: if (!sctp_verify_ext_param(net, ep, param)) return SCTP_IERROR_ABORT; break; case SCTP_PARAM_SET_PRIMARY: if (!ep->asconf_enable) goto unhandled; if (ntohs(param.p->length) < sizeof(struct sctp_addip_param) + sizeof(struct sctp_paramhdr)) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; case SCTP_PARAM_HOST_NAME_ADDRESS: /* This param has been Deprecated, send ABORT. */ sctp_process_hn_param(asoc, param, chunk, err_chunk); retval = SCTP_IERROR_ABORT; break; case SCTP_PARAM_FWD_TSN_SUPPORT: if (ep->prsctp_enable) break; goto unhandled; case SCTP_PARAM_RANDOM: if (!ep->auth_enable) goto unhandled; /* SCTP-AUTH: Secion 6.1 * If the random number is not 32 byte long the association * MUST be aborted. The ABORT chunk SHOULD contain the error * cause 'Protocol Violation'. */ if (SCTP_AUTH_RANDOM_LENGTH != ntohs(param.p->length) - sizeof(struct sctp_paramhdr)) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; case SCTP_PARAM_CHUNKS: if (!ep->auth_enable) goto unhandled; /* SCTP-AUTH: Section 3.2 * The CHUNKS parameter MUST be included once in the INIT or * INIT-ACK chunk if the sender wants to receive authenticated * chunks. Its maximum length is 260 bytes. */ if (260 < ntohs(param.p->length)) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; case SCTP_PARAM_HMAC_ALGO: if (!ep->auth_enable) goto unhandled; hmacs = (struct sctp_hmac_algo_param *)param.p; n_elt = (ntohs(param.p->length) - sizeof(struct sctp_paramhdr)) >> 1; /* SCTP-AUTH: Section 6.1 * The HMAC algorithm based on SHA-1 MUST be supported and * included in the HMAC-ALGO parameter. */ for (i = 0; i < n_elt; i++) { id = ntohs(hmacs->hmac_ids[i]); if (id == SCTP_AUTH_HMAC_ID_SHA1) break; } if (id != SCTP_AUTH_HMAC_ID_SHA1) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; unhandled: default: pr_debug("%s: unrecognized param:%d for chunk:%d\n", __func__, ntohs(param.p->type), cid); retval = sctp_process_unk_param(asoc, param, chunk, err_chunk); break; } return retval; } /* Verify the INIT packet before we process it. */ int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, enum sctp_cid cid, struct sctp_init_chunk *peer_init, struct sctp_chunk *chunk, struct sctp_chunk **errp) { union sctp_params param; bool has_cookie = false; int result; /* Check for missing mandatory parameters. Note: Initial TSN is * also mandatory, but is not checked here since the valid range * is 0..2**32-1. RFC4960, section 3.3.3. */ if (peer_init->init_hdr.num_outbound_streams == 0 || peer_init->init_hdr.num_inbound_streams == 0 || peer_init->init_hdr.init_tag == 0 || ntohl(peer_init->init_hdr.a_rwnd) < SCTP_DEFAULT_MINWINDOW) return sctp_process_inv_mandatory(asoc, chunk, errp); sctp_walk_params(param, peer_init) { if (param.p->type == SCTP_PARAM_STATE_COOKIE) has_cookie = true; } /* There is a possibility that a parameter length was bad and * in that case we would have stoped walking the parameters. * The current param.p would point at the bad one. * Current consensus on the mailing list is to generate a PROTOCOL * VIOLATION error. We build the ERROR chunk here and let the normal * error handling code build and send the packet. */ if (param.v != (void *)chunk->chunk_end) return sctp_process_inv_paramlength(asoc, param.p, chunk, errp); /* The only missing mandatory param possible today is * the state cookie for an INIT-ACK chunk. */ if ((SCTP_CID_INIT_ACK == cid) && !has_cookie) return sctp_process_missing_param(asoc, SCTP_PARAM_STATE_COOKIE, chunk, errp); /* Verify all the variable length parameters */ sctp_walk_params(param, peer_init) { result = sctp_verify_param(net, ep, asoc, param, cid, chunk, errp); switch (result) { case SCTP_IERROR_ABORT: case SCTP_IERROR_NOMEM: return 0; case SCTP_IERROR_ERROR: return 1; case SCTP_IERROR_NO_ERROR: default: break; } } /* for (loop through all parameters) */ return 1; } /* Unpack the parameters in an INIT packet into an association. * Returns 0 on failure, else success. * FIXME: This is an association method. */ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, const union sctp_addr *peer_addr, struct sctp_init_chunk *peer_init, gfp_t gfp) { struct sctp_transport *transport; struct list_head *pos, *temp; union sctp_params param; union sctp_addr addr; struct sctp_af *af; int src_match = 0; /* We must include the address that the INIT packet came from. * This is the only address that matters for an INIT packet. * When processing a COOKIE ECHO, we retrieve the from address * of the INIT from the cookie. */ /* This implementation defaults to making the first transport * added as the primary transport. The source address seems to * be a better choice than any of the embedded addresses. */ asoc->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port; if (!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE)) goto nomem; if (sctp_cmp_addr_exact(sctp_source(chunk), peer_addr)) src_match = 1; /* Process the initialization parameters. */ sctp_walk_params(param, peer_init) { if (!src_match && (param.p->type == SCTP_PARAM_IPV4_ADDRESS || param.p->type == SCTP_PARAM_IPV6_ADDRESS)) { af = sctp_get_af_specific(param_type2af(param.p->type)); if (!af->from_addr_param(&addr, param.addr, chunk->sctp_hdr->source, 0)) continue; if (sctp_cmp_addr_exact(sctp_source(chunk), &addr)) src_match = 1; } if (!sctp_process_param(asoc, param, peer_addr, gfp)) goto clean_up; } /* source address of chunk may not match any valid address */ if (!src_match) goto clean_up; /* AUTH: After processing the parameters, make sure that we * have all the required info to potentially do authentications. */ if (asoc->peer.auth_capable && (!asoc->peer.peer_random || !asoc->peer.peer_hmacs)) asoc->peer.auth_capable = 0; /* In a non-backward compatible mode, if the peer claims * support for ADD-IP but not AUTH, the ADD-IP spec states * that we MUST ABORT the association. Section 6. The section * also give us an option to silently ignore the packet, which * is what we'll do here. */ if (!asoc->base.net->sctp.addip_noauth && (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) { asoc->peer.addip_disabled_mask |= (SCTP_PARAM_ADD_IP | SCTP_PARAM_DEL_IP | SCTP_PARAM_SET_PRIMARY); asoc->peer.asconf_capable = 0; goto clean_up; } /* Walk list of transports, removing transports in the UNKNOWN state. */ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { transport = list_entry(pos, struct sctp_transport, transports); if (transport->state == SCTP_UNKNOWN) { sctp_assoc_rm_peer(asoc, transport); } } /* The fixed INIT headers are always in network byte * order. */ asoc->peer.i.init_tag = ntohl(peer_init->init_hdr.init_tag); asoc->peer.i.a_rwnd = ntohl(peer_init->init_hdr.a_rwnd); asoc->peer.i.num_outbound_streams = ntohs(peer_init->init_hdr.num_outbound_streams); asoc->peer.i.num_inbound_streams = ntohs(peer_init->init_hdr.num_inbound_streams); asoc->peer.i.initial_tsn = ntohl(peer_init->init_hdr.initial_tsn); asoc->strreset_inseq = asoc->peer.i.initial_tsn; /* Apply the upper bounds for output streams based on peer's * number of inbound streams. */ if (asoc->c.sinit_num_ostreams > ntohs(peer_init->init_hdr.num_inbound_streams)) { asoc->c.sinit_num_ostreams = ntohs(peer_init->init_hdr.num_inbound_streams); } if (asoc->c.sinit_max_instreams > ntohs(peer_init->init_hdr.num_outbound_streams)) { asoc->c.sinit_max_instreams = ntohs(peer_init->init_hdr.num_outbound_streams); } /* Copy Initiation tag from INIT to VT_peer in cookie. */ asoc->c.peer_vtag = asoc->peer.i.init_tag; /* Peer Rwnd : Current calculated value of the peer's rwnd. */ asoc->peer.rwnd = asoc->peer.i.a_rwnd; /* RFC 2960 7.2.1 The initial value of ssthresh MAY be arbitrarily * high (for example, implementations MAY use the size of the receiver * advertised window). */ list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { transport->ssthresh = asoc->peer.i.a_rwnd; } /* Set up the TSN tracking pieces. */ if (!sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, asoc->peer.i.initial_tsn, gfp)) goto clean_up; /* RFC 2960 6.5 Stream Identifier and Stream Sequence Number * * The stream sequence number in all the streams shall start * from 0 when the association is established. Also, when the * stream sequence number reaches the value 65535 the next * stream sequence number shall be set to 0. */ if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, asoc->c.sinit_max_instreams, gfp)) goto clean_up; /* Update frag_point when stream_interleave may get changed. */ sctp_assoc_update_frag_point(asoc); if (!asoc->temp && sctp_assoc_set_id(asoc, gfp)) goto clean_up; /* ADDIP Section 4.1 ASCONF Chunk Procedures * * When an endpoint has an ASCONF signaled change to be sent to the * remote endpoint it should do the following: * ... * A2) A serial number should be assigned to the Chunk. The serial * number should be a monotonically increasing number. All serial * numbers are defined to be initialized at the start of the * association to the same value as the Initial TSN. */ asoc->peer.addip_serial = asoc->peer.i.initial_tsn - 1; return 1; clean_up: /* Release the transport structures. */ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { transport = list_entry(pos, struct sctp_transport, transports); if (transport->state != SCTP_ACTIVE) sctp_assoc_rm_peer(asoc, transport); } nomem: return 0; } /* Update asoc with the option described in param. * * RFC2960 3.3.2.1 Optional/Variable Length Parameters in INIT * * asoc is the association to update. * param is the variable length parameter to use for update. * cid tells us if this is an INIT, INIT ACK or COOKIE ECHO. * If the current packet is an INIT we want to minimize the amount of * work we do. In particular, we should not build transport * structures for the addresses. */ static int sctp_process_param(struct sctp_association *asoc, union sctp_params param, const union sctp_addr *peer_addr, gfp_t gfp) { struct sctp_endpoint *ep = asoc->ep; union sctp_addr_param *addr_param; struct net *net = asoc->base.net; struct sctp_transport *t; enum sctp_scope scope; union sctp_addr addr; struct sctp_af *af; int retval = 1, i; u32 stale; __u16 sat; /* We maintain all INIT parameters in network byte order all the * time. This allows us to not worry about whether the parameters * came from a fresh INIT, and INIT ACK, or were stored in a cookie. */ switch (param.p->type) { case SCTP_PARAM_IPV6_ADDRESS: if (PF_INET6 != asoc->base.sk->sk_family) break; goto do_addr_param; case SCTP_PARAM_IPV4_ADDRESS: /* v4 addresses are not allowed on v6-only socket */ if (ipv6_only_sock(asoc->base.sk)) break; do_addr_param: af = sctp_get_af_specific(param_type2af(param.p->type)); if (!af->from_addr_param(&addr, param.addr, htons(asoc->peer.port), 0)) break; scope = sctp_scope(peer_addr); if (sctp_in_scope(net, &addr, scope)) if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_UNCONFIRMED)) return 0; break; case SCTP_PARAM_COOKIE_PRESERVATIVE: if (!net->sctp.cookie_preserve_enable) break; stale = ntohl(param.life->lifespan_increment); /* Suggested Cookie Life span increment's unit is msec, * (1/1000sec). */ asoc->cookie_life = ktime_add_ms(asoc->cookie_life, stale); break; case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES: /* Turn off the default values first so we'll know which * ones are really set by the peer. */ asoc->peer.ipv4_address = 0; asoc->peer.ipv6_address = 0; /* Assume that peer supports the address family * by which it sends a packet. */ if (peer_addr->sa.sa_family == AF_INET6) asoc->peer.ipv6_address = 1; else if (peer_addr->sa.sa_family == AF_INET) asoc->peer.ipv4_address = 1; /* Cycle through address types; avoid divide by 0. */ sat = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); if (sat) sat /= sizeof(__u16); for (i = 0; i < sat; ++i) { switch (param.sat->types[i]) { case SCTP_PARAM_IPV4_ADDRESS: asoc->peer.ipv4_address = 1; break; case SCTP_PARAM_IPV6_ADDRESS: if (PF_INET6 == asoc->base.sk->sk_family) asoc->peer.ipv6_address = 1; break; default: /* Just ignore anything else. */ break; } } break; case SCTP_PARAM_STATE_COOKIE: asoc->peer.cookie_len = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); kfree(asoc->peer.cookie); asoc->peer.cookie = kmemdup(param.cookie->body, asoc->peer.cookie_len, gfp); if (!asoc->peer.cookie) retval = 0; break; case SCTP_PARAM_HEARTBEAT_INFO: /* Would be odd to receive, but it causes no problems. */ break; case SCTP_PARAM_UNRECOGNIZED_PARAMETERS: /* Rejected during verify stage. */ break; case SCTP_PARAM_ECN_CAPABLE: if (asoc->ep->ecn_enable) { asoc->peer.ecn_capable = 1; break; } /* Fall Through */ goto fall_through; case SCTP_PARAM_ADAPTATION_LAYER_IND: asoc->peer.adaptation_ind = ntohl(param.aind->adaptation_ind); break; case SCTP_PARAM_SET_PRIMARY: if (!ep->asconf_enable) goto fall_through; addr_param = param.v + sizeof(struct sctp_addip_param); af = sctp_get_af_specific(param_type2af(addr_param->p.type)); if (!af) break; if (!af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0)) break; if (!af->addr_valid(&addr, NULL, NULL)) break; t = sctp_assoc_lookup_paddr(asoc, &addr); if (!t) break; sctp_assoc_set_primary(asoc, t); break; case SCTP_PARAM_SUPPORTED_EXT: sctp_process_ext_param(asoc, param); break; case SCTP_PARAM_FWD_TSN_SUPPORT: if (asoc->ep->prsctp_enable) { asoc->peer.prsctp_capable = 1; break; } /* Fall Through */ goto fall_through; case SCTP_PARAM_RANDOM: if (!ep->auth_enable) goto fall_through; /* Save peer's random parameter */ kfree(asoc->peer.peer_random); asoc->peer.peer_random = kmemdup(param.p, ntohs(param.p->length), gfp); if (!asoc->peer.peer_random) { retval = 0; break; } break; case SCTP_PARAM_HMAC_ALGO: if (!ep->auth_enable) goto fall_through; /* Save peer's HMAC list */ kfree(asoc->peer.peer_hmacs); asoc->peer.peer_hmacs = kmemdup(param.p, ntohs(param.p->length), gfp); if (!asoc->peer.peer_hmacs) { retval = 0; break; } /* Set the default HMAC the peer requested*/ sctp_auth_asoc_set_default_hmac(asoc, param.hmac_algo); break; case SCTP_PARAM_CHUNKS: if (!ep->auth_enable) goto fall_through; kfree(asoc->peer.peer_chunks); asoc->peer.peer_chunks = kmemdup(param.p, ntohs(param.p->length), gfp); if (!asoc->peer.peer_chunks) retval = 0; break; fall_through: default: /* Any unrecognized parameters should have been caught * and handled by sctp_verify_param() which should be * called prior to this routine. Simply log the error * here. */ pr_debug("%s: ignoring param:%d for association:%p.\n", __func__, ntohs(param.p->type), asoc); break; } return retval; } /* Select a new verification tag. */ __u32 sctp_generate_tag(const struct sctp_endpoint *ep) { /* I believe that this random number generator complies with RFC1750. * A tag of 0 is reserved for special cases (e.g. INIT). */ __u32 x; do { get_random_bytes(&x, sizeof(__u32)); } while (x == 0); return x; } /* Select an initial TSN to send during startup. */ __u32 sctp_generate_tsn(const struct sctp_endpoint *ep) { __u32 retval; get_random_bytes(&retval, sizeof(__u32)); return retval; } /* * ADDIP 3.1.1 Address Configuration Change Chunk (ASCONF) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0xC1 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Serial Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter #1 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / .... / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter #N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Address Parameter and other parameter will not be wrapped in this function */ static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc, union sctp_addr *addr, int vparam_len) { struct sctp_addiphdr asconf; struct sctp_chunk *retval; int length = sizeof(asconf) + vparam_len; union sctp_addr_param addrparam; int addrlen; struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family); addrlen = af->to_addr_param(addr, &addrparam); if (!addrlen) return NULL; length += addrlen; /* Create the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length, GFP_ATOMIC); if (!retval) return NULL; asconf.serial = htonl(asoc->addip_serial++); retval->subh.addip_hdr = sctp_addto_chunk(retval, sizeof(asconf), &asconf); retval->param_hdr.v = sctp_addto_chunk(retval, addrlen, &addrparam); return retval; } /* ADDIP * 3.2.1 Add IP Address * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0xC001 | Length = Variable | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF-Request Correlation ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * 3.2.2 Delete IP Address * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0xC002 | Length = Variable | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF-Request Correlation ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * */ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc, union sctp_addr *laddr, struct sockaddr *addrs, int addrcnt, __be16 flags) { union sctp_addr_param addr_param; struct sctp_addip_param param; int paramlen = sizeof(param); struct sctp_chunk *retval; int addr_param_len = 0; union sctp_addr *addr; int totallen = 0, i; int del_pickup = 0; struct sctp_af *af; void *addr_buf; /* Get total length of all the address parameters. */ addr_buf = addrs; for (i = 0; i < addrcnt; i++) { addr = addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); addr_param_len = af->to_addr_param(addr, &addr_param); totallen += paramlen; totallen += addr_param_len; addr_buf += af->sockaddr_len; if (asoc->asconf_addr_del_pending && !del_pickup) { /* reuse the parameter length from the same scope one */ totallen += paramlen; totallen += addr_param_len; del_pickup = 1; pr_debug("%s: picked same-scope del_pending addr, " "totallen for all addresses is %d\n", __func__, totallen); } } /* Create an asconf chunk with the required length. */ retval = sctp_make_asconf(asoc, laddr, totallen); if (!retval) return NULL; /* Add the address parameters to the asconf chunk. */ addr_buf = addrs; for (i = 0; i < addrcnt; i++) { addr = addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); addr_param_len = af->to_addr_param(addr, &addr_param); param.param_hdr.type = flags; param.param_hdr.length = htons(paramlen + addr_param_len); param.crr_id = htonl(i); sctp_addto_chunk(retval, paramlen, &param); sctp_addto_chunk(retval, addr_param_len, &addr_param); addr_buf += af->sockaddr_len; } if (flags == SCTP_PARAM_ADD_IP && del_pickup) { addr = asoc->asconf_addr_del_pending; af = sctp_get_af_specific(addr->v4.sin_family); addr_param_len = af->to_addr_param(addr, &addr_param); param.param_hdr.type = SCTP_PARAM_DEL_IP; param.param_hdr.length = htons(paramlen + addr_param_len); param.crr_id = htonl(i); sctp_addto_chunk(retval, paramlen, &param); sctp_addto_chunk(retval, addr_param_len, &addr_param); } return retval; } /* ADDIP * 3.2.4 Set Primary IP Address * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type =0xC004 | Length = Variable | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF-Request Correlation ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Create an ASCONF chunk with Set Primary IP address parameter. */ struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc, union sctp_addr *addr) { struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family); union sctp_addr_param addrparam; struct sctp_addip_param param; struct sctp_chunk *retval; int len = sizeof(param); int addrlen; addrlen = af->to_addr_param(addr, &addrparam); if (!addrlen) return NULL; len += addrlen; /* Create the chunk and make asconf header. */ retval = sctp_make_asconf(asoc, addr, len); if (!retval) return NULL; param.param_hdr.type = SCTP_PARAM_SET_PRIMARY; param.param_hdr.length = htons(len); param.crr_id = 0; sctp_addto_chunk(retval, sizeof(param), &param); sctp_addto_chunk(retval, addrlen, &addrparam); return retval; } /* ADDIP 3.1.2 Address Configuration Acknowledgement Chunk (ASCONF-ACK) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0x80 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Serial Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter Response#1 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / .... / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter Response#N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Create an ASCONF_ACK chunk with enough space for the parameter responses. */ static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *asoc, __u32 serial, int vparam_len) { struct sctp_addiphdr asconf; struct sctp_chunk *retval; int length = sizeof(asconf) + vparam_len; /* Create the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length, GFP_ATOMIC); if (!retval) return NULL; asconf.serial = htonl(serial); retval->subh.addip_hdr = sctp_addto_chunk(retval, sizeof(asconf), &asconf); return retval; } /* Add response parameters to an ASCONF_ACK chunk. */ static void sctp_add_asconf_response(struct sctp_chunk *chunk, __be32 crr_id, __be16 err_code, struct sctp_addip_param *asconf_param) { struct sctp_addip_param ack_param; struct sctp_errhdr err_param; int asconf_param_len = 0; int err_param_len = 0; __be16 response_type; if (SCTP_ERROR_NO_ERROR == err_code) { response_type = SCTP_PARAM_SUCCESS_REPORT; } else { response_type = SCTP_PARAM_ERR_CAUSE; err_param_len = sizeof(err_param); if (asconf_param) asconf_param_len = ntohs(asconf_param->param_hdr.length); } /* Add Success Indication or Error Cause Indication parameter. */ ack_param.param_hdr.type = response_type; ack_param.param_hdr.length = htons(sizeof(ack_param) + err_param_len + asconf_param_len); ack_param.crr_id = crr_id; sctp_addto_chunk(chunk, sizeof(ack_param), &ack_param); if (SCTP_ERROR_NO_ERROR == err_code) return; /* Add Error Cause parameter. */ err_param.cause = err_code; err_param.length = htons(err_param_len + asconf_param_len); sctp_addto_chunk(chunk, err_param_len, &err_param); /* Add the failed TLV copied from ASCONF chunk. */ if (asconf_param) sctp_addto_chunk(chunk, asconf_param_len, asconf_param); } /* Process a asconf parameter. */ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, struct sctp_chunk *asconf, struct sctp_addip_param *asconf_param) { union sctp_addr_param *addr_param; struct sctp_transport *peer; union sctp_addr addr; struct sctp_af *af; addr_param = (void *)asconf_param + sizeof(*asconf_param); if (asconf_param->param_hdr.type != SCTP_PARAM_ADD_IP && asconf_param->param_hdr.type != SCTP_PARAM_DEL_IP && asconf_param->param_hdr.type != SCTP_PARAM_SET_PRIMARY) return SCTP_ERROR_UNKNOWN_PARAM; switch (addr_param->p.type) { case SCTP_PARAM_IPV6_ADDRESS: if (!asoc->peer.ipv6_address) return SCTP_ERROR_DNS_FAILED; break; case SCTP_PARAM_IPV4_ADDRESS: if (!asoc->peer.ipv4_address) return SCTP_ERROR_DNS_FAILED; break; default: return SCTP_ERROR_DNS_FAILED; } af = sctp_get_af_specific(param_type2af(addr_param->p.type)); if (unlikely(!af)) return SCTP_ERROR_DNS_FAILED; if (!af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0)) return SCTP_ERROR_DNS_FAILED; /* ADDIP 4.2.1 This parameter MUST NOT contain a broadcast * or multicast address. * (note: wildcard is permitted and requires special handling so * make sure we check for that) */ if (!af->is_any(&addr) && !af->addr_valid(&addr, NULL, asconf->skb)) return SCTP_ERROR_DNS_FAILED; switch (asconf_param->param_hdr.type) { case SCTP_PARAM_ADD_IP: /* Section 4.2.1: * If the address 0.0.0.0 or ::0 is provided, the source * address of the packet MUST be added. */ if (af->is_any(&addr)) memcpy(&addr, &asconf->source, sizeof(addr)); if (security_sctp_bind_connect(asoc->ep->base.sk, SCTP_PARAM_ADD_IP, (struct sockaddr *)&addr, af->sockaddr_len)) return SCTP_ERROR_REQ_REFUSED; /* ADDIP 4.3 D9) If an endpoint receives an ADD IP address * request and does not have the local resources to add this * new address to the association, it MUST return an Error * Cause TLV set to the new error code 'Operation Refused * Due to Resource Shortage'. */ peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC, SCTP_UNCONFIRMED); if (!peer) return SCTP_ERROR_RSRC_LOW; /* Start the heartbeat timer. */ sctp_transport_reset_hb_timer(peer); asoc->new_transport = peer; break; case SCTP_PARAM_DEL_IP: /* ADDIP 4.3 D7) If a request is received to delete the * last remaining IP address of a peer endpoint, the receiver * MUST send an Error Cause TLV with the error cause set to the * new error code 'Request to Delete Last Remaining IP Address'. */ if (asoc->peer.transport_count == 1) return SCTP_ERROR_DEL_LAST_IP; /* ADDIP 4.3 D8) If a request is received to delete an IP * address which is also the source address of the IP packet * which contained the ASCONF chunk, the receiver MUST reject * this request. To reject the request the receiver MUST send * an Error Cause TLV set to the new error code 'Request to * Delete Source IP Address' */ if (sctp_cmp_addr_exact(&asconf->source, &addr)) return SCTP_ERROR_DEL_SRC_IP; /* Section 4.2.2 * If the address 0.0.0.0 or ::0 is provided, all * addresses of the peer except the source address of the * packet MUST be deleted. */ if (af->is_any(&addr)) { sctp_assoc_set_primary(asoc, asconf->transport); sctp_assoc_del_nonprimary_peers(asoc, asconf->transport); return SCTP_ERROR_NO_ERROR; } /* If the address is not part of the association, the * ASCONF-ACK with Error Cause Indication Parameter * which including cause of Unresolvable Address should * be sent. */ peer = sctp_assoc_lookup_paddr(asoc, &addr); if (!peer) return SCTP_ERROR_DNS_FAILED; sctp_assoc_rm_peer(asoc, peer); break; case SCTP_PARAM_SET_PRIMARY: /* ADDIP Section 4.2.4 * If the address 0.0.0.0 or ::0 is provided, the receiver * MAY mark the source address of the packet as its * primary. */ if (af->is_any(&addr)) memcpy(&addr, sctp_source(asconf), sizeof(addr)); if (security_sctp_bind_connect(asoc->ep->base.sk, SCTP_PARAM_SET_PRIMARY, (struct sockaddr *)&addr, af->sockaddr_len)) return SCTP_ERROR_REQ_REFUSED; peer = sctp_assoc_lookup_paddr(asoc, &addr); if (!peer) return SCTP_ERROR_DNS_FAILED; sctp_assoc_set_primary(asoc, peer); break; } return SCTP_ERROR_NO_ERROR; } /* Verify the ASCONF packet before we process it. */ bool sctp_verify_asconf(const struct sctp_association *asoc, struct sctp_chunk *chunk, bool addr_param_needed, struct sctp_paramhdr **errp) { struct sctp_addip_chunk *addip; bool addr_param_seen = false; union sctp_params param; addip = (struct sctp_addip_chunk *)chunk->chunk_hdr; sctp_walk_params(param, addip) { size_t length = ntohs(param.p->length); *errp = param.p; switch (param.p->type) { case SCTP_PARAM_ERR_CAUSE: break; case SCTP_PARAM_IPV4_ADDRESS: if (length != sizeof(struct sctp_ipv4addr_param)) return false; /* ensure there is only one addr param and it's in the * beginning of addip_hdr params, or we reject it. */ if (param.v != (addip + 1)) return false; addr_param_seen = true; break; case SCTP_PARAM_IPV6_ADDRESS: if (length != sizeof(struct sctp_ipv6addr_param)) return false; if (param.v != (addip + 1)) return false; addr_param_seen = true; break; case SCTP_PARAM_ADD_IP: case SCTP_PARAM_DEL_IP: case SCTP_PARAM_SET_PRIMARY: /* In ASCONF chunks, these need to be first. */ if (addr_param_needed && !addr_param_seen) return false; length = ntohs(param.addip->param_hdr.length); if (length < sizeof(struct sctp_addip_param) + sizeof(**errp)) return false; break; case SCTP_PARAM_SUCCESS_REPORT: case SCTP_PARAM_ADAPTATION_LAYER_IND: if (length != sizeof(struct sctp_addip_param)) return false; break; default: /* This is unknown to us, reject! */ return false; } } /* Remaining sanity checks. */ if (addr_param_needed && !addr_param_seen) return false; if (!addr_param_needed && addr_param_seen) return false; if (param.v != chunk->chunk_end) return false; return true; } /* Process an incoming ASCONF chunk with the next expected serial no. and * return an ASCONF_ACK chunk to be sent in response. */ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, struct sctp_chunk *asconf) { union sctp_addr_param *addr_param; struct sctp_addip_chunk *addip; struct sctp_chunk *asconf_ack; bool all_param_pass = true; struct sctp_addiphdr *hdr; int length = 0, chunk_len; union sctp_params param; __be16 err_code; __u32 serial; addip = (struct sctp_addip_chunk *)asconf->chunk_hdr; chunk_len = ntohs(asconf->chunk_hdr->length) - sizeof(struct sctp_chunkhdr); hdr = (struct sctp_addiphdr *)asconf->skb->data; serial = ntohl(hdr->serial); /* Skip the addiphdr and store a pointer to address parameter. */ length = sizeof(*hdr); addr_param = (union sctp_addr_param *)(asconf->skb->data + length); chunk_len -= length; /* Skip the address parameter and store a pointer to the first * asconf parameter. */ length = ntohs(addr_param->p.length); chunk_len -= length; /* create an ASCONF_ACK chunk. * Based on the definitions of parameters, we know that the size of * ASCONF_ACK parameters are less than or equal to the fourfold of ASCONF * parameters. */ asconf_ack = sctp_make_asconf_ack(asoc, serial, chunk_len * 4); if (!asconf_ack) goto done; /* Process the TLVs contained within the ASCONF chunk. */ sctp_walk_params(param, addip) { /* Skip preceding address parameters. */ if (param.p->type == SCTP_PARAM_IPV4_ADDRESS || param.p->type == SCTP_PARAM_IPV6_ADDRESS) continue; err_code = sctp_process_asconf_param(asoc, asconf, param.addip); /* ADDIP 4.1 A7) * If an error response is received for a TLV parameter, * all TLVs with no response before the failed TLV are * considered successful if not reported. All TLVs after * the failed response are considered unsuccessful unless * a specific success indication is present for the parameter. */ if (err_code != SCTP_ERROR_NO_ERROR) all_param_pass = false; if (!all_param_pass) sctp_add_asconf_response(asconf_ack, param.addip->crr_id, err_code, param.addip); /* ADDIP 4.3 D11) When an endpoint receiving an ASCONF to add * an IP address sends an 'Out of Resource' in its response, it * MUST also fail any subsequent add or delete requests bundled * in the ASCONF. */ if (err_code == SCTP_ERROR_RSRC_LOW) goto done; } done: asoc->peer.addip_serial++; /* If we are sending a new ASCONF_ACK hold a reference to it in assoc * after freeing the reference to old asconf ack if any. */ if (asconf_ack) { sctp_chunk_hold(asconf_ack); list_add_tail(&asconf_ack->transmitted_list, &asoc->asconf_ack_list); } return asconf_ack; } /* Process a asconf parameter that is successfully acked. */ static void sctp_asconf_param_success(struct sctp_association *asoc, struct sctp_addip_param *asconf_param) { struct sctp_bind_addr *bp = &asoc->base.bind_addr; union sctp_addr_param *addr_param; struct sctp_sockaddr_entry *saddr; struct sctp_transport *transport; union sctp_addr addr; struct sctp_af *af; addr_param = (void *)asconf_param + sizeof(*asconf_param); /* We have checked the packet before, so we do not check again. */ af = sctp_get_af_specific(param_type2af(addr_param->p.type)); if (!af->from_addr_param(&addr, addr_param, htons(bp->port), 0)) return; switch (asconf_param->param_hdr.type) { case SCTP_PARAM_ADD_IP: /* This is always done in BH context with a socket lock * held, so the list can not change. */ local_bh_disable(); list_for_each_entry(saddr, &bp->address_list, list) { if (sctp_cmp_addr_exact(&saddr->a, &addr)) saddr->state = SCTP_ADDR_SRC; } local_bh_enable(); list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { sctp_transport_dst_release(transport); } break; case SCTP_PARAM_DEL_IP: local_bh_disable(); sctp_del_bind_addr(bp, &addr); if (asoc->asconf_addr_del_pending != NULL && sctp_cmp_addr_exact(asoc->asconf_addr_del_pending, &addr)) { kfree(asoc->asconf_addr_del_pending); asoc->asconf_addr_del_pending = NULL; } local_bh_enable(); list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { sctp_transport_dst_release(transport); } break; default: break; } } /* Get the corresponding ASCONF response error code from the ASCONF_ACK chunk * for the given asconf parameter. If there is no response for this parameter, * return the error code based on the third argument 'no_err'. * ADDIP 4.1 * A7) If an error response is received for a TLV parameter, all TLVs with no * response before the failed TLV are considered successful if not reported. * All TLVs after the failed response are considered unsuccessful unless a * specific success indication is present for the parameter. */ static __be16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack, struct sctp_addip_param *asconf_param, int no_err) { struct sctp_addip_param *asconf_ack_param; struct sctp_errhdr *err_param; int asconf_ack_len; __be16 err_code; int length; if (no_err) err_code = SCTP_ERROR_NO_ERROR; else err_code = SCTP_ERROR_REQ_REFUSED; asconf_ack_len = ntohs(asconf_ack->chunk_hdr->length) - sizeof(struct sctp_chunkhdr); /* Skip the addiphdr from the asconf_ack chunk and store a pointer to * the first asconf_ack parameter. */ length = sizeof(struct sctp_addiphdr); asconf_ack_param = (struct sctp_addip_param *)(asconf_ack->skb->data + length); asconf_ack_len -= length; while (asconf_ack_len > 0) { if (asconf_ack_param->crr_id == asconf_param->crr_id) { switch (asconf_ack_param->param_hdr.type) { case SCTP_PARAM_SUCCESS_REPORT: return SCTP_ERROR_NO_ERROR; case SCTP_PARAM_ERR_CAUSE: length = sizeof(*asconf_ack_param); err_param = (void *)asconf_ack_param + length; asconf_ack_len -= length; if (asconf_ack_len > 0) return err_param->cause; else return SCTP_ERROR_INV_PARAM; break; default: return SCTP_ERROR_INV_PARAM; } } length = ntohs(asconf_ack_param->param_hdr.length); asconf_ack_param = (void *)asconf_ack_param + length; asconf_ack_len -= length; } return err_code; } /* Process an incoming ASCONF_ACK chunk against the cached last ASCONF chunk. */ int sctp_process_asconf_ack(struct sctp_association *asoc, struct sctp_chunk *asconf_ack) { struct sctp_chunk *asconf = asoc->addip_last_asconf; struct sctp_addip_param *asconf_param; __be16 err_code = SCTP_ERROR_NO_ERROR; union sctp_addr_param *addr_param; int asconf_len = asconf->skb->len; int all_param_pass = 0; int length = 0; int no_err = 1; int retval = 0; /* Skip the chunkhdr and addiphdr from the last asconf sent and store * a pointer to address parameter. */ length = sizeof(struct sctp_addip_chunk); addr_param = (union sctp_addr_param *)(asconf->skb->data + length); asconf_len -= length; /* Skip the address parameter in the last asconf sent and store a * pointer to the first asconf parameter. */ length = ntohs(addr_param->p.length); asconf_param = (void *)addr_param + length; asconf_len -= length; /* ADDIP 4.1 * A8) If there is no response(s) to specific TLV parameter(s), and no * failures are indicated, then all request(s) are considered * successful. */ if (asconf_ack->skb->len == sizeof(struct sctp_addiphdr)) all_param_pass = 1; /* Process the TLVs contained in the last sent ASCONF chunk. */ while (asconf_len > 0) { if (all_param_pass) err_code = SCTP_ERROR_NO_ERROR; else { err_code = sctp_get_asconf_response(asconf_ack, asconf_param, no_err); if (no_err && (SCTP_ERROR_NO_ERROR != err_code)) no_err = 0; } switch (err_code) { case SCTP_ERROR_NO_ERROR: sctp_asconf_param_success(asoc, asconf_param); break; case SCTP_ERROR_RSRC_LOW: retval = 1; break; case SCTP_ERROR_UNKNOWN_PARAM: /* Disable sending this type of asconf parameter in * future. */ asoc->peer.addip_disabled_mask |= asconf_param->param_hdr.type; break; case SCTP_ERROR_REQ_REFUSED: case SCTP_ERROR_DEL_LAST_IP: case SCTP_ERROR_DEL_SRC_IP: default: break; } /* Skip the processed asconf parameter and move to the next * one. */ length = ntohs(asconf_param->param_hdr.length); asconf_param = (void *)asconf_param + length; asconf_len -= length; } if (no_err && asoc->src_out_of_asoc_ok) { asoc->src_out_of_asoc_ok = 0; sctp_transport_immediate_rtx(asoc->peer.primary_path); } /* Free the cached last sent asconf chunk. */ list_del_init(&asconf->transmitted_list); sctp_chunk_free(asconf); asoc->addip_last_asconf = NULL; return retval; } /* Make a FWD TSN chunk. */ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, __u32 new_cum_tsn, size_t nstreams, struct sctp_fwdtsn_skip *skiplist) { struct sctp_chunk *retval = NULL; struct sctp_fwdtsn_hdr ftsn_hdr; struct sctp_fwdtsn_skip skip; size_t hint; int i; hint = (nstreams + 1) * sizeof(__u32); retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint, GFP_ATOMIC); if (!retval) return NULL; ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn); retval->subh.fwdtsn_hdr = sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr); for (i = 0; i < nstreams; i++) { skip.stream = skiplist[i].stream; skip.ssn = skiplist[i].ssn; sctp_addto_chunk(retval, sizeof(skip), &skip); } return retval; } struct sctp_chunk *sctp_make_ifwdtsn(const struct sctp_association *asoc, __u32 new_cum_tsn, size_t nstreams, struct sctp_ifwdtsn_skip *skiplist) { struct sctp_chunk *retval = NULL; struct sctp_ifwdtsn_hdr ftsn_hdr; size_t hint; hint = (nstreams + 1) * sizeof(__u32); retval = sctp_make_control(asoc, SCTP_CID_I_FWD_TSN, 0, hint, GFP_ATOMIC); if (!retval) return NULL; ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn); retval->subh.ifwdtsn_hdr = sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr); sctp_addto_chunk(retval, nstreams * sizeof(skiplist[0]), skiplist); return retval; } /* RE-CONFIG 3.1 (RE-CONFIG chunk) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 130 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / Re-configuration Parameter / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / Re-configuration Parameter (optional) / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ static struct sctp_chunk *sctp_make_reconf(const struct sctp_association *asoc, int length) { struct sctp_reconf_chunk *reconf; struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_RECONF, 0, length, GFP_ATOMIC); if (!retval) return NULL; reconf = (struct sctp_reconf_chunk *)retval->chunk_hdr; retval->param_hdr.v = (u8 *)(reconf + 1); return retval; } /* RE-CONFIG 4.1 (STREAM OUT RESET) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 13 | Parameter Length = 16 + 2 * N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Response Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Sender's Last Assigned TSN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number 1 (optional) | Stream Number 2 (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * / ...... / * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number N-1 (optional) | Stream Number N (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * RE-CONFIG 4.2 (STREAM IN RESET) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 14 | Parameter Length = 8 + 2 * N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number 1 (optional) | Stream Number 2 (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * / ...... / * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number N-1 (optional) | Stream Number N (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_req( const struct sctp_association *asoc, __u16 stream_num, __be16 *stream_list, bool out, bool in) { __u16 stream_len = stream_num * sizeof(__u16); struct sctp_strreset_outreq outreq; struct sctp_strreset_inreq inreq; struct sctp_chunk *retval; __u16 outlen, inlen; outlen = (sizeof(outreq) + stream_len) * out; inlen = (sizeof(inreq) + stream_len) * in; retval = sctp_make_reconf(asoc, SCTP_PAD4(outlen) + SCTP_PAD4(inlen)); if (!retval) return NULL; if (outlen) { outreq.param_hdr.type = SCTP_PARAM_RESET_OUT_REQUEST; outreq.param_hdr.length = htons(outlen); outreq.request_seq = htonl(asoc->strreset_outseq); outreq.response_seq = htonl(asoc->strreset_inseq - 1); outreq.send_reset_at_tsn = htonl(asoc->next_tsn - 1); sctp_addto_chunk(retval, sizeof(outreq), &outreq); if (stream_len) sctp_addto_chunk(retval, stream_len, stream_list); } if (inlen) { inreq.param_hdr.type = SCTP_PARAM_RESET_IN_REQUEST; inreq.param_hdr.length = htons(inlen); inreq.request_seq = htonl(asoc->strreset_outseq + out); sctp_addto_chunk(retval, sizeof(inreq), &inreq); if (stream_len) sctp_addto_chunk(retval, stream_len, stream_list); } return retval; } /* RE-CONFIG 4.3 (SSN/TSN RESET ALL) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 15 | Parameter Length = 8 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_tsnreq( const struct sctp_association *asoc) { struct sctp_strreset_tsnreq tsnreq; __u16 length = sizeof(tsnreq); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, length); if (!retval) return NULL; tsnreq.param_hdr.type = SCTP_PARAM_RESET_TSN_REQUEST; tsnreq.param_hdr.length = htons(length); tsnreq.request_seq = htonl(asoc->strreset_outseq); sctp_addto_chunk(retval, sizeof(tsnreq), &tsnreq); return retval; } /* RE-CONFIG 4.5/4.6 (ADD STREAM) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 17 | Parameter Length = 12 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Number of new streams | Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_addstrm( const struct sctp_association *asoc, __u16 out, __u16 in) { struct sctp_strreset_addstrm addstrm; __u16 size = sizeof(addstrm); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, (!!out + !!in) * size); if (!retval) return NULL; if (out) { addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_OUT_STREAMS; addstrm.param_hdr.length = htons(size); addstrm.number_of_streams = htons(out); addstrm.request_seq = htonl(asoc->strreset_outseq); addstrm.reserved = 0; sctp_addto_chunk(retval, size, &addstrm); } if (in) { addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_IN_STREAMS; addstrm.param_hdr.length = htons(size); addstrm.number_of_streams = htons(in); addstrm.request_seq = htonl(asoc->strreset_outseq + !!out); addstrm.reserved = 0; sctp_addto_chunk(retval, size, &addstrm); } return retval; } /* RE-CONFIG 4.4 (RESP) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 16 | Parameter Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Response Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Result | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_resp(const struct sctp_association *asoc, __u32 result, __u32 sn) { struct sctp_strreset_resp resp; __u16 length = sizeof(resp); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, length); if (!retval) return NULL; resp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE; resp.param_hdr.length = htons(length); resp.response_seq = htonl(sn); resp.result = htonl(result); sctp_addto_chunk(retval, sizeof(resp), &resp); return retval; } /* RE-CONFIG 4.4 OPTIONAL (TSNRESP) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 16 | Parameter Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Response Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Result | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Sender's Next TSN (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Receiver's Next TSN (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_tsnresp(struct sctp_association *asoc, __u32 result, __u32 sn, __u32 sender_tsn, __u32 receiver_tsn) { struct sctp_strreset_resptsn tsnresp; __u16 length = sizeof(tsnresp); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, length); if (!retval) return NULL; tsnresp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE; tsnresp.param_hdr.length = htons(length); tsnresp.response_seq = htonl(sn); tsnresp.result = htonl(result); tsnresp.senders_next_tsn = htonl(sender_tsn); tsnresp.receivers_next_tsn = htonl(receiver_tsn); sctp_addto_chunk(retval, sizeof(tsnresp), &tsnresp); return retval; } bool sctp_verify_reconf(const struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_paramhdr **errp) { struct sctp_reconf_chunk *hdr; union sctp_params param; __be16 last = 0; __u16 cnt = 0; hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr; sctp_walk_params(param, hdr) { __u16 length = ntohs(param.p->length); *errp = param.p; if (cnt++ > 2) return false; switch (param.p->type) { case SCTP_PARAM_RESET_OUT_REQUEST: if (length < sizeof(struct sctp_strreset_outreq) || (last && last != SCTP_PARAM_RESET_RESPONSE && last != SCTP_PARAM_RESET_IN_REQUEST)) return false; break; case SCTP_PARAM_RESET_IN_REQUEST: if (length < sizeof(struct sctp_strreset_inreq) || (last && last != SCTP_PARAM_RESET_OUT_REQUEST)) return false; break; case SCTP_PARAM_RESET_RESPONSE: if ((length != sizeof(struct sctp_strreset_resp) && length != sizeof(struct sctp_strreset_resptsn)) || (last && last != SCTP_PARAM_RESET_RESPONSE && last != SCTP_PARAM_RESET_OUT_REQUEST)) return false; break; case SCTP_PARAM_RESET_TSN_REQUEST: if (length != sizeof(struct sctp_strreset_tsnreq) || last) return false; break; case SCTP_PARAM_RESET_ADD_IN_STREAMS: if (length != sizeof(struct sctp_strreset_addstrm) || (last && last != SCTP_PARAM_RESET_ADD_OUT_STREAMS)) return false; break; case SCTP_PARAM_RESET_ADD_OUT_STREAMS: if (length != sizeof(struct sctp_strreset_addstrm) || (last && last != SCTP_PARAM_RESET_ADD_IN_STREAMS)) return false; break; default: return false; } last = param.p->type; } return true; }
8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 // SPDX-License-Identifier: GPL-2.0-only /* * * Author Karsten Keil <kkeil@novell.com> * * Copyright 2008 by Karsten Keil <kkeil@novell.com> */ #include <linux/slab.h> #include <linux/mISDNif.h> #include <linux/kthread.h> #include <linux/sched.h> #include <linux/sched/cputime.h> #include <linux/signal.h> #include "core.h" static u_int *debug; static inline void _queue_message(struct mISDNstack *st, struct sk_buff *skb) { struct mISDNhead *hh = mISDN_HEAD_P(skb); if (*debug & DEBUG_QUEUE_FUNC) printk(KERN_DEBUG "%s prim(%x) id(%x) %p\n", __func__, hh->prim, hh->id, skb); skb_queue_tail(&st->msgq, skb); if (likely(!test_bit(mISDN_STACK_STOPPED, &st->status))) { test_and_set_bit(mISDN_STACK_WORK, &st->status); wake_up_interruptible(&st->workq); } } static int mISDN_queue_message(struct mISDNchannel *ch, struct sk_buff *skb) { _queue_message(ch->st, skb); return 0; } static struct mISDNchannel * get_channel4id(struct mISDNstack *st, u_int id) { struct mISDNchannel *ch; mutex_lock(&st->lmutex); list_for_each_entry(ch, &st->layer2, list) { if (id == ch->nr) goto unlock; } ch = NULL; unlock: mutex_unlock(&st->lmutex); return ch; } static void send_socklist(struct mISDN_sock_list *sl, struct sk_buff *skb) { struct sock *sk; struct sk_buff *cskb = NULL; read_lock(&sl->lock); sk_for_each(sk, &sl->head) { if (sk->sk_state != MISDN_BOUND) continue; if (!cskb) cskb = skb_copy(skb, GFP_ATOMIC); if (!cskb) { printk(KERN_WARNING "%s no skb\n", __func__); break; } if (!sock_queue_rcv_skb(sk, cskb)) cskb = NULL; } read_unlock(&sl->lock); dev_kfree_skb(cskb); } static void send_layer2(struct mISDNstack *st, struct sk_buff *skb) { struct sk_buff *cskb; struct mISDNhead *hh = mISDN_HEAD_P(skb); struct mISDNchannel *ch; int ret; if (!st) return; mutex_lock(&st->lmutex); if ((hh->id & MISDN_ID_ADDR_MASK) == MISDN_ID_ANY) { /* L2 for all */ list_for_each_entry(ch, &st->layer2, list) { if (list_is_last(&ch->list, &st->layer2)) { cskb = skb; skb = NULL; } else { cskb = skb_copy(skb, GFP_KERNEL); } if (cskb) { ret = ch->send(ch, cskb); if (ret) { if (*debug & DEBUG_SEND_ERR) printk(KERN_DEBUG "%s ch%d prim(%x) addr(%x)" " err %d\n", __func__, ch->nr, hh->prim, ch->addr, ret); dev_kfree_skb(cskb); } } else { printk(KERN_WARNING "%s ch%d addr %x no mem\n", __func__, ch->nr, ch->addr); goto out; } } } else { list_for_each_entry(ch, &st->layer2, list) { if ((hh->id & MISDN_ID_ADDR_MASK) == ch->addr) { ret = ch->send(ch, skb); if (!ret) skb = NULL; goto out; } } ret = st->dev->teimgr->ctrl(st->dev->teimgr, CHECK_DATA, skb); if (!ret) skb = NULL; else if (*debug & DEBUG_SEND_ERR) printk(KERN_DEBUG "%s mgr prim(%x) err %d\n", __func__, hh->prim, ret); } out: mutex_unlock(&st->lmutex); dev_kfree_skb(skb); } static inline int send_msg_to_layer(struct mISDNstack *st, struct sk_buff *skb) { struct mISDNhead *hh = mISDN_HEAD_P(skb); struct mISDNchannel *ch; int lm; lm = hh->prim & MISDN_LAYERMASK; if (*debug & DEBUG_QUEUE_FUNC) printk(KERN_DEBUG "%s prim(%x) id(%x) %p\n", __func__, hh->prim, hh->id, skb); if (lm == 0x1) { if (!hlist_empty(&st->l1sock.head)) { __net_timestamp(skb); send_socklist(&st->l1sock, skb); } return st->layer1->send(st->layer1, skb); } else if (lm == 0x2) { if (!hlist_empty(&st->l1sock.head)) send_socklist(&st->l1sock, skb); send_layer2(st, skb); return 0; } else if (lm == 0x4) { ch = get_channel4id(st, hh->id); if (ch) return ch->send(ch, skb); else printk(KERN_WARNING "%s: dev(%s) prim(%x) id(%x) no channel\n", __func__, dev_name(&st->dev->dev), hh->prim, hh->id); } else if (lm == 0x8) { WARN_ON(lm == 0x8); ch = get_channel4id(st, hh->id); if (ch) return ch->send(ch, skb); else printk(KERN_WARNING "%s: dev(%s) prim(%x) id(%x) no channel\n", __func__, dev_name(&st->dev->dev), hh->prim, hh->id); } else { /* broadcast not handled yet */ printk(KERN_WARNING "%s: dev(%s) prim %x not delivered\n", __func__, dev_name(&st->dev->dev), hh->prim); } return -ESRCH; } static void do_clear_stack(struct mISDNstack *st) { } static int mISDNStackd(void *data) { struct mISDNstack *st = data; #ifdef MISDN_MSG_STATS u64 utime, stime; #endif int err = 0; sigfillset(&current->blocked); if (*debug & DEBUG_MSG_THREAD) printk(KERN_DEBUG "mISDNStackd %s started\n", dev_name(&st->dev->dev)); if (st->notify != NULL) { complete(st->notify); st->notify = NULL; } for (;;) { struct sk_buff *skb; if (unlikely(test_bit(mISDN_STACK_STOPPED, &st->status))) { test_and_clear_bit(mISDN_STACK_WORK, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); } else test_and_set_bit(mISDN_STACK_RUNNING, &st->status); while (test_bit(mISDN_STACK_WORK, &st->status)) { skb = skb_dequeue(&st->msgq); if (!skb) { test_and_clear_bit(mISDN_STACK_WORK, &st->status); /* test if a race happens */ skb = skb_dequeue(&st->msgq); if (!skb) continue; test_and_set_bit(mISDN_STACK_WORK, &st->status); } #ifdef MISDN_MSG_STATS st->msg_cnt++; #endif err = send_msg_to_layer(st, skb); if (unlikely(err)) { if (*debug & DEBUG_SEND_ERR) printk(KERN_DEBUG "%s: %s prim(%x) id(%x) " "send call(%d)\n", __func__, dev_name(&st->dev->dev), mISDN_HEAD_PRIM(skb), mISDN_HEAD_ID(skb), err); dev_kfree_skb(skb); continue; } if (unlikely(test_bit(mISDN_STACK_STOPPED, &st->status))) { test_and_clear_bit(mISDN_STACK_WORK, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); break; } } if (test_bit(mISDN_STACK_CLEARING, &st->status)) { test_and_set_bit(mISDN_STACK_STOPPED, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); do_clear_stack(st); test_and_clear_bit(mISDN_STACK_CLEARING, &st->status); test_and_set_bit(mISDN_STACK_RESTART, &st->status); } if (test_and_clear_bit(mISDN_STACK_RESTART, &st->status)) { test_and_clear_bit(mISDN_STACK_STOPPED, &st->status); test_and_set_bit(mISDN_STACK_RUNNING, &st->status); if (!skb_queue_empty(&st->msgq)) test_and_set_bit(mISDN_STACK_WORK, &st->status); } if (test_bit(mISDN_STACK_ABORT, &st->status)) break; if (st->notify != NULL) { complete(st->notify); st->notify = NULL; } #ifdef MISDN_MSG_STATS st->sleep_cnt++; #endif test_and_clear_bit(mISDN_STACK_ACTIVE, &st->status); wait_event_interruptible(st->workq, (st->status & mISDN_STACK_ACTION_MASK)); if (*debug & DEBUG_MSG_THREAD) printk(KERN_DEBUG "%s: %s wake status %08lx\n", __func__, dev_name(&st->dev->dev), st->status); test_and_set_bit(mISDN_STACK_ACTIVE, &st->status); test_and_clear_bit(mISDN_STACK_WAKEUP, &st->status); if (test_bit(mISDN_STACK_STOPPED, &st->status)) { test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); #ifdef MISDN_MSG_STATS st->stopped_cnt++; #endif } } #ifdef MISDN_MSG_STATS printk(KERN_DEBUG "mISDNStackd daemon for %s proceed %d " "msg %d sleep %d stopped\n", dev_name(&st->dev->dev), st->msg_cnt, st->sleep_cnt, st->stopped_cnt); task_cputime(st->thread, &utime, &stime); printk(KERN_DEBUG "mISDNStackd daemon for %s utime(%llu) stime(%llu)\n", dev_name(&st->dev->dev), utime, stime); printk(KERN_DEBUG "mISDNStackd daemon for %s nvcsw(%ld) nivcsw(%ld)\n", dev_name(&st->dev->dev), st->thread->nvcsw, st->thread->nivcsw); printk(KERN_DEBUG "mISDNStackd daemon for %s killed now\n", dev_name(&st->dev->dev)); #endif test_and_set_bit(mISDN_STACK_KILLED, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); test_and_clear_bit(mISDN_STACK_ACTIVE, &st->status); test_and_clear_bit(mISDN_STACK_ABORT, &st->status); skb_queue_purge(&st->msgq); st->thread = NULL; if (st->notify != NULL) { complete(st->notify); st->notify = NULL; } return 0; } static int l1_receive(struct mISDNchannel *ch, struct sk_buff *skb) { if (!ch->st) return -ENODEV; __net_timestamp(skb); _queue_message(ch->st, skb); return 0; } void set_channel_address(struct mISDNchannel *ch, u_int sapi, u_int tei) { ch->addr = sapi | (tei << 8); } void __add_layer2(struct mISDNchannel *ch, struct mISDNstack *st) { list_add_tail(&ch->list, &st->layer2); } void add_layer2(struct mISDNchannel *ch, struct mISDNstack *st) { mutex_lock(&st->lmutex); __add_layer2(ch, st); mutex_unlock(&st->lmutex); } static int st_own_ctrl(struct mISDNchannel *ch, u_int cmd, void *arg) { if (!ch->st || !ch->st->layer1) return -EINVAL; return ch->st->layer1->ctrl(ch->st->layer1, cmd, arg); } int create_stack(struct mISDNdevice *dev) { struct mISDNstack *newst; int err; DECLARE_COMPLETION_ONSTACK(done); newst = kzalloc(sizeof(struct mISDNstack), GFP_KERNEL); if (!newst) { printk(KERN_ERR "kmalloc mISDN_stack failed\n"); return -ENOMEM; } newst->dev = dev; INIT_LIST_HEAD(&newst->layer2); INIT_HLIST_HEAD(&newst->l1sock.head); rwlock_init(&newst->l1sock.lock); init_waitqueue_head(&newst->workq); skb_queue_head_init(&newst->msgq); mutex_init(&newst->lmutex); dev->D.st = newst; err = create_teimanager(dev); if (err) { printk(KERN_ERR "kmalloc teimanager failed\n"); kfree(newst); return err; } dev->teimgr->peer = &newst->own; dev->teimgr->recv = mISDN_queue_message; dev->teimgr->st = newst; newst->layer1 = &dev->D; dev->D.recv = l1_receive; dev->D.peer = &newst->own; newst->own.st = newst; newst->own.ctrl = st_own_ctrl; newst->own.send = mISDN_queue_message; newst->own.recv = mISDN_queue_message; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s)\n", __func__, dev_name(&newst->dev->dev)); newst->notify = &done; newst->thread = kthread_run(mISDNStackd, (void *)newst, "mISDN_%s", dev_name(&newst->dev->dev)); if (IS_ERR(newst->thread)) { err = PTR_ERR(newst->thread); printk(KERN_ERR "mISDN:cannot create kernel thread for %s (%d)\n", dev_name(&newst->dev->dev), err); delete_teimanager(dev->teimgr); kfree(newst); } else wait_for_completion(&done); return err; } int connect_layer1(struct mISDNdevice *dev, struct mISDNchannel *ch, u_int protocol, struct sockaddr_mISDN *adr) { struct mISDN_sock *msk = container_of(ch, struct mISDN_sock, ch); struct channel_req rq; int err; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); switch (protocol) { case ISDN_P_NT_S0: case ISDN_P_NT_E1: case ISDN_P_TE_S0: case ISDN_P_TE_E1: ch->recv = mISDN_queue_message; ch->peer = &dev->D.st->own; ch->st = dev->D.st; rq.protocol = protocol; rq.adr.channel = adr->channel; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret %d (dev %d)\n", __func__, err, dev->id); if (err) return err; write_lock_bh(&dev->D.st->l1sock.lock); sk_add_node(&msk->sk, &dev->D.st->l1sock.head); write_unlock_bh(&dev->D.st->l1sock.lock); break; default: return -ENOPROTOOPT; } return 0; } int connect_Bstack(struct mISDNdevice *dev, struct mISDNchannel *ch, u_int protocol, struct sockaddr_mISDN *adr) { struct channel_req rq, rq2; int pmask, err; struct Bprotocol *bp; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); ch->st = dev->D.st; pmask = 1 << (protocol & ISDN_P_B_MASK); if (pmask & dev->Bprotocols) { rq.protocol = protocol; rq.adr = *adr; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); if (err) return err; ch->recv = rq.ch->send; ch->peer = rq.ch; rq.ch->recv = ch->send; rq.ch->peer = ch; rq.ch->st = dev->D.st; } else { bp = get_Bprotocol4mask(pmask); if (!bp) return -ENOPROTOOPT; rq2.protocol = protocol; rq2.adr = *adr; rq2.ch = ch; err = bp->create(&rq2); if (err) return err; ch->recv = rq2.ch->send; ch->peer = rq2.ch; rq2.ch->st = dev->D.st; rq.protocol = rq2.protocol; rq.adr = *adr; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); if (err) { rq2.ch->ctrl(rq2.ch, CLOSE_CHANNEL, NULL); return err; } rq2.ch->recv = rq.ch->send; rq2.ch->peer = rq.ch; rq.ch->recv = rq2.ch->send; rq.ch->peer = rq2.ch; rq.ch->st = dev->D.st; } ch->protocol = protocol; ch->nr = rq.ch->nr; return 0; } int create_l2entity(struct mISDNdevice *dev, struct mISDNchannel *ch, u_int protocol, struct sockaddr_mISDN *adr) { struct channel_req rq; int err; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); rq.protocol = ISDN_P_TE_S0; if (dev->Dprotocols & (1 << ISDN_P_TE_E1)) rq.protocol = ISDN_P_TE_E1; switch (protocol) { case ISDN_P_LAPD_NT: rq.protocol = ISDN_P_NT_S0; if (dev->Dprotocols & (1 << ISDN_P_NT_E1)) rq.protocol = ISDN_P_NT_E1; fallthrough; case ISDN_P_LAPD_TE: ch->recv = mISDN_queue_message; ch->peer = &dev->D.st->own; ch->st = dev->D.st; rq.adr.channel = 0; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret 1 %d\n", __func__, err); if (err) break; rq.protocol = protocol; rq.adr = *adr; rq.ch = ch; err = dev->teimgr->ctrl(dev->teimgr, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret 2 %d\n", __func__, err); if (!err) { if ((protocol == ISDN_P_LAPD_NT) && !rq.ch) break; add_layer2(rq.ch, dev->D.st); rq.ch->recv = mISDN_queue_message; rq.ch->peer = &dev->D.st->own; rq.ch->ctrl(rq.ch, OPEN_CHANNEL, NULL); /* can't fail */ } break; default: err = -EPROTONOSUPPORT; } return err; } void delete_channel(struct mISDNchannel *ch) { struct mISDN_sock *msk = container_of(ch, struct mISDN_sock, ch); struct mISDNchannel *pch; if (!ch->st) { printk(KERN_WARNING "%s: no stack\n", __func__); return; } if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s) protocol(%x)\n", __func__, dev_name(&ch->st->dev->dev), ch->protocol); if (ch->protocol >= ISDN_P_B_START) { if (ch->peer) { ch->peer->ctrl(ch->peer, CLOSE_CHANNEL, NULL); ch->peer = NULL; } return; } switch (ch->protocol) { case ISDN_P_NT_S0: case ISDN_P_TE_S0: case ISDN_P_NT_E1: case ISDN_P_TE_E1: write_lock_bh(&ch->st->l1sock.lock); sk_del_node_init(&msk->sk); write_unlock_bh(&ch->st->l1sock.lock); ch->st->dev->D.ctrl(&ch->st->dev->D, CLOSE_CHANNEL, NULL); break; case ISDN_P_LAPD_TE: pch = get_channel4id(ch->st, ch->nr); if (pch) { mutex_lock(&ch->st->lmutex); list_del(&pch->list); mutex_unlock(&ch->st->lmutex); pch->ctrl(pch, CLOSE_CHANNEL, NULL); pch = ch->st->dev->teimgr; pch->ctrl(pch, CLOSE_CHANNEL, NULL); } else printk(KERN_WARNING "%s: no l2 channel\n", __func__); break; case ISDN_P_LAPD_NT: pch = ch->st->dev->teimgr; if (pch) { pch->ctrl(pch, CLOSE_CHANNEL, NULL); } else printk(KERN_WARNING "%s: no l2 channel\n", __func__); break; default: break; } return; } void delete_stack(struct mISDNdevice *dev) { struct mISDNstack *st = dev->D.st; DECLARE_COMPLETION_ONSTACK(done); if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s)\n", __func__, dev_name(&st->dev->dev)); if (dev->teimgr) delete_teimanager(dev->teimgr); if (st->thread) { if (st->notify) { printk(KERN_WARNING "%s: notifier in use\n", __func__); complete(st->notify); } st->notify = &done; test_and_set_bit(mISDN_STACK_ABORT, &st->status); test_and_set_bit(mISDN_STACK_WAKEUP, &st->status); wake_up_interruptible(&st->workq); wait_for_completion(&done); } if (!list_empty(&st->layer2)) printk(KERN_WARNING "%s: layer2 list not empty\n", __func__); if (!hlist_empty(&st->l1sock.head)) printk(KERN_WARNING "%s: layer1 list not empty\n", __func__); kfree(st); } void mISDN_initstack(u_int *dp) { debug = dp; }
6 6 86 82 6 79 80 1 1 1 2 2 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 // SPDX-License-Identifier: GPL-2.0 /* * linux/ipc/namespace.c * Copyright (C) 2006 Pavel Emelyanov <xemul@openvz.org> OpenVZ, SWsoft Inc. */ #include <linux/ipc.h> #include <linux/msg.h> #include <linux/ipc_namespace.h> #include <linux/rcupdate.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/sched/task.h> #include "util.h" /* * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount. */ static void free_ipc(struct work_struct *unused); static DECLARE_WORK(free_ipc_work, free_ipc); static struct ucounts *inc_ipc_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_IPC_NAMESPACES); } static void dec_ipc_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_IPC_NAMESPACES); } static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, struct ipc_namespace *old_ns) { struct ipc_namespace *ns; struct ucounts *ucounts; int err; err = -ENOSPC; again: ucounts = inc_ipc_namespaces(user_ns); if (!ucounts) { /* * IPC namespaces are freed asynchronously, by free_ipc_work. * If frees were pending, flush_work will wait, and * return true. Fail the allocation if no frees are pending. */ if (flush_work(&free_ipc_work)) goto again; goto fail; } err = -ENOMEM; ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT); if (ns == NULL) goto fail_dec; err = ns_alloc_inum(&ns->ns); if (err) goto fail_free; ns->ns.ops = &ipcns_operations; refcount_set(&ns->ns.count, 1); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; err = mq_init_ns(ns); if (err) goto fail_put; err = -ENOMEM; if (!setup_mq_sysctls(ns)) goto fail_put; if (!setup_ipc_sysctls(ns)) goto fail_mq; err = msg_init_ns(ns); if (err) goto fail_ipc; sem_init_ns(ns); shm_init_ns(ns); return ns; fail_ipc: retire_ipc_sysctls(ns); fail_mq: retire_mq_sysctls(ns); fail_put: put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); fail_free: kfree(ns); fail_dec: dec_ipc_namespaces(ucounts); fail: return ERR_PTR(err); } struct ipc_namespace *copy_ipcs(unsigned long flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (!(flags & CLONE_NEWIPC)) return get_ipc_ns(ns); return create_ipc_ns(user_ns, ns); } /* * free_ipcs - free all ipcs of one type * @ns: the namespace to remove the ipcs from * @ids: the table of ipcs to free * @free: the function called to free each individual ipc * * Called for each kind of ipc when an ipc_namespace exits. */ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, void (*free)(struct ipc_namespace *, struct kern_ipc_perm *)) { struct kern_ipc_perm *perm; int next_id; int total, in_use; down_write(&ids->rwsem); in_use = ids->in_use; for (total = 0, next_id = 0; total < in_use; next_id++) { perm = idr_find(&ids->ipcs_idr, next_id); if (perm == NULL) continue; rcu_read_lock(); ipc_lock_object(perm); free(ns, perm); total++; } up_write(&ids->rwsem); } static void free_ipc_ns(struct ipc_namespace *ns) { /* * Caller needs to wait for an RCU grace period to have passed * after making the mount point inaccessible to new accesses. */ mntput(ns->mq_mnt); sem_exit_ns(ns); msg_exit_ns(ns); shm_exit_ns(ns); retire_mq_sysctls(ns); retire_ipc_sysctls(ns); dec_ipc_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); } static LLIST_HEAD(free_ipc_list); static void free_ipc(struct work_struct *unused) { struct llist_node *node = llist_del_all(&free_ipc_list); struct ipc_namespace *n, *t; llist_for_each_entry_safe(n, t, node, mnt_llist) mnt_make_shortterm(n->mq_mnt); /* Wait for any last users to have gone away. */ synchronize_rcu(); llist_for_each_entry_safe(n, t, node, mnt_llist) free_ipc_ns(n); } /* * put_ipc_ns - drop a reference to an ipc namespace. * @ns: the namespace to put * * If this is the last task in the namespace exiting, and * it is dropping the refcount to 0, then it can race with * a task in another ipc namespace but in a mounts namespace * which has this ipcns's mqueuefs mounted, doing some action * with one of the mqueuefs files. That can raise the refcount. * So dropping the refcount, and raising the refcount when * accessing it through the VFS, are protected with mq_lock. * * (Clearly, a task raising the refcount on its own ipc_ns * needn't take mq_lock since it can't race with the last task * in the ipcns exiting). */ void put_ipc_ns(struct ipc_namespace *ns) { if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); if (llist_add(&ns->mnt_llist, &free_ipc_list)) schedule_work(&free_ipc_work); } } static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) { return container_of(ns, struct ipc_namespace, ns); } static struct ns_common *ipcns_get(struct task_struct *task) { struct ipc_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) ns = get_ipc_ns(nsproxy->ipc_ns); task_unlock(task); return ns ? &ns->ns : NULL; } static void ipcns_put(struct ns_common *ns) { return put_ipc_ns(to_ipc_ns(ns)); } static int ipcns_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; struct ipc_namespace *ns = to_ipc_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; put_ipc_ns(nsproxy->ipc_ns); nsproxy->ipc_ns = get_ipc_ns(ns); return 0; } static struct user_namespace *ipcns_owner(struct ns_common *ns) { return to_ipc_ns(ns)->user_ns; } const struct proc_ns_operations ipcns_operations = { .name = "ipc", .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, .owner = ipcns_owner, };
2 1 3 6 1 5 6 6 7 7 7 1 6 7 7 7 1 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 // SPDX-License-Identifier: GPL-2.0-only /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Monitoring SMC transport protocol sockets * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/init.h> #include <linux/sock_diag.h> #include <linux/inet_diag.h> #include <linux/smc_diag.h> #include <net/netlink.h> #include <net/smc.h> #include "smc.h" #include "smc_core.h" #include "smc_ism.h" struct smc_diag_dump_ctx { int pos[2]; }; static struct smc_diag_dump_ctx *smc_dump_context(struct netlink_callback *cb) { return (struct smc_diag_dump_ctx *)cb->ctx; } static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk) { struct smc_sock *smc = smc_sk(sk); memset(r, 0, sizeof(*r)); r->diag_family = sk->sk_family; sock_diag_save_cookie(sk, r->id.idiag_cookie); if (!smc->clcsock) return; r->id.idiag_sport = htons(smc->clcsock->sk->sk_num); r->id.idiag_dport = smc->clcsock->sk->sk_dport; r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if; if (sk->sk_protocol == SMCPROTO_SMC) { r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr; r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr; #if IS_ENABLED(CONFIG_IPV6) } else if (sk->sk_protocol == SMCPROTO_SMC6) { memcpy(&r->id.idiag_src, &smc->clcsock->sk->sk_v6_rcv_saddr, sizeof(smc->clcsock->sk->sk_v6_rcv_saddr)); memcpy(&r->id.idiag_dst, &smc->clcsock->sk->sk_v6_daddr, sizeof(smc->clcsock->sk->sk_v6_daddr)); #endif } } static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct smc_diag_msg *r, struct user_namespace *user_ns) { if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown)) return 1; r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->diag_inode = sock_i_ino(sk); return 0; } static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct smc_diag_req *req, struct nlattr *bc) { struct smc_sock *smc = smc_sk(sk); struct smc_diag_fallback fallback; struct user_namespace *user_ns; struct smc_diag_msg *r; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); smc_diag_msg_common_fill(r, sk); r->diag_state = sk->sk_state; if (smc->use_fallback) r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP; else if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd) r->diag_mode = SMC_DIAG_MODE_SMCD; else r->diag_mode = SMC_DIAG_MODE_SMCR; user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk); if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns)) goto errout; fallback.reason = smc->fallback_rsn; fallback.peer_diagnosis = smc->peer_diagnosis; if (nla_put(skb, SMC_DIAG_FALLBACK, sizeof(fallback), &fallback) < 0) goto errout; if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.alert_token_local) { struct smc_connection *conn = &smc->conn; struct smc_diag_conninfo cinfo = { .token = conn->alert_token_local, .sndbuf_size = conn->sndbuf_desc ? conn->sndbuf_desc->len : 0, .rmbe_size = conn->rmb_desc ? conn->rmb_desc->len : 0, .peer_rmbe_size = conn->peer_rmbe_size, .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap, .rx_prod.count = conn->local_rx_ctrl.prod.count, .rx_cons.wrap = conn->local_rx_ctrl.cons.wrap, .rx_cons.count = conn->local_rx_ctrl.cons.count, .tx_prod.wrap = conn->local_tx_ctrl.prod.wrap, .tx_prod.count = conn->local_tx_ctrl.prod.count, .tx_cons.wrap = conn->local_tx_ctrl.cons.wrap, .tx_cons.count = conn->local_tx_ctrl.cons.count, .tx_prod_flags = *(u8 *)&conn->local_tx_ctrl.prod_flags, .tx_conn_state_flags = *(u8 *)&conn->local_tx_ctrl.conn_state_flags, .rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags, .rx_conn_state_flags = *(u8 *)&conn->local_rx_ctrl.conn_state_flags, .tx_prep.wrap = conn->tx_curs_prep.wrap, .tx_prep.count = conn->tx_curs_prep.count, .tx_sent.wrap = conn->tx_curs_sent.wrap, .tx_sent.count = conn->tx_curs_sent.count, .tx_fin.wrap = conn->tx_curs_fin.wrap, .tx_fin.count = conn->tx_curs_fin.count, }; if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0) goto errout; } if (smc_conn_lgr_valid(&smc->conn) && !smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_link *link = smc->conn.lnk; struct smc_diag_lgrinfo linfo = { .role = smc->conn.lgr->role, .lnk[0].ibport = link->ibport, .lnk[0].link_id = link->link_id, }; memcpy(linfo.lnk[0].ibname, link->smcibdev->ibdev->name, sizeof(link->smcibdev->ibdev->name)); smc_gid_be16_convert(linfo.lnk[0].gid, link->gid); smc_gid_be16_convert(linfo.lnk[0].peer_gid, link->peer_gid); if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) goto errout; } if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) && !list_empty(&smc->conn.lgr->list) && smc->conn.rmb_desc) { struct smc_connection *conn = &smc->conn; struct smcd_diag_dmbinfo dinfo; struct smcd_dev *smcd = conn->lgr->smcd; struct smcd_gid smcd_gid; memset(&dinfo, 0, sizeof(dinfo)); dinfo.linkid = *((u32 *)conn->lgr->id); dinfo.peer_gid = conn->lgr->peer_gid.gid; dinfo.peer_gid_ext = conn->lgr->peer_gid.gid_ext; smcd->ops->get_local_gid(smcd, &smcd_gid); dinfo.my_gid = smcd_gid.gid; dinfo.my_gid_ext = smcd_gid.gid_ext; dinfo.token = conn->rmb_desc->token; dinfo.peer_token = conn->peer_token; if (nla_put(skb, SMC_DIAG_DMBINFO, sizeof(dinfo), &dinfo) < 0) goto errout; } nlmsg_end(skb, nlh); return 0; errout: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, struct netlink_callback *cb, int p_type) { struct smc_diag_dump_ctx *cb_ctx = smc_dump_context(cb); struct net *net = sock_net(skb->sk); int snum = cb_ctx->pos[p_type]; struct nlattr *bc = NULL; struct hlist_head *head; int rc = 0, num = 0; struct sock *sk; read_lock(&prot->h.smc_hash->lock); head = &prot->h.smc_hash->ht; if (hlist_empty(head)) goto out; sk_for_each(sk, head) { if (!net_eq(sock_net(sk), net)) continue; if (num < snum) goto next; rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); if (rc < 0) goto out; next: num++; } out: read_unlock(&prot->h.smc_hash->lock); cb_ctx->pos[p_type] = num; return rc; } static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { int rc = 0; rc = smc_diag_dump_proto(&smc_proto, skb, cb, SMCPROTO_SMC); if (!rc) smc_diag_dump_proto(&smc_proto6, skb, cb, SMCPROTO_SMC6); return skb->len; } static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { struct net *net = sock_net(skb->sk); if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && h->nlmsg_flags & NLM_F_DUMP) { { struct netlink_dump_control c = { .dump = smc_diag_dump, .min_dump_alloc = SKB_WITH_OVERHEAD(32768), }; return netlink_dump_start(net->diag_nlsk, skb, h, &c); } } return 0; } static const struct sock_diag_handler smc_diag_handler = { .owner = THIS_MODULE, .family = AF_SMC, .dump = smc_diag_handler_dump, }; static int __init smc_diag_init(void) { return sock_diag_register(&smc_diag_handler); } static void __exit smc_diag_exit(void) { sock_diag_unregister(&smc_diag_handler); } module_init(smc_diag_init); module_exit(smc_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SMC socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */); MODULE_ALIAS_GENL_FAMILY(SMCR_GENL_FAMILY_NAME);
17 6 57 17 17 17 73 72 17 17 17 5 5 5 106 105 75 68 106 106 80 81 79 13 68 66 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 // SPDX-License-Identifier: GPL-2.0-only #include <linux/stat.h> #include <linux/sysctl.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/hash.h> #include <linux/kmemleak.h> #include <linux/user_namespace.h> struct ucounts init_ucounts = { .ns = &init_user_ns, .uid = GLOBAL_ROOT_UID, .count = ATOMIC_INIT(1), }; #define UCOUNTS_HASHTABLE_BITS 10 static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; static DEFINE_SPINLOCK(ucounts_lock); #define ucounts_hashfn(ns, uid) \ hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \ UCOUNTS_HASHTABLE_BITS) #define ucounts_hashentry(ns, uid) \ (ucounts_hashtable + ucounts_hashfn(ns, uid)) #ifdef CONFIG_SYSCTL static struct ctl_table_set * set_lookup(struct ctl_table_root *root) { return &current_user_ns()->set; } static int set_is_seen(struct ctl_table_set *set) { return &current_user_ns()->set == set; } static int set_permissions(struct ctl_table_header *head, const struct ctl_table *table) { struct user_namespace *user_ns = container_of(head->set, struct user_namespace, set); int mode; /* Allow users with CAP_SYS_RESOURCE unrestrained access */ if (ns_capable(user_ns, CAP_SYS_RESOURCE)) mode = (table->mode & S_IRWXU) >> 6; else /* Allow all others at most read-only access */ mode = table->mode & S_IROTH; return (mode << 6) | (mode << 3) | mode; } static struct ctl_table_root set_root = { .lookup = set_lookup, .permissions = set_permissions, }; static long ue_zero = 0; static long ue_int_max = INT_MAX; #define UCOUNT_ENTRY(name) \ { \ .procname = name, \ .maxlen = sizeof(long), \ .mode = 0644, \ .proc_handler = proc_doulongvec_minmax, \ .extra1 = &ue_zero, \ .extra2 = &ue_int_max, \ } static const struct ctl_table user_table[] = { UCOUNT_ENTRY("max_user_namespaces"), UCOUNT_ENTRY("max_pid_namespaces"), UCOUNT_ENTRY("max_uts_namespaces"), UCOUNT_ENTRY("max_ipc_namespaces"), UCOUNT_ENTRY("max_net_namespaces"), UCOUNT_ENTRY("max_mnt_namespaces"), UCOUNT_ENTRY("max_cgroup_namespaces"), UCOUNT_ENTRY("max_time_namespaces"), #ifdef CONFIG_INOTIFY_USER UCOUNT_ENTRY("max_inotify_instances"), UCOUNT_ENTRY("max_inotify_watches"), #endif #ifdef CONFIG_FANOTIFY UCOUNT_ENTRY("max_fanotify_groups"), UCOUNT_ENTRY("max_fanotify_marks"), #endif }; #endif /* CONFIG_SYSCTL */ bool setup_userns_sysctls(struct user_namespace *ns) { #ifdef CONFIG_SYSCTL struct ctl_table *tbl; BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS); setup_sysctl_set(&ns->set, &set_root, set_is_seen); tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL); if (tbl) { int i; for (i = 0; i < UCOUNT_COUNTS; i++) { tbl[i].data = &ns->ucount_max[i]; } ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl, ARRAY_SIZE(user_table)); } if (!ns->sysctls) { kfree(tbl); retire_sysctl_set(&ns->set); return false; } #endif return true; } void retire_userns_sysctls(struct user_namespace *ns) { #ifdef CONFIG_SYSCTL const struct ctl_table *tbl; tbl = ns->sysctls->ctl_table_arg; unregister_sysctl_table(ns->sysctls); retire_sysctl_set(&ns->set); kfree(tbl); #endif } static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent) { struct ucounts *ucounts; hlist_for_each_entry(ucounts, hashent, node) { if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) return ucounts; } return NULL; } static void hlist_add_ucounts(struct ucounts *ucounts) { struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); spin_lock_irq(&ucounts_lock); hlist_add_head(&ucounts->node, hashent); spin_unlock_irq(&ucounts_lock); } static inline bool get_ucounts_or_wrap(struct ucounts *ucounts) { /* Returns true on a successful get, false if the count wraps. */ return !atomic_add_negative(1, &ucounts->count); } struct ucounts *get_ucounts(struct ucounts *ucounts) { if (!get_ucounts_or_wrap(ucounts)) { put_ucounts(ucounts); ucounts = NULL; } return ucounts; } struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_head *hashent = ucounts_hashentry(ns, uid); struct ucounts *ucounts, *new; bool wrapped; spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); if (!ucounts) { spin_unlock_irq(&ucounts_lock); new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return NULL; new->ns = ns; new->uid = uid; atomic_set(&new->count, 1); spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); if (ucounts) { kfree(new); } else { hlist_add_head(&new->node, hashent); get_user_ns(new->ns); spin_unlock_irq(&ucounts_lock); return new; } } wrapped = !get_ucounts_or_wrap(ucounts); spin_unlock_irq(&ucounts_lock); if (wrapped) { put_ucounts(ucounts); return NULL; } return ucounts; } void put_ucounts(struct ucounts *ucounts) { unsigned long flags; if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) { hlist_del_init(&ucounts->node); spin_unlock_irqrestore(&ucounts_lock, flags); put_user_ns(ucounts->ns); kfree(ucounts); } } static inline bool atomic_long_inc_below(atomic_long_t *v, int u) { long c, old; c = atomic_long_read(v); for (;;) { if (unlikely(c >= u)) return false; old = atomic_long_cmpxchg(v, c, c+1); if (likely(old == c)) return true; c = old; } } struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type) { struct ucounts *ucounts, *iter, *bad; struct user_namespace *tns; ucounts = alloc_ucounts(ns, uid); for (iter = ucounts; iter; iter = tns->ucounts) { long max; tns = iter->ns; max = READ_ONCE(tns->ucount_max[type]); if (!atomic_long_inc_below(&iter->ucount[type], max)) goto fail; } return ucounts; fail: bad = iter; for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) atomic_long_dec(&iter->ucount[type]); put_ucounts(ucounts); return NULL; } void dec_ucount(struct ucounts *ucounts, enum ucount_type type) { struct ucounts *iter; for (iter = ucounts; iter; iter = iter->ns->ucounts) { long dec = atomic_long_dec_if_positive(&iter->ucount[type]); WARN_ON_ONCE(dec < 0); } put_ucounts(ucounts); } long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) { struct ucounts *iter; long max = LONG_MAX; long ret = 0; for (iter = ucounts; iter; iter = iter->ns->ucounts) { long new = atomic_long_add_return(v, &iter->rlimit[type]); if (new < 0 || new > max) ret = LONG_MAX; else if (iter == ucounts) ret = new; max = get_userns_rlimit_max(iter->ns, type); } return ret; } bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) { struct ucounts *iter; long new = -1; /* Silence compiler warning */ for (iter = ucounts; iter; iter = iter->ns->ucounts) { long dec = atomic_long_sub_return(v, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); if (iter == ucounts) new = dec; } return (new == 0); } static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts, struct ucounts *last, enum rlimit_type type) { struct ucounts *iter, *next; for (iter = ucounts; iter != last; iter = next) { long dec = atomic_long_sub_return(1, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); next = iter->ns->ucounts; if (dec == 0) put_ucounts(iter); } } void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type) { do_dec_rlimit_put_ucounts(ucounts, NULL, type); } long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, bool override_rlimit) { /* Caller must hold a reference to ucounts */ struct ucounts *iter; long max = LONG_MAX; long dec, ret = 0; for (iter = ucounts; iter; iter = iter->ns->ucounts) { long new = atomic_long_add_return(1, &iter->rlimit[type]); if (new < 0 || new > max) goto dec_unwind; if (iter == ucounts) ret = new; if (!override_rlimit) max = get_userns_rlimit_max(iter->ns, type); /* * Grab an extra ucount reference for the caller when * the rlimit count was previously 0. */ if (new != 1) continue; if (!get_ucounts(iter)) goto dec_unwind; } return ret; dec_unwind: dec = atomic_long_sub_return(1, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); do_dec_rlimit_put_ucounts(ucounts, iter, type); return 0; } bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit) { struct ucounts *iter; long max = rlimit; if (rlimit > LONG_MAX) max = LONG_MAX; for (iter = ucounts; iter; iter = iter->ns->ucounts) { long val = get_rlimit_value(iter, type); if (val < 0 || val > max) return true; max = get_userns_rlimit_max(iter->ns, type); } return false; } static __init int user_namespace_sysctl_init(void) { #ifdef CONFIG_SYSCTL static struct ctl_table_header *user_header; static struct ctl_table empty[1]; /* * It is necessary to register the user directory in the * default set so that registrations in the child sets work * properly. */ user_header = register_sysctl_sz("user", empty, 0); kmemleak_ignore(user_header); BUG_ON(!user_header); BUG_ON(!setup_userns_sysctls(&init_user_ns)); #endif hlist_add_ucounts(&init_ucounts); inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1); return 0; } subsys_initcall(user_namespace_sysctl_init);
3 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> */ #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SYNPROXY.h> #include <net/netfilter/nf_synproxy.h> static unsigned int synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; struct net *net = xt_net(par); struct synproxy_net *snet = synproxy_pernet(net); struct synproxy_options opts = {}; struct tcphdr *th, _th; if (nf_ip_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP)) return NF_DROP; th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); if (th == NULL) return NF_DROP; if (!synproxy_parse_options(skb, par->thoff, th, &opts)) return NF_DROP; if (th->syn && !(th->ack || th->fin || th->rst)) { /* Initial SYN from client */ this_cpu_inc(snet->stats->syn_received); if (th->ece && th->cwr) opts.options |= XT_SYNPROXY_OPT_ECN; opts.options &= info->options; opts.mss_encode = opts.mss_option; opts.mss_option = info->mss; if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, &opts); else opts.options &= ~(XT_SYNPROXY_OPT_WSCALE | XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); synproxy_send_client_synack(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) { consume_skb(skb); return NF_STOLEN; } else { return NF_DROP; } } return XT_CONTINUE; } static int synproxy_tg4_check(const struct xt_tgchk_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); const struct ipt_entry *e = par->entryinfo; int err; if (e->ip.proto != IPPROTO_TCP || e->ip.invflags & XT_INV_PROTO) return -EINVAL; err = nf_ct_netns_get(par->net, par->family); if (err) return err; err = nf_synproxy_ipv4_init(snet, par->net); if (err) { nf_ct_netns_put(par->net, par->family); return err; } return err; } static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); nf_synproxy_ipv4_fini(snet, par->net); nf_ct_netns_put(par->net, par->family); } static struct xt_target synproxy_tg4_reg __read_mostly = { .name = "SYNPROXY", .family = NFPROTO_IPV4, .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD), .target = synproxy_tg4, .targetsize = sizeof(struct xt_synproxy_info), .checkentry = synproxy_tg4_check, .destroy = synproxy_tg4_destroy, .me = THIS_MODULE, }; static int __init synproxy_tg4_init(void) { return xt_register_target(&synproxy_tg4_reg); } static void __exit synproxy_tg4_exit(void) { xt_unregister_target(&synproxy_tg4_reg); } module_init(synproxy_tg4_init); module_exit(synproxy_tg4_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Intercept TCP connections and establish them using syncookies");
15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Framework and drivers for configuring and reading different PHYs * Based on code in sungem_phy.c and (long-removed) gianfar_phy.c * * Author: Andy Fleming * * Copyright (c) 2004 Freescale Semiconductor, Inc. */ #ifndef __PHY_H #define __PHY_H #include <linux/compiler.h> #include <linux/spinlock.h> #include <linux/ethtool.h> #include <linux/leds.h> #include <linux/linkmode.h> #include <linux/netlink.h> #include <linux/mdio.h> #include <linux/mii.h> #include <linux/mii_timestamper.h> #include <linux/module.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/mod_devicetable.h> #include <linux/u64_stats_sync.h> #include <linux/irqreturn.h> #include <linux/iopoll.h> #include <linux/refcount.h> #include <linux/atomic.h> #include <net/eee.h> #define PHY_DEFAULT_FEATURES (SUPPORTED_Autoneg | \ SUPPORTED_TP | \ SUPPORTED_MII) #define PHY_10BT_FEATURES (SUPPORTED_10baseT_Half | \ SUPPORTED_10baseT_Full) #define PHY_100BT_FEATURES (SUPPORTED_100baseT_Half | \ SUPPORTED_100baseT_Full) #define PHY_1000BT_FEATURES (SUPPORTED_1000baseT_Half | \ SUPPORTED_1000baseT_Full) extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_t1_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_t1s_p2mp_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_fibre_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_all_ports_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_fec_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_full_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_eee_cap1_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_eee_cap2_features) __ro_after_init; #define PHY_BASIC_FEATURES ((unsigned long *)&phy_basic_features) #define PHY_BASIC_T1_FEATURES ((unsigned long *)&phy_basic_t1_features) #define PHY_BASIC_T1S_P2MP_FEATURES ((unsigned long *)&phy_basic_t1s_p2mp_features) #define PHY_GBIT_FEATURES ((unsigned long *)&phy_gbit_features) #define PHY_GBIT_FIBRE_FEATURES ((unsigned long *)&phy_gbit_fibre_features) #define PHY_GBIT_ALL_PORTS_FEATURES ((unsigned long *)&phy_gbit_all_ports_features) #define PHY_10GBIT_FEATURES ((unsigned long *)&phy_10gbit_features) #define PHY_10GBIT_FEC_FEATURES ((unsigned long *)&phy_10gbit_fec_features) #define PHY_10GBIT_FULL_FEATURES ((unsigned long *)&phy_10gbit_full_features) #define PHY_EEE_CAP1_FEATURES ((unsigned long *)&phy_eee_cap1_features) #define PHY_EEE_CAP2_FEATURES ((unsigned long *)&phy_eee_cap2_features) extern const int phy_basic_ports_array[3]; extern const int phy_fibre_port_array[1]; extern const int phy_all_ports_features_array[7]; extern const int phy_10_100_features_array[4]; extern const int phy_basic_t1_features_array[3]; extern const int phy_basic_t1s_p2mp_features_array[2]; extern const int phy_gbit_features_array[2]; extern const int phy_10gbit_features_array[1]; /* * Set phydev->irq to PHY_POLL if interrupts are not supported, * or not desired for this PHY. Set to PHY_MAC_INTERRUPT if * the attached MAC driver handles the interrupt */ #define PHY_POLL -1 #define PHY_MAC_INTERRUPT -2 #define PHY_IS_INTERNAL 0x00000001 #define PHY_RST_AFTER_CLK_EN 0x00000002 #define PHY_POLL_CABLE_TEST 0x00000004 #define PHY_ALWAYS_CALL_SUSPEND 0x00000008 #define MDIO_DEVICE_IS_PHY 0x80000000 /** * enum phy_interface_t - Interface Mode definitions * * @PHY_INTERFACE_MODE_NA: Not Applicable - don't touch * @PHY_INTERFACE_MODE_INTERNAL: No interface, MAC and PHY combined * @PHY_INTERFACE_MODE_MII: Media-independent interface * @PHY_INTERFACE_MODE_GMII: Gigabit media-independent interface * @PHY_INTERFACE_MODE_SGMII: Serial gigabit media-independent interface * @PHY_INTERFACE_MODE_TBI: Ten Bit Interface * @PHY_INTERFACE_MODE_REVMII: Reverse Media Independent Interface * @PHY_INTERFACE_MODE_RMII: Reduced Media Independent Interface * @PHY_INTERFACE_MODE_REVRMII: Reduced Media Independent Interface in PHY role * @PHY_INTERFACE_MODE_RGMII: Reduced gigabit media-independent interface * @PHY_INTERFACE_MODE_RGMII_ID: RGMII with Internal RX+TX delay * @PHY_INTERFACE_MODE_RGMII_RXID: RGMII with Internal RX delay * @PHY_INTERFACE_MODE_RGMII_TXID: RGMII with Internal RX delay * @PHY_INTERFACE_MODE_RTBI: Reduced TBI * @PHY_INTERFACE_MODE_SMII: Serial MII * @PHY_INTERFACE_MODE_XGMII: 10 gigabit media-independent interface * @PHY_INTERFACE_MODE_XLGMII:40 gigabit media-independent interface * @PHY_INTERFACE_MODE_MOCA: Multimedia over Coax * @PHY_INTERFACE_MODE_PSGMII: Penta SGMII * @PHY_INTERFACE_MODE_QSGMII: Quad SGMII * @PHY_INTERFACE_MODE_TRGMII: Turbo RGMII * @PHY_INTERFACE_MODE_100BASEX: 100 BaseX * @PHY_INTERFACE_MODE_1000BASEX: 1000 BaseX * @PHY_INTERFACE_MODE_2500BASEX: 2500 BaseX * @PHY_INTERFACE_MODE_5GBASER: 5G BaseR * @PHY_INTERFACE_MODE_RXAUI: Reduced XAUI * @PHY_INTERFACE_MODE_XAUI: 10 Gigabit Attachment Unit Interface * @PHY_INTERFACE_MODE_10GBASER: 10G BaseR * @PHY_INTERFACE_MODE_25GBASER: 25G BaseR * @PHY_INTERFACE_MODE_USXGMII: Universal Serial 10GE MII * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN * @PHY_INTERFACE_MODE_QUSGMII: Quad Universal SGMII * @PHY_INTERFACE_MODE_1000BASEKX: 1000Base-KX - with Clause 73 AN * @PHY_INTERFACE_MODE_10G_QXGMII: 10G-QXGMII - 4 ports over 10G USXGMII * @PHY_INTERFACE_MODE_MAX: Book keeping * * Describes the interface between the MAC and PHY. */ typedef enum { PHY_INTERFACE_MODE_NA, PHY_INTERFACE_MODE_INTERNAL, PHY_INTERFACE_MODE_MII, PHY_INTERFACE_MODE_GMII, PHY_INTERFACE_MODE_SGMII, PHY_INTERFACE_MODE_TBI, PHY_INTERFACE_MODE_REVMII, PHY_INTERFACE_MODE_RMII, PHY_INTERFACE_MODE_REVRMII, PHY_INTERFACE_MODE_RGMII, PHY_INTERFACE_MODE_RGMII_ID, PHY_INTERFACE_MODE_RGMII_RXID, PHY_INTERFACE_MODE_RGMII_TXID, PHY_INTERFACE_MODE_RTBI, PHY_INTERFACE_MODE_SMII, PHY_INTERFACE_MODE_XGMII, PHY_INTERFACE_MODE_XLGMII, PHY_INTERFACE_MODE_MOCA, PHY_INTERFACE_MODE_PSGMII, PHY_INTERFACE_MODE_QSGMII, PHY_INTERFACE_MODE_TRGMII, PHY_INTERFACE_MODE_100BASEX, PHY_INTERFACE_MODE_1000BASEX, PHY_INTERFACE_MODE_2500BASEX, PHY_INTERFACE_MODE_5GBASER, PHY_INTERFACE_MODE_RXAUI, PHY_INTERFACE_MODE_XAUI, /* 10GBASE-R, XFI, SFI - single lane 10G Serdes */ PHY_INTERFACE_MODE_10GBASER, PHY_INTERFACE_MODE_25GBASER, PHY_INTERFACE_MODE_USXGMII, /* 10GBASE-KR - with Clause 73 AN */ PHY_INTERFACE_MODE_10GKR, PHY_INTERFACE_MODE_QUSGMII, PHY_INTERFACE_MODE_1000BASEKX, PHY_INTERFACE_MODE_10G_QXGMII, PHY_INTERFACE_MODE_MAX, } phy_interface_t; /* PHY interface mode bitmap handling */ #define DECLARE_PHY_INTERFACE_MASK(name) \ DECLARE_BITMAP(name, PHY_INTERFACE_MODE_MAX) static inline void phy_interface_zero(unsigned long *intf) { bitmap_zero(intf, PHY_INTERFACE_MODE_MAX); } static inline bool phy_interface_empty(const unsigned long *intf) { return bitmap_empty(intf, PHY_INTERFACE_MODE_MAX); } static inline void phy_interface_and(unsigned long *dst, const unsigned long *a, const unsigned long *b) { bitmap_and(dst, a, b, PHY_INTERFACE_MODE_MAX); } static inline void phy_interface_or(unsigned long *dst, const unsigned long *a, const unsigned long *b) { bitmap_or(dst, a, b, PHY_INTERFACE_MODE_MAX); } static inline void phy_interface_set_rgmii(unsigned long *intf) { __set_bit(PHY_INTERFACE_MODE_RGMII, intf); __set_bit(PHY_INTERFACE_MODE_RGMII_ID, intf); __set_bit(PHY_INTERFACE_MODE_RGMII_RXID, intf); __set_bit(PHY_INTERFACE_MODE_RGMII_TXID, intf); } /* * phy_supported_speeds - return all speeds currently supported by a PHY device */ unsigned int phy_supported_speeds(struct phy_device *phy, unsigned int *speeds, unsigned int size); /** * phy_modes - map phy_interface_t enum to device tree binding of phy-mode * @interface: enum phy_interface_t value * * Description: maps enum &phy_interface_t defined in this file * into the device tree binding of 'phy-mode', so that Ethernet * device driver can get PHY interface from device tree. */ static inline const char *phy_modes(phy_interface_t interface) { switch (interface) { case PHY_INTERFACE_MODE_NA: return ""; case PHY_INTERFACE_MODE_INTERNAL: return "internal"; case PHY_INTERFACE_MODE_MII: return "mii"; case PHY_INTERFACE_MODE_GMII: return "gmii"; case PHY_INTERFACE_MODE_SGMII: return "sgmii"; case PHY_INTERFACE_MODE_TBI: return "tbi"; case PHY_INTERFACE_MODE_REVMII: return "rev-mii"; case PHY_INTERFACE_MODE_RMII: return "rmii"; case PHY_INTERFACE_MODE_REVRMII: return "rev-rmii"; case PHY_INTERFACE_MODE_RGMII: return "rgmii"; case PHY_INTERFACE_MODE_RGMII_ID: return "rgmii-id"; case PHY_INTERFACE_MODE_RGMII_RXID: return "rgmii-rxid"; case PHY_INTERFACE_MODE_RGMII_TXID: return "rgmii-txid"; case PHY_INTERFACE_MODE_RTBI: return "rtbi"; case PHY_INTERFACE_MODE_SMII: return "smii"; case PHY_INTERFACE_MODE_XGMII: return "xgmii"; case PHY_INTERFACE_MODE_XLGMII: return "xlgmii"; case PHY_INTERFACE_MODE_MOCA: return "moca"; case PHY_INTERFACE_MODE_PSGMII: return "psgmii"; case PHY_INTERFACE_MODE_QSGMII: return "qsgmii"; case PHY_INTERFACE_MODE_TRGMII: return "trgmii"; case PHY_INTERFACE_MODE_1000BASEX: return "1000base-x"; case PHY_INTERFACE_MODE_1000BASEKX: return "1000base-kx"; case PHY_INTERFACE_MODE_2500BASEX: return "2500base-x"; case PHY_INTERFACE_MODE_5GBASER: return "5gbase-r"; case PHY_INTERFACE_MODE_RXAUI: return "rxaui"; case PHY_INTERFACE_MODE_XAUI: return "xaui"; case PHY_INTERFACE_MODE_10GBASER: return "10gbase-r"; case PHY_INTERFACE_MODE_25GBASER: return "25gbase-r"; case PHY_INTERFACE_MODE_USXGMII: return "usxgmii"; case PHY_INTERFACE_MODE_10GKR: return "10gbase-kr"; case PHY_INTERFACE_MODE_100BASEX: return "100base-x"; case PHY_INTERFACE_MODE_QUSGMII: return "qusgmii"; case PHY_INTERFACE_MODE_10G_QXGMII: return "10g-qxgmii"; default: return "unknown"; } } #define PHY_INIT_TIMEOUT 100000 #define PHY_FORCE_TIMEOUT 10 #define PHY_MAX_ADDR 32 /* Used when trying to connect to a specific phy (mii bus id:phy device id) */ #define PHY_ID_FMT "%s:%02x" #define MII_BUS_ID_SIZE 61 struct device; struct kernel_hwtstamp_config; struct phylink; struct sfp_bus; struct sfp_upstream_ops; struct sk_buff; /** * struct mdio_bus_stats - Statistics counters for MDIO busses * @transfers: Total number of transfers, i.e. @writes + @reads * @errors: Number of MDIO transfers that returned an error * @writes: Number of write transfers * @reads: Number of read transfers * @syncp: Synchronisation for incrementing statistics */ struct mdio_bus_stats { u64_stats_t transfers; u64_stats_t errors; u64_stats_t writes; u64_stats_t reads; /* Must be last, add new statistics above */ struct u64_stats_sync syncp; }; /** * struct phy_package_shared - Shared information in PHY packages * @base_addr: Base PHY address of PHY package used to combine PHYs * in one package and for offset calculation of phy_package_read/write * @np: Pointer to the Device Node if PHY package defined in DT * @refcnt: Number of PHYs connected to this shared data * @flags: Initialization of PHY package * @priv_size: Size of the shared private data @priv * @priv: Driver private data shared across a PHY package * * Represents a shared structure between different phydev's in the same * package, for example a quad PHY. See phy_package_join() and * phy_package_leave(). */ struct phy_package_shared { u8 base_addr; /* With PHY package defined in DT this points to the PHY package node */ struct device_node *np; refcount_t refcnt; unsigned long flags; size_t priv_size; /* private data pointer */ /* note that this pointer is shared between different phydevs and * the user has to take care of appropriate locking. It is allocated * and freed automatically by phy_package_join() and * phy_package_leave(). */ void *priv; }; /* used as bit number in atomic bitops */ #define PHY_SHARED_F_INIT_DONE 0 #define PHY_SHARED_F_PROBE_DONE 1 /** * struct mii_bus - Represents an MDIO bus * * @owner: Who owns this device * @name: User friendly name for this MDIO device, or driver name * @id: Unique identifier for this bus, typical from bus hierarchy * @priv: Driver private data * * The Bus class for PHYs. Devices which provide access to * PHYs should register using this structure */ struct mii_bus { struct module *owner; const char *name; char id[MII_BUS_ID_SIZE]; void *priv; /** @read: Perform a read transfer on the bus */ int (*read)(struct mii_bus *bus, int addr, int regnum); /** @write: Perform a write transfer on the bus */ int (*write)(struct mii_bus *bus, int addr, int regnum, u16 val); /** @read_c45: Perform a C45 read transfer on the bus */ int (*read_c45)(struct mii_bus *bus, int addr, int devnum, int regnum); /** @write_c45: Perform a C45 write transfer on the bus */ int (*write_c45)(struct mii_bus *bus, int addr, int devnum, int regnum, u16 val); /** @reset: Perform a reset of the bus */ int (*reset)(struct mii_bus *bus); /** @stats: Statistic counters per device on the bus */ struct mdio_bus_stats stats[PHY_MAX_ADDR]; /** * @mdio_lock: A lock to ensure that only one thing can read/write * the MDIO bus at a time */ struct mutex mdio_lock; /** @parent: Parent device of this bus */ struct device *parent; /** @state: State of bus structure */ enum { MDIOBUS_ALLOCATED = 1, MDIOBUS_REGISTERED, MDIOBUS_UNREGISTERED, MDIOBUS_RELEASED, } state; /** @dev: Kernel device representation */ struct device dev; /** @mdio_map: list of all MDIO devices on bus */ struct mdio_device *mdio_map[PHY_MAX_ADDR]; /** @phy_mask: PHY addresses to be ignored when probing */ u32 phy_mask; /** @phy_ignore_ta_mask: PHY addresses to ignore the TA/read failure */ u32 phy_ignore_ta_mask; /** * @irq: An array of interrupts, each PHY's interrupt at the index * matching its address */ int irq[PHY_MAX_ADDR]; /** @reset_delay_us: GPIO reset pulse width in microseconds */ int reset_delay_us; /** @reset_post_delay_us: GPIO reset deassert delay in microseconds */ int reset_post_delay_us; /** @reset_gpiod: Reset GPIO descriptor pointer */ struct gpio_desc *reset_gpiod; /** @shared_lock: protect access to the shared element */ struct mutex shared_lock; /** @shared: shared state across different PHYs */ struct phy_package_shared *shared[PHY_MAX_ADDR]; }; #define to_mii_bus(d) container_of(d, struct mii_bus, dev) struct mii_bus *mdiobus_alloc_size(size_t size); /** * mdiobus_alloc - Allocate an MDIO bus structure * * The internal state of the MDIO bus will be set of MDIOBUS_ALLOCATED ready * for the driver to register the bus. */ static inline struct mii_bus *mdiobus_alloc(void) { return mdiobus_alloc_size(0); } int __mdiobus_register(struct mii_bus *bus, struct module *owner); int __devm_mdiobus_register(struct device *dev, struct mii_bus *bus, struct module *owner); #define mdiobus_register(bus) __mdiobus_register(bus, THIS_MODULE) #define devm_mdiobus_register(dev, bus) \ __devm_mdiobus_register(dev, bus, THIS_MODULE) void mdiobus_unregister(struct mii_bus *bus); void mdiobus_free(struct mii_bus *bus); struct mii_bus *devm_mdiobus_alloc_size(struct device *dev, int sizeof_priv); static inline struct mii_bus *devm_mdiobus_alloc(struct device *dev) { return devm_mdiobus_alloc_size(dev, 0); } struct mii_bus *mdio_find_bus(const char *mdio_name); struct phy_device *mdiobus_scan_c22(struct mii_bus *bus, int addr); #define PHY_INTERRUPT_DISABLED false #define PHY_INTERRUPT_ENABLED true /** * enum phy_state - PHY state machine states: * * @PHY_DOWN: PHY device and driver are not ready for anything. probe * should be called if and only if the PHY is in this state, * given that the PHY device exists. * - PHY driver probe function will set the state to @PHY_READY * * @PHY_READY: PHY is ready to send and receive packets, but the * controller is not. By default, PHYs which do not implement * probe will be set to this state by phy_probe(). * - start will set the state to UP * * @PHY_UP: The PHY and attached device are ready to do work. * Interrupts should be started here. * - timer moves to @PHY_NOLINK or @PHY_RUNNING * * @PHY_NOLINK: PHY is up, but not currently plugged in. * - irq or timer will set @PHY_RUNNING if link comes back * - phy_stop moves to @PHY_HALTED * * @PHY_RUNNING: PHY is currently up, running, and possibly sending * and/or receiving packets * - irq or timer will set @PHY_NOLINK if link goes down * - phy_stop moves to @PHY_HALTED * * @PHY_CABLETEST: PHY is performing a cable test. Packet reception/sending * is not expected to work, carrier will be indicated as down. PHY will be * poll once per second, or on interrupt for it current state. * Once complete, move to UP to restart the PHY. * - phy_stop aborts the running test and moves to @PHY_HALTED * * @PHY_HALTED: PHY is up, but no polling or interrupts are done. * - phy_start moves to @PHY_UP * * @PHY_ERROR: PHY is up, but is in an error state. * - phy_stop moves to @PHY_HALTED */ enum phy_state { PHY_DOWN = 0, PHY_READY, PHY_HALTED, PHY_ERROR, PHY_UP, PHY_RUNNING, PHY_NOLINK, PHY_CABLETEST, }; #define MDIO_MMD_NUM 32 /** * struct phy_c45_device_ids - 802.3-c45 Device Identifiers * @devices_in_package: IEEE 802.3 devices in package register value. * @mmds_present: bit vector of MMDs present. * @device_ids: The device identifer for each present device. */ struct phy_c45_device_ids { u32 devices_in_package; u32 mmds_present; u32 device_ids[MDIO_MMD_NUM]; }; struct macsec_context; struct macsec_ops; /** * struct phy_device - An instance of a PHY * * @mdio: MDIO bus this PHY is on * @drv: Pointer to the driver for this PHY instance * @devlink: Create a link between phy dev and mac dev, if the external phy * used by current mac interface is managed by another mac interface. * @phyindex: Unique id across the phy's parent tree of phys to address the PHY * from userspace, similar to ifindex. A zero index means the PHY * wasn't assigned an id yet. * @phy_id: UID for this device found during discovery * @c45_ids: 802.3-c45 Device Identifiers if is_c45. * @is_c45: Set to true if this PHY uses clause 45 addressing. * @is_internal: Set to true if this PHY is internal to a MAC. * @is_pseudo_fixed_link: Set to true if this PHY is an Ethernet switch, etc. * @is_gigabit_capable: Set to true if PHY supports 1000Mbps * @has_fixups: Set to true if this PHY has fixups/quirks. * @suspended: Set to true if this PHY has been suspended successfully. * @suspended_by_mdio_bus: Set to true if this PHY was suspended by MDIO bus. * @sysfs_links: Internal boolean tracking sysfs symbolic links setup/removal. * @loopback_enabled: Set true if this PHY has been loopbacked successfully. * @downshifted_rate: Set true if link speed has been downshifted. * @is_on_sfp_module: Set true if PHY is located on an SFP module. * @mac_managed_pm: Set true if MAC driver takes of suspending/resuming PHY * @wol_enabled: Set to true if the PHY or the attached MAC have Wake-on-LAN * enabled. * @state: State of the PHY for management purposes * @dev_flags: Device-specific flags used by the PHY driver. * * - Bits [15:0] are free to use by the PHY driver to communicate * driver specific behavior. * - Bits [23:16] are currently reserved for future use. * - Bits [31:24] are reserved for defining generic * PHY driver behavior. * @irq: IRQ number of the PHY's interrupt (-1 if none) * @phylink: Pointer to phylink instance for this PHY * @sfp_bus_attached: Flag indicating whether the SFP bus has been attached * @sfp_bus: SFP bus attached to this PHY's fiber port * @attached_dev: The attached enet driver's device instance ptr * @adjust_link: Callback for the enet controller to respond to changes: in the * link state. * @phy_link_change: Callback for phylink for notification of link change * @macsec_ops: MACsec offloading ops. * * @speed: Current link speed * @duplex: Current duplex * @port: Current port * @pause: Current pause * @asym_pause: Current asymmetric pause * @supported: Combined MAC/PHY supported linkmodes * @advertising: Currently advertised linkmodes * @adv_old: Saved advertised while power saving for WoL * @supported_eee: supported PHY EEE linkmodes * @advertising_eee: Currently advertised EEE linkmodes * @enable_tx_lpi: When True, MAC should transmit LPI to PHY * @eee_active: phylib private state, indicating that EEE has been negotiated * @eee_cfg: User configuration of EEE * @lp_advertising: Current link partner advertised linkmodes * @host_interfaces: PHY interface modes supported by host * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited * @autoneg: Flag autoneg being used * @rate_matching: Current rate matching mode * @link: Current link state * @autoneg_complete: Flag auto negotiation of the link has completed * @mdix: Current crossover * @mdix_ctrl: User setting of crossover * @pma_extable: Cached value of PMA/PMD Extended Abilities Register * @interrupts: Flag interrupts have been enabled * @irq_suspended: Flag indicating PHY is suspended and therefore interrupt * handling shall be postponed until PHY has resumed * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended, * requiring a rerun of the interrupt handler after resume * @default_timestamp: Flag indicating whether we are using the phy * timestamp as the default one * @interface: enum phy_interface_t value * @possible_interfaces: bitmap if interface modes that the attached PHY * will switch between depending on media speed. * @skb: Netlink message for cable diagnostics * @nest: Netlink nest used for cable diagnostics * @ehdr: nNtlink header for cable diagnostics * @phy_led_triggers: Array of LED triggers * @phy_num_led_triggers: Number of triggers in @phy_led_triggers * @led_link_trigger: LED trigger for link up/down * @last_triggered: last LED trigger for link speed * @leds: list of PHY LED structures * @master_slave_set: User requested master/slave configuration * @master_slave_get: Current master/slave advertisement * @master_slave_state: Current master/slave configuration * @mii_ts: Pointer to time stamper callbacks * @psec: Pointer to Power Sourcing Equipment control struct * @lock: Mutex for serialization access to PHY * @state_queue: Work queue for state machine * @link_down_events: Number of times link was lost * @shared: Pointer to private data shared by phys in one package * @priv: Pointer to driver private data * * interrupts currently only supports enabled or disabled, * but could be changed in the future to support enabling * and disabling specific interrupts * * Contains some infrastructure for polling and interrupt * handling, as well as handling shifts in PHY hardware state */ struct phy_device { struct mdio_device mdio; /* Information about the PHY type */ /* And management functions */ const struct phy_driver *drv; struct device_link *devlink; u32 phyindex; u32 phy_id; struct phy_c45_device_ids c45_ids; unsigned is_c45:1; unsigned is_internal:1; unsigned is_pseudo_fixed_link:1; unsigned is_gigabit_capable:1; unsigned has_fixups:1; unsigned suspended:1; unsigned suspended_by_mdio_bus:1; unsigned sysfs_links:1; unsigned loopback_enabled:1; unsigned downshifted_rate:1; unsigned is_on_sfp_module:1; unsigned mac_managed_pm:1; unsigned wol_enabled:1; unsigned autoneg:1; /* The most recently read link state */ unsigned link:1; unsigned autoneg_complete:1; /* Interrupts are enabled */ unsigned interrupts:1; unsigned irq_suspended:1; unsigned irq_rerun:1; unsigned default_timestamp:1; int rate_matching; enum phy_state state; u32 dev_flags; phy_interface_t interface; DECLARE_PHY_INTERFACE_MASK(possible_interfaces); /* * forced speed & duplex (no autoneg) * partner speed & duplex & pause (autoneg) */ int speed; int duplex; int port; int pause; int asym_pause; u8 master_slave_get; u8 master_slave_set; u8 master_slave_state; /* Union of PHY and Attached devices' supported link modes */ /* See ethtool.h for more info */ __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); __ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising); /* used with phy_speed_down */ __ETHTOOL_DECLARE_LINK_MODE_MASK(adv_old); /* used for eee validation and configuration*/ __ETHTOOL_DECLARE_LINK_MODE_MASK(supported_eee); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising_eee); /* Energy efficient ethernet modes which should be prohibited */ __ETHTOOL_DECLARE_LINK_MODE_MASK(eee_broken_modes); bool enable_tx_lpi; bool eee_active; struct eee_config eee_cfg; /* Host supported PHY interface types. Should be ignored if empty. */ DECLARE_PHY_INTERFACE_MASK(host_interfaces); #ifdef CONFIG_LED_TRIGGER_PHY struct phy_led_trigger *phy_led_triggers; unsigned int phy_num_led_triggers; struct phy_led_trigger *last_triggered; struct phy_led_trigger *led_link_trigger; #endif struct list_head leds; /* * Interrupt number for this PHY * -1 means no interrupt */ int irq; /* private data pointer */ /* For use by PHYs to maintain extra state */ void *priv; /* shared data pointer */ /* For use by PHYs inside the same package that need a shared state. */ struct phy_package_shared *shared; /* Reporting cable test results */ struct sk_buff *skb; void *ehdr; struct nlattr *nest; /* Interrupt and Polling infrastructure */ struct delayed_work state_queue; struct mutex lock; /* This may be modified under the rtnl lock */ bool sfp_bus_attached; struct sfp_bus *sfp_bus; struct phylink *phylink; struct net_device *attached_dev; struct mii_timestamper *mii_ts; struct pse_control *psec; u8 mdix; u8 mdix_ctrl; int pma_extable; unsigned int link_down_events; void (*phy_link_change)(struct phy_device *phydev, bool up); void (*adjust_link)(struct net_device *dev); #if IS_ENABLED(CONFIG_MACSEC) /* MACsec management functions */ const struct macsec_ops *macsec_ops; #endif }; /* Generic phy_device::dev_flags */ #define PHY_F_NO_IRQ 0x80000000 #define PHY_F_RXC_ALWAYS_ON 0x40000000 static inline struct phy_device *to_phy_device(const struct device *dev) { return container_of(to_mdio_device(dev), struct phy_device, mdio); } /** * struct phy_tdr_config - Configuration of a TDR raw test * * @first: Distance for first data collection point * @last: Distance for last data collection point * @step: Step between data collection points * @pair: Bitmap of cable pairs to collect data for * * A structure containing possible configuration parameters * for a TDR cable test. The driver does not need to implement * all the parameters, but should report what is actually used. * All distances are in centimeters. */ struct phy_tdr_config { u32 first; u32 last; u32 step; s8 pair; }; #define PHY_PAIR_ALL -1 /** * struct phy_plca_cfg - Configuration of the PLCA (Physical Layer Collision * Avoidance) Reconciliation Sublayer. * * @version: read-only PLCA register map version. -1 = not available. Ignored * when setting the configuration. Format is the same as reported by the PLCA * IDVER register (31.CA00). -1 = not available. * @enabled: PLCA configured mode (enabled/disabled). -1 = not available / don't * set. 0 = disabled, anything else = enabled. * @node_id: the PLCA local node identifier. -1 = not available / don't set. * Allowed values [0 .. 254]. 255 = node disabled. * @node_cnt: the PLCA node count (maximum number of nodes having a TO). Only * meaningful for the coordinator (node_id = 0). -1 = not available / don't * set. Allowed values [1 .. 255]. * @to_tmr: The value of the PLCA to_timer in bit-times, which determines the * PLCA transmit opportunity window opening. See IEEE802.3 Clause 148 for * more details. The to_timer shall be set equal over all nodes. * -1 = not available / don't set. Allowed values [0 .. 255]. * @burst_cnt: controls how many additional frames a node is allowed to send in * single transmit opportunity (TO). The default value of 0 means that the * node is allowed exactly one frame per TO. A value of 1 allows two frames * per TO, and so on. -1 = not available / don't set. * Allowed values [0 .. 255]. * @burst_tmr: controls how many bit times to wait for the MAC to send a new * frame before interrupting the burst. This value should be set to a value * greater than the MAC inter-packet gap (which is typically 96 bits). * -1 = not available / don't set. Allowed values [0 .. 255]. * * A structure containing configuration parameters for setting/getting the PLCA * RS configuration. The driver does not need to implement all the parameters, * but should report what is actually used. */ struct phy_plca_cfg { int version; int enabled; int node_id; int node_cnt; int to_tmr; int burst_cnt; int burst_tmr; }; /** * struct phy_plca_status - Status of the PLCA (Physical Layer Collision * Avoidance) Reconciliation Sublayer. * * @pst: The PLCA status as reported by the PST bit in the PLCA STATUS * register(31.CA03), indicating BEACON activity. * * A structure containing status information of the PLCA RS configuration. * The driver does not need to implement all the parameters, but should report * what is actually used. */ struct phy_plca_status { bool pst; }; /* Modes for PHY LED configuration */ enum phy_led_modes { PHY_LED_ACTIVE_HIGH = 0, PHY_LED_ACTIVE_LOW = 1, PHY_LED_INACTIVE_HIGH_IMPEDANCE = 2, /* keep it last */ __PHY_LED_MODES_NUM, }; /** * struct phy_led: An LED driven by the PHY * * @list: List of LEDs * @phydev: PHY this LED is attached to * @led_cdev: Standard LED class structure * @index: Number of the LED */ struct phy_led { struct list_head list; struct phy_device *phydev; struct led_classdev led_cdev; u8 index; }; #define to_phy_led(d) container_of(d, struct phy_led, led_cdev) /** * struct phy_driver - Driver structure for a particular PHY type * * @mdiodrv: Data common to all MDIO devices * @phy_id: The result of reading the UID registers of this PHY * type, and ANDing them with the phy_id_mask. This driver * only works for PHYs with IDs which match this field * @name: The friendly name of this PHY type * @phy_id_mask: Defines the important bits of the phy_id * @features: A mandatory list of features (speed, duplex, etc) * supported by this PHY * @flags: A bitfield defining certain other features this PHY * supports (like interrupts) * @driver_data: Static driver data * * All functions are optional. If config_aneg or read_status * are not implemented, the phy core uses the genphy versions. * Note that none of these functions should be called from * interrupt time. The goal is for the bus read/write functions * to be able to block when the bus transaction is happening, * and be freed up by an interrupt (The MPC85xx has this ability, * though it is not currently supported in the driver). */ struct phy_driver { struct mdio_driver_common mdiodrv; u32 phy_id; char *name; u32 phy_id_mask; const unsigned long * const features; u32 flags; const void *driver_data; /** * @soft_reset: Called to issue a PHY software reset */ int (*soft_reset)(struct phy_device *phydev); /** * @config_init: Called to initialize the PHY, * including after a reset */ int (*config_init)(struct phy_device *phydev); /** * @probe: Called during discovery. Used to set * up device-specific structures, if any */ int (*probe)(struct phy_device *phydev); /** * @get_features: Probe the hardware to determine what * abilities it has. Should only set phydev->supported. */ int (*get_features)(struct phy_device *phydev); /** * @get_rate_matching: Get the supported type of rate matching for a * particular phy interface. This is used by phy consumers to determine * whether to advertise lower-speed modes for that interface. It is * assumed that if a rate matching mode is supported on an interface, * then that interface's rate can be adapted to all slower link speeds * supported by the phy. If the interface is not supported, this should * return %RATE_MATCH_NONE. */ int (*get_rate_matching)(struct phy_device *phydev, phy_interface_t iface); /* PHY Power Management */ /** @suspend: Suspend the hardware, saving state if needed */ int (*suspend)(struct phy_device *phydev); /** @resume: Resume the hardware, restoring state if needed */ int (*resume)(struct phy_device *phydev); /** * @config_aneg: Configures the advertisement and resets * autonegotiation if phydev->autoneg is on, * forces the speed to the current settings in phydev * if phydev->autoneg is off */ int (*config_aneg)(struct phy_device *phydev); /** @aneg_done: Determines the auto negotiation result */ int (*aneg_done)(struct phy_device *phydev); /** @read_status: Determines the negotiated speed and duplex */ int (*read_status)(struct phy_device *phydev); /** * @config_intr: Enables or disables interrupts. * It should also clear any pending interrupts prior to enabling the * IRQs and after disabling them. */ int (*config_intr)(struct phy_device *phydev); /** @handle_interrupt: Override default interrupt handling */ irqreturn_t (*handle_interrupt)(struct phy_device *phydev); /** @remove: Clears up any memory if needed */ void (*remove)(struct phy_device *phydev); /** * @match_phy_device: Returns true if this is a suitable * driver for the given phydev. If NULL, matching is based on * phy_id and phy_id_mask. */ int (*match_phy_device)(struct phy_device *phydev); /** * @set_wol: Some devices (e.g. qnap TS-119P II) require PHY * register changes to enable Wake on LAN, so set_wol is * provided to be called in the ethernet driver's set_wol * function. */ int (*set_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); /** * @get_wol: See set_wol, but for checking whether Wake on LAN * is enabled. */ void (*get_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); /** * @link_change_notify: Called to inform a PHY device driver * when the core is about to change the link state. This * callback is supposed to be used as fixup hook for drivers * that need to take action when the link state * changes. Drivers are by no means allowed to mess with the * PHY device structure in their implementations. */ void (*link_change_notify)(struct phy_device *dev); /** * @read_mmd: PHY specific driver override for reading a MMD * register. This function is optional for PHY specific * drivers. When not provided, the default MMD read function * will be used by phy_read_mmd(), which will use either a * direct read for Clause 45 PHYs or an indirect read for * Clause 22 PHYs. devnum is the MMD device number within the * PHY device, regnum is the register within the selected MMD * device. */ int (*read_mmd)(struct phy_device *dev, int devnum, u16 regnum); /** * @write_mmd: PHY specific driver override for writing a MMD * register. This function is optional for PHY specific * drivers. When not provided, the default MMD write function * will be used by phy_write_mmd(), which will use either a * direct write for Clause 45 PHYs, or an indirect write for * Clause 22 PHYs. devnum is the MMD device number within the * PHY device, regnum is the register within the selected MMD * device. val is the value to be written. */ int (*write_mmd)(struct phy_device *dev, int devnum, u16 regnum, u16 val); /** @read_page: Return the current PHY register page number */ int (*read_page)(struct phy_device *dev); /** @write_page: Set the current PHY register page number */ int (*write_page)(struct phy_device *dev, int page); /** * @module_info: Get the size and type of the eeprom contained * within a plug-in module */ int (*module_info)(struct phy_device *dev, struct ethtool_modinfo *modinfo); /** * @module_eeprom: Get the eeprom information from the plug-in * module */ int (*module_eeprom)(struct phy_device *dev, struct ethtool_eeprom *ee, u8 *data); /** @cable_test_start: Start a cable test */ int (*cable_test_start)(struct phy_device *dev); /** @cable_test_tdr_start: Start a raw TDR cable test */ int (*cable_test_tdr_start)(struct phy_device *dev, const struct phy_tdr_config *config); /** * @cable_test_get_status: Once per second, or on interrupt, * request the status of the test. */ int (*cable_test_get_status)(struct phy_device *dev, bool *finished); /* Get statistics from the PHY using ethtool */ /** @get_sset_count: Number of statistic counters */ int (*get_sset_count)(struct phy_device *dev); /** @get_strings: Names of the statistic counters */ void (*get_strings)(struct phy_device *dev, u8 *data); /** @get_stats: Return the statistic counter values */ void (*get_stats)(struct phy_device *dev, struct ethtool_stats *stats, u64 *data); /* Get and Set PHY tunables */ /** @get_tunable: Return the value of a tunable */ int (*get_tunable)(struct phy_device *dev, struct ethtool_tunable *tuna, void *data); /** @set_tunable: Set the value of a tunable */ int (*set_tunable)(struct phy_device *dev, struct ethtool_tunable *tuna, const void *data); /** @set_loopback: Set the loopback mood of the PHY */ int (*set_loopback)(struct phy_device *dev, bool enable); /** @get_sqi: Get the signal quality indication */ int (*get_sqi)(struct phy_device *dev); /** @get_sqi_max: Get the maximum signal quality indication */ int (*get_sqi_max)(struct phy_device *dev); /* PLCA RS interface */ /** @get_plca_cfg: Return the current PLCA configuration */ int (*get_plca_cfg)(struct phy_device *dev, struct phy_plca_cfg *plca_cfg); /** @set_plca_cfg: Set the PLCA configuration */ int (*set_plca_cfg)(struct phy_device *dev, const struct phy_plca_cfg *plca_cfg); /** @get_plca_status: Return the current PLCA status info */ int (*get_plca_status)(struct phy_device *dev, struct phy_plca_status *plca_st); /** * @led_brightness_set: Set a PHY LED brightness. Index * indicates which of the PHYs led should be set. Value * follows the standard LED class meaning, e.g. LED_OFF, * LED_HALF, LED_FULL. */ int (*led_brightness_set)(struct phy_device *dev, u8 index, enum led_brightness value); /** * @led_blink_set: Set a PHY LED blinking. Index indicates * which of the PHYs led should be configured to blink. Delays * are in milliseconds and if both are zero then a sensible * default should be chosen. The call should adjust the * timings in that case and if it can't match the values * specified exactly. */ int (*led_blink_set)(struct phy_device *dev, u8 index, unsigned long *delay_on, unsigned long *delay_off); /** * @led_hw_is_supported: Can the HW support the given rules. * @dev: PHY device which has the LED * @index: Which LED of the PHY device * @rules The core is interested in these rules * * Return 0 if yes, -EOPNOTSUPP if not, or an error code. */ int (*led_hw_is_supported)(struct phy_device *dev, u8 index, unsigned long rules); /** * @led_hw_control_set: Set the HW to control the LED * @dev: PHY device which has the LED * @index: Which LED of the PHY device * @rules The rules used to control the LED * * Returns 0, or a an error code. */ int (*led_hw_control_set)(struct phy_device *dev, u8 index, unsigned long rules); /** * @led_hw_control_get: Get how the HW is controlling the LED * @dev: PHY device which has the LED * @index: Which LED of the PHY device * @rules Pointer to the rules used to control the LED * * Set *@rules to how the HW is currently blinking. Returns 0 * on success, or a error code if the current blinking cannot * be represented in rules, or some other error happens. */ int (*led_hw_control_get)(struct phy_device *dev, u8 index, unsigned long *rules); /** * @led_polarity_set: Set the LED polarity modes * @dev: PHY device which has the LED * @index: Which LED of the PHY device * @modes: bitmap of LED polarity modes * * Configure LED with all the required polarity modes in @modes * to make it correctly turn ON or OFF. * * Returns 0, or an error code. */ int (*led_polarity_set)(struct phy_device *dev, int index, unsigned long modes); }; #define to_phy_driver(d) container_of_const(to_mdio_common_driver(d), \ struct phy_driver, mdiodrv) #define PHY_ANY_ID "MATCH ANY PHY" #define PHY_ANY_UID 0xffffffff #define PHY_ID_MATCH_EXACT(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 0) #define PHY_ID_MATCH_MODEL(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 4) #define PHY_ID_MATCH_VENDOR(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 10) /** * phy_id_compare - compare @id1 with @id2 taking account of @mask * @id1: first PHY ID * @id2: second PHY ID * @mask: the PHY ID mask, set bits are significant in matching * * Return true if the bits from @id1 and @id2 specified by @mask match. * This uses an equivalent test to (@id & @mask) == (@phy_id & @mask). */ static inline bool phy_id_compare(u32 id1, u32 id2, u32 mask) { return !((id1 ^ id2) & mask); } /** * phydev_id_compare - compare @id with the PHY's Clause 22 ID * @phydev: the PHY device * @id: the PHY ID to be matched * * Compare the @phydev clause 22 ID with the provided @id and return true or * false depending whether it matches, using the bound driver mask. The * @phydev must be bound to a driver. */ static inline bool phydev_id_compare(struct phy_device *phydev, u32 id) { return phy_id_compare(id, phydev->phy_id, phydev->drv->phy_id_mask); } /* A Structure for boards to register fixups with the PHY Lib */ struct phy_fixup { struct list_head list; char bus_id[MII_BUS_ID_SIZE + 3]; u32 phy_uid; u32 phy_uid_mask; int (*run)(struct phy_device *phydev); }; const char *phy_speed_to_str(int speed); const char *phy_duplex_to_str(unsigned int duplex); const char *phy_rate_matching_to_str(int rate_matching); int phy_interface_num_ports(phy_interface_t interface); /* A structure for mapping a particular speed and duplex * combination to a particular SUPPORTED and ADVERTISED value */ struct phy_setting { u32 speed; u8 duplex; u8 bit; }; const struct phy_setting * phy_lookup_setting(int speed, int duplex, const unsigned long *mask, bool exact); size_t phy_speeds(unsigned int *speeds, size_t size, unsigned long *mask); void of_set_phy_supported(struct phy_device *phydev); void of_set_phy_eee_broken(struct phy_device *phydev); void of_set_phy_timing_role(struct phy_device *phydev); int phy_speed_down_core(struct phy_device *phydev); /** * phy_set_eee_broken - Mark an EEE mode as broken so that it isn't advertised. * @phydev: The phy_device struct * @link_mode: The broken EEE mode */ static inline void phy_set_eee_broken(struct phy_device *phydev, u32 link_mode) { linkmode_set_bit(link_mode, phydev->eee_broken_modes); } /** * phy_is_started - Convenience function to check whether PHY is started * @phydev: The phy_device struct */ static inline bool phy_is_started(struct phy_device *phydev) { return phydev->state >= PHY_UP; } void phy_resolve_aneg_pause(struct phy_device *phydev); void phy_resolve_aneg_linkmode(struct phy_device *phydev); void phy_check_downshift(struct phy_device *phydev); /** * phy_read - Convenience function for reading a given PHY register * @phydev: the phy_device struct * @regnum: register number to read * * NOTE: MUST NOT be called from interrupt context, * because the bus read/write functions may wait for an interrupt * to conclude the operation. */ static inline int phy_read(struct phy_device *phydev, u32 regnum) { return mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, regnum); } #define phy_read_poll_timeout(phydev, regnum, val, cond, sleep_us, \ timeout_us, sleep_before_read) \ ({ \ int __ret, __val; \ __ret = read_poll_timeout(__val = phy_read, val, \ __val < 0 || (cond), \ sleep_us, timeout_us, sleep_before_read, phydev, regnum); \ if (__val < 0) \ __ret = __val; \ if (__ret) \ phydev_err(phydev, "%s failed: %d\n", __func__, __ret); \ __ret; \ }) /** * __phy_read - convenience function for reading a given PHY register * @phydev: the phy_device struct * @regnum: register number to read * * The caller must have taken the MDIO bus lock. */ static inline int __phy_read(struct phy_device *phydev, u32 regnum) { return __mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, regnum); } /** * phy_write - Convenience function for writing a given PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: value to write to @regnum * * NOTE: MUST NOT be called from interrupt context, * because the bus read/write functions may wait for an interrupt * to conclude the operation. */ static inline int phy_write(struct phy_device *phydev, u32 regnum, u16 val) { return mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, regnum, val); } /** * __phy_write - Convenience function for writing a given PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: value to write to @regnum * * The caller must have taken the MDIO bus lock. */ static inline int __phy_write(struct phy_device *phydev, u32 regnum, u16 val) { return __mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, regnum, val); } /** * __phy_modify_changed() - Convenience function for modifying a PHY register * @phydev: a pointer to a &struct phy_device * @regnum: register number * @mask: bit mask of bits to clear * @set: bit mask of bits to set * * Unlocked helper function which allows a PHY register to be modified as * new register value = (old register value & ~mask) | set * * Returns negative errno, 0 if there was no change, and 1 in case of change */ static inline int __phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask, u16 set) { return __mdiobus_modify_changed(phydev->mdio.bus, phydev->mdio.addr, regnum, mask, set); } /* * phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. */ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); /** * phy_read_mmd_poll_timeout - Periodically poll a PHY register until a * condition is met or a timeout occurs * * @phydev: The phy_device struct * @devaddr: The MMD to read from * @regnum: The register on the MMD to read * @val: Variable to read the register into * @cond: Break condition (usually involving @val) * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please * read usleep_range() function description for details and * limitations. * @timeout_us: Timeout in us, 0 means never timeout * @sleep_before_read: if it is true, sleep @sleep_us before read. * * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either * case, the last read value at @args is stored in @val. Must not * be called from atomic context if sleep_us or timeout_us are used. */ #define phy_read_mmd_poll_timeout(phydev, devaddr, regnum, val, cond, \ sleep_us, timeout_us, sleep_before_read) \ ({ \ int __ret, __val; \ __ret = read_poll_timeout(__val = phy_read_mmd, val, \ __val < 0 || (cond), \ sleep_us, timeout_us, sleep_before_read, \ phydev, devaddr, regnum); \ if (__val < 0) \ __ret = __val; \ if (__ret) \ phydev_err(phydev, "%s failed: %d\n", __func__, __ret); \ __ret; \ }) /* * __phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. */ int __phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); /* * phy_write_mmd - Convenience function for writing a register * on an MMD on a given PHY. */ int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val); /* * __phy_write_mmd - Convenience function for writing a register * on an MMD on a given PHY. */ int __phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val); int __phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask, u16 set); int phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask, u16 set); int __phy_modify(struct phy_device *phydev, u32 regnum, u16 mask, u16 set); int phy_modify(struct phy_device *phydev, u32 regnum, u16 mask, u16 set); int __phy_modify_mmd_changed(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set); int phy_modify_mmd_changed(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set); int __phy_modify_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set); int phy_modify_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set); /** * __phy_set_bits - Convenience function for setting bits in a PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: bits to set * * The caller must have taken the MDIO bus lock. */ static inline int __phy_set_bits(struct phy_device *phydev, u32 regnum, u16 val) { return __phy_modify(phydev, regnum, 0, val); } /** * __phy_clear_bits - Convenience function for clearing bits in a PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: bits to clear * * The caller must have taken the MDIO bus lock. */ static inline int __phy_clear_bits(struct phy_device *phydev, u32 regnum, u16 val) { return __phy_modify(phydev, regnum, val, 0); } /** * phy_set_bits - Convenience function for setting bits in a PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: bits to set */ static inline int phy_set_bits(struct phy_device *phydev, u32 regnum, u16 val) { return phy_modify(phydev, regnum, 0, val); } /** * phy_clear_bits - Convenience function for clearing bits in a PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: bits to clear */ static inline int phy_clear_bits(struct phy_device *phydev, u32 regnum, u16 val) { return phy_modify(phydev, regnum, val, 0); } /** * __phy_set_bits_mmd - Convenience function for setting bits in a register * on MMD * @phydev: the phy_device struct * @devad: the MMD containing register to modify * @regnum: register number to modify * @val: bits to set * * The caller must have taken the MDIO bus lock. */ static inline int __phy_set_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) { return __phy_modify_mmd(phydev, devad, regnum, 0, val); } /** * __phy_clear_bits_mmd - Convenience function for clearing bits in a register * on MMD * @phydev: the phy_device struct * @devad: the MMD containing register to modify * @regnum: register number to modify * @val: bits to clear * * The caller must have taken the MDIO bus lock. */ static inline int __phy_clear_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) { return __phy_modify_mmd(phydev, devad, regnum, val, 0); } /** * phy_set_bits_mmd - Convenience function for setting bits in a register * on MMD * @phydev: the phy_device struct * @devad: the MMD containing register to modify * @regnum: register number to modify * @val: bits to set */ static inline int phy_set_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) { return phy_modify_mmd(phydev, devad, regnum, 0, val); } /** * phy_clear_bits_mmd - Convenience function for clearing bits in a register * on MMD * @phydev: the phy_device struct * @devad: the MMD containing register to modify * @regnum: register number to modify * @val: bits to clear */ static inline int phy_clear_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) { return phy_modify_mmd(phydev, devad, regnum, val, 0); } /** * phy_interrupt_is_valid - Convenience function for testing a given PHY irq * @phydev: the phy_device struct * * NOTE: must be kept in sync with addition/removal of PHY_POLL and * PHY_MAC_INTERRUPT */ static inline bool phy_interrupt_is_valid(struct phy_device *phydev) { return phydev->irq != PHY_POLL && phydev->irq != PHY_MAC_INTERRUPT; } /** * phy_polling_mode - Convenience function for testing whether polling is * used to detect PHY status changes * @phydev: the phy_device struct */ static inline bool phy_polling_mode(struct phy_device *phydev) { if (phydev->state == PHY_CABLETEST) if (phydev->drv->flags & PHY_POLL_CABLE_TEST) return true; return phydev->irq == PHY_POLL; } /** * phy_has_hwtstamp - Tests whether a PHY time stamp configuration. * @phydev: the phy_device struct */ static inline bool phy_has_hwtstamp(struct phy_device *phydev) { return phydev && phydev->mii_ts && phydev->mii_ts->hwtstamp; } /** * phy_has_rxtstamp - Tests whether a PHY supports receive time stamping. * @phydev: the phy_device struct */ static inline bool phy_has_rxtstamp(struct phy_device *phydev) { return phydev && phydev->mii_ts && phydev->mii_ts->rxtstamp; } /** * phy_has_tsinfo - Tests whether a PHY reports time stamping and/or * PTP hardware clock capabilities. * @phydev: the phy_device struct */ static inline bool phy_has_tsinfo(struct phy_device *phydev) { return phydev && phydev->mii_ts && phydev->mii_ts->ts_info; } /** * phy_has_txtstamp - Tests whether a PHY supports transmit time stamping. * @phydev: the phy_device struct */ static inline bool phy_has_txtstamp(struct phy_device *phydev) { return phydev && phydev->mii_ts && phydev->mii_ts->txtstamp; } static inline int phy_hwtstamp(struct phy_device *phydev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { return phydev->mii_ts->hwtstamp(phydev->mii_ts, cfg, extack); } static inline bool phy_rxtstamp(struct phy_device *phydev, struct sk_buff *skb, int type) { return phydev->mii_ts->rxtstamp(phydev->mii_ts, skb, type); } static inline int phy_ts_info(struct phy_device *phydev, struct kernel_ethtool_ts_info *tsinfo) { return phydev->mii_ts->ts_info(phydev->mii_ts, tsinfo); } static inline void phy_txtstamp(struct phy_device *phydev, struct sk_buff *skb, int type) { phydev->mii_ts->txtstamp(phydev->mii_ts, skb, type); } /** * phy_is_default_hwtstamp - Is the PHY hwtstamp the default timestamp * @phydev: Pointer to phy_device * * This is used to get default timestamping device taking into account * the new API choice, which is selecting the timestamping from MAC by * default if the phydev does not have default_timestamp flag enabled. * * Return: True if phy is the default hw timestamp, false otherwise. */ static inline bool phy_is_default_hwtstamp(struct phy_device *phydev) { return phy_has_hwtstamp(phydev) && phydev->default_timestamp; } /** * phy_is_internal - Convenience function for testing if a PHY is internal * @phydev: the phy_device struct */ static inline bool phy_is_internal(struct phy_device *phydev) { return phydev->is_internal; } /** * phy_on_sfp - Convenience function for testing if a PHY is on an SFP module * @phydev: the phy_device struct */ static inline bool phy_on_sfp(struct phy_device *phydev) { return phydev->is_on_sfp_module; } /** * phy_interface_mode_is_rgmii - Convenience function for testing if a * PHY interface mode is RGMII (all variants) * @mode: the &phy_interface_t enum */ static inline bool phy_interface_mode_is_rgmii(phy_interface_t mode) { return mode >= PHY_INTERFACE_MODE_RGMII && mode <= PHY_INTERFACE_MODE_RGMII_TXID; }; /** * phy_interface_mode_is_8023z() - does the PHY interface mode use 802.3z * negotiation * @mode: one of &enum phy_interface_t * * Returns true if the PHY interface mode uses the 16-bit negotiation * word as defined in 802.3z. (See 802.3-2015 37.2.1 Config_Reg encoding) */ static inline bool phy_interface_mode_is_8023z(phy_interface_t mode) { return mode == PHY_INTERFACE_MODE_1000BASEX || mode == PHY_INTERFACE_MODE_2500BASEX; } /** * phy_interface_is_rgmii - Convenience function for testing if a PHY interface * is RGMII (all variants) * @phydev: the phy_device struct */ static inline bool phy_interface_is_rgmii(struct phy_device *phydev) { return phy_interface_mode_is_rgmii(phydev->interface); }; /** * phy_is_pseudo_fixed_link - Convenience function for testing if this * PHY is the CPU port facing side of an Ethernet switch, or similar. * @phydev: the phy_device struct */ static inline bool phy_is_pseudo_fixed_link(struct phy_device *phydev) { return phydev->is_pseudo_fixed_link; } int phy_save_page(struct phy_device *phydev); int phy_select_page(struct phy_device *phydev, int page); int phy_restore_page(struct phy_device *phydev, int oldpage, int ret); int phy_read_paged(struct phy_device *phydev, int page, u32 regnum); int phy_write_paged(struct phy_device *phydev, int page, u32 regnum, u16 val); int phy_modify_paged_changed(struct phy_device *phydev, int page, u32 regnum, u16 mask, u16 set); int phy_modify_paged(struct phy_device *phydev, int page, u32 regnum, u16 mask, u16 set); struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id, bool is_c45, struct phy_c45_device_ids *c45_ids); #if IS_ENABLED(CONFIG_PHYLIB) int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id); struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode); struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode); struct phy_device *device_phy_find_device(struct device *dev); struct fwnode_handle *fwnode_get_phy_node(const struct fwnode_handle *fwnode); struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45); int phy_device_register(struct phy_device *phy); void phy_device_free(struct phy_device *phydev); #else static inline int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id) { return 0; } static inline struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode) { return 0; } static inline struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode) { return NULL; } static inline struct phy_device *device_phy_find_device(struct device *dev) { return NULL; } static inline struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode) { return NULL; } static inline struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45) { return NULL; } static inline int phy_device_register(struct phy_device *phy) { return 0; } static inline void phy_device_free(struct phy_device *phydev) { } #endif /* CONFIG_PHYLIB */ void phy_device_remove(struct phy_device *phydev); int phy_get_c45_ids(struct phy_device *phydev); int phy_init_hw(struct phy_device *phydev); int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); int __phy_resume(struct phy_device *phydev); int phy_loopback(struct phy_device *phydev, bool enable); int phy_sfp_connect_phy(void *upstream, struct phy_device *phy); void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy); void phy_sfp_attach(void *upstream, struct sfp_bus *bus); void phy_sfp_detach(void *upstream, struct sfp_bus *bus); int phy_sfp_probe(struct phy_device *phydev, const struct sfp_upstream_ops *ops); struct phy_device *phy_attach(struct net_device *dev, const char *bus_id, phy_interface_t interface); struct phy_device *phy_find_first(struct mii_bus *bus); int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, u32 flags, phy_interface_t interface); int phy_connect_direct(struct net_device *dev, struct phy_device *phydev, void (*handler)(struct net_device *), phy_interface_t interface); struct phy_device *phy_connect(struct net_device *dev, const char *bus_id, void (*handler)(struct net_device *), phy_interface_t interface); void phy_disconnect(struct phy_device *phydev); void phy_detach(struct phy_device *phydev); void phy_start(struct phy_device *phydev); void phy_stop(struct phy_device *phydev); int phy_config_aneg(struct phy_device *phydev); int _phy_start_aneg(struct phy_device *phydev); int phy_start_aneg(struct phy_device *phydev); int phy_aneg_done(struct phy_device *phydev); int phy_speed_down(struct phy_device *phydev, bool sync); int phy_speed_up(struct phy_device *phydev); bool phy_check_valid(int speed, int duplex, unsigned long *features); int phy_restart_aneg(struct phy_device *phydev); int phy_reset_after_clk_enable(struct phy_device *phydev); #if IS_ENABLED(CONFIG_PHYLIB) int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack); int phy_start_cable_test_tdr(struct phy_device *phydev, struct netlink_ext_ack *extack, const struct phy_tdr_config *config); #else static inline int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); return -EOPNOTSUPP; } static inline int phy_start_cable_test_tdr(struct phy_device *phydev, struct netlink_ext_ack *extack, const struct phy_tdr_config *config) { NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); return -EOPNOTSUPP; } #endif static inline void phy_device_reset(struct phy_device *phydev, int value) { mdio_device_reset(&phydev->mdio, value); } #define phydev_err(_phydev, format, args...) \ dev_err(&_phydev->mdio.dev, format, ##args) #define phydev_err_probe(_phydev, err, format, args...) \ dev_err_probe(&_phydev->mdio.dev, err, format, ##args) #define phydev_info(_phydev, format, args...) \ dev_info(&_phydev->mdio.dev, format, ##args) #define phydev_warn(_phydev, format, args...) \ dev_warn(&_phydev->mdio.dev, format, ##args) #define phydev_dbg(_phydev, format, args...) \ dev_dbg(&_phydev->mdio.dev, format, ##args) static inline const char *phydev_name(const struct phy_device *phydev) { return dev_name(&phydev->mdio.dev); } static inline void phy_lock_mdio_bus(struct phy_device *phydev) { mutex_lock(&phydev->mdio.bus->mdio_lock); } static inline void phy_unlock_mdio_bus(struct phy_device *phydev) { mutex_unlock(&phydev->mdio.bus->mdio_lock); } void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) __printf(2, 3); char *phy_attached_info_irq(struct phy_device *phydev) __malloc; void phy_attached_info(struct phy_device *phydev); /* Clause 22 PHY */ int genphy_read_abilities(struct phy_device *phydev); int genphy_setup_forced(struct phy_device *phydev); int genphy_restart_aneg(struct phy_device *phydev); int genphy_check_and_restart_aneg(struct phy_device *phydev, bool restart); int __genphy_config_aneg(struct phy_device *phydev, bool changed); int genphy_aneg_done(struct phy_device *phydev); int genphy_update_link(struct phy_device *phydev); int genphy_read_lpa(struct phy_device *phydev); int genphy_read_status_fixed(struct phy_device *phydev); int genphy_read_status(struct phy_device *phydev); int genphy_read_master_slave(struct phy_device *phydev); int genphy_suspend(struct phy_device *phydev); int genphy_resume(struct phy_device *phydev); int genphy_loopback(struct phy_device *phydev, bool enable); int genphy_soft_reset(struct phy_device *phydev); irqreturn_t genphy_handle_interrupt_no_ack(struct phy_device *phydev); static inline int genphy_config_aneg(struct phy_device *phydev) { return __genphy_config_aneg(phydev, false); } static inline int genphy_no_config_intr(struct phy_device *phydev) { return 0; } int genphy_read_mmd_unsupported(struct phy_device *phdev, int devad, u16 regnum); int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum, u16 regnum, u16 val); /* Clause 37 */ int genphy_c37_config_aneg(struct phy_device *phydev); int genphy_c37_read_status(struct phy_device *phydev, bool *changed); /* Clause 45 PHY */ int genphy_c45_restart_aneg(struct phy_device *phydev); int genphy_c45_check_and_restart_aneg(struct phy_device *phydev, bool restart); int genphy_c45_aneg_done(struct phy_device *phydev); int genphy_c45_read_link(struct phy_device *phydev); int genphy_c45_read_lpa(struct phy_device *phydev); int genphy_c45_read_pma(struct phy_device *phydev); int genphy_c45_pma_setup_forced(struct phy_device *phydev); int genphy_c45_pma_baset1_setup_master_slave(struct phy_device *phydev); int genphy_c45_an_config_aneg(struct phy_device *phydev); int genphy_c45_an_disable_aneg(struct phy_device *phydev); int genphy_c45_read_mdix(struct phy_device *phydev); int genphy_c45_pma_read_abilities(struct phy_device *phydev); int genphy_c45_pma_read_ext_abilities(struct phy_device *phydev); int genphy_c45_pma_baset1_read_abilities(struct phy_device *phydev); int genphy_c45_read_eee_abilities(struct phy_device *phydev); int genphy_c45_pma_baset1_read_master_slave(struct phy_device *phydev); int genphy_c45_read_status(struct phy_device *phydev); int genphy_c45_baset1_read_status(struct phy_device *phydev); int genphy_c45_config_aneg(struct phy_device *phydev); int genphy_c45_loopback(struct phy_device *phydev, bool enable); int genphy_c45_pma_resume(struct phy_device *phydev); int genphy_c45_pma_suspend(struct phy_device *phydev); int genphy_c45_fast_retrain(struct phy_device *phydev, bool enable); int genphy_c45_plca_get_cfg(struct phy_device *phydev, struct phy_plca_cfg *plca_cfg); int genphy_c45_plca_set_cfg(struct phy_device *phydev, const struct phy_plca_cfg *plca_cfg); int genphy_c45_plca_get_status(struct phy_device *phydev, struct phy_plca_status *plca_st); int genphy_c45_eee_is_active(struct phy_device *phydev, unsigned long *adv, unsigned long *lp, bool *is_enabled); int genphy_c45_ethtool_get_eee(struct phy_device *phydev, struct ethtool_keee *data); int genphy_c45_ethtool_set_eee(struct phy_device *phydev, struct ethtool_keee *data); int genphy_c45_an_config_eee_aneg(struct phy_device *phydev); int genphy_c45_read_eee_adv(struct phy_device *phydev, unsigned long *adv); /* Generic C45 PHY driver */ extern struct phy_driver genphy_c45_driver; /* The gen10g_* functions are the old Clause 45 stub */ int gen10g_config_aneg(struct phy_device *phydev); static inline int phy_read_status(struct phy_device *phydev) { if (!phydev->drv) return -EIO; if (phydev->drv->read_status) return phydev->drv->read_status(phydev); else return genphy_read_status(phydev); } void phy_driver_unregister(struct phy_driver *drv); void phy_drivers_unregister(struct phy_driver *drv, int n); int phy_driver_register(struct phy_driver *new_driver, struct module *owner); int phy_drivers_register(struct phy_driver *new_driver, int n, struct module *owner); void phy_error(struct phy_device *phydev); void phy_state_machine(struct work_struct *work); void phy_queue_state_machine(struct phy_device *phydev, unsigned long jiffies); void phy_trigger_machine(struct phy_device *phydev); void phy_mac_interrupt(struct phy_device *phydev); void phy_start_machine(struct phy_device *phydev); void phy_stop_machine(struct phy_device *phydev); void phy_ethtool_ksettings_get(struct phy_device *phydev, struct ethtool_link_ksettings *cmd); int phy_ethtool_ksettings_set(struct phy_device *phydev, const struct ethtool_link_ksettings *cmd); int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd); int phy_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); int phy_do_ioctl_running(struct net_device *dev, struct ifreq *ifr, int cmd); int phy_disable_interrupts(struct phy_device *phydev); void phy_request_interrupt(struct phy_device *phydev); void phy_free_interrupt(struct phy_device *phydev); void phy_print_status(struct phy_device *phydev); int phy_get_rate_matching(struct phy_device *phydev, phy_interface_t iface); void phy_set_max_speed(struct phy_device *phydev, u32 max_speed); void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode); void phy_advertise_supported(struct phy_device *phydev); void phy_advertise_eee_all(struct phy_device *phydev); void phy_support_sym_pause(struct phy_device *phydev); void phy_support_asym_pause(struct phy_device *phydev); void phy_support_eee(struct phy_device *phydev); void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx, bool autoneg); void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx); bool phy_validate_pause(struct phy_device *phydev, struct ethtool_pauseparam *pp); void phy_get_pause(struct phy_device *phydev, bool *tx_pause, bool *rx_pause); s32 phy_get_internal_delay(struct phy_device *phydev, struct device *dev, const int *delay_values, int size, bool is_rx); void phy_resolve_pause(unsigned long *local_adv, unsigned long *partner_adv, bool *tx_pause, bool *rx_pause); int phy_register_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask, int (*run)(struct phy_device *)); int phy_register_fixup_for_id(const char *bus_id, int (*run)(struct phy_device *)); int phy_register_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask, int (*run)(struct phy_device *)); int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask); int phy_unregister_fixup_for_id(const char *bus_id); int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask); int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable); int phy_get_eee_err(struct phy_device *phydev); int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_keee *data); int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_keee *data); int phy_ethtool_set_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol); void phy_ethtool_get_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol); int phy_ethtool_get_link_ksettings(struct net_device *ndev, struct ethtool_link_ksettings *cmd); int phy_ethtool_set_link_ksettings(struct net_device *ndev, const struct ethtool_link_ksettings *cmd); int phy_ethtool_nway_reset(struct net_device *ndev); int phy_package_join(struct phy_device *phydev, int base_addr, size_t priv_size); int of_phy_package_join(struct phy_device *phydev, size_t priv_size); void phy_package_leave(struct phy_device *phydev); int devm_phy_package_join(struct device *dev, struct phy_device *phydev, int base_addr, size_t priv_size); int devm_of_phy_package_join(struct device *dev, struct phy_device *phydev, size_t priv_size); int __init mdio_bus_init(void); void mdio_bus_exit(void); int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data); int phy_ethtool_get_sset_count(struct phy_device *phydev); int phy_ethtool_get_stats(struct phy_device *phydev, struct ethtool_stats *stats, u64 *data); int phy_ethtool_get_plca_cfg(struct phy_device *phydev, struct phy_plca_cfg *plca_cfg); int phy_ethtool_set_plca_cfg(struct phy_device *phydev, const struct phy_plca_cfg *plca_cfg, struct netlink_ext_ack *extack); int phy_ethtool_get_plca_status(struct phy_device *phydev, struct phy_plca_status *plca_st); int __phy_hwtstamp_get(struct phy_device *phydev, struct kernel_hwtstamp_config *config); int __phy_hwtstamp_set(struct phy_device *phydev, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); static inline int phy_package_address(struct phy_device *phydev, unsigned int addr_offset) { struct phy_package_shared *shared = phydev->shared; u8 base_addr = shared->base_addr; if (addr_offset >= PHY_MAX_ADDR - base_addr) return -EIO; /* we know that addr will be in the range 0..31 and thus the * implicit cast to a signed int is not a problem. */ return base_addr + addr_offset; } static inline int phy_package_read(struct phy_device *phydev, unsigned int addr_offset, u32 regnum) { int addr = phy_package_address(phydev, addr_offset); if (addr < 0) return addr; return mdiobus_read(phydev->mdio.bus, addr, regnum); } static inline int __phy_package_read(struct phy_device *phydev, unsigned int addr_offset, u32 regnum) { int addr = phy_package_address(phydev, addr_offset); if (addr < 0) return addr; return __mdiobus_read(phydev->mdio.bus, addr, regnum); } static inline int phy_package_write(struct phy_device *phydev, unsigned int addr_offset, u32 regnum, u16 val) { int addr = phy_package_address(phydev, addr_offset); if (addr < 0) return addr; return mdiobus_write(phydev->mdio.bus, addr, regnum, val); } static inline int __phy_package_write(struct phy_device *phydev, unsigned int addr_offset, u32 regnum, u16 val) { int addr = phy_package_address(phydev, addr_offset); if (addr < 0) return addr; return __mdiobus_write(phydev->mdio.bus, addr, regnum, val); } int __phy_package_read_mmd(struct phy_device *phydev, unsigned int addr_offset, int devad, u32 regnum); int phy_package_read_mmd(struct phy_device *phydev, unsigned int addr_offset, int devad, u32 regnum); int __phy_package_write_mmd(struct phy_device *phydev, unsigned int addr_offset, int devad, u32 regnum, u16 val); int phy_package_write_mmd(struct phy_device *phydev, unsigned int addr_offset, int devad, u32 regnum, u16 val); static inline bool __phy_package_set_once(struct phy_device *phydev, unsigned int b) { struct phy_package_shared *shared = phydev->shared; if (!shared) return false; return !test_and_set_bit(b, &shared->flags); } static inline bool phy_package_init_once(struct phy_device *phydev) { return __phy_package_set_once(phydev, PHY_SHARED_F_INIT_DONE); } static inline bool phy_package_probe_once(struct phy_device *phydev) { return __phy_package_set_once(phydev, PHY_SHARED_F_PROBE_DONE); } extern const struct bus_type mdio_bus_type; struct mdio_board_info { const char *bus_id; char modalias[MDIO_NAME_SIZE]; int mdio_addr; const void *platform_data; }; #if IS_ENABLED(CONFIG_MDIO_DEVICE) int mdiobus_register_board_info(const struct mdio_board_info *info, unsigned int n); #else static inline int mdiobus_register_board_info(const struct mdio_board_info *i, unsigned int n) { return 0; } #endif /** * phy_module_driver() - Helper macro for registering PHY drivers * @__phy_drivers: array of PHY drivers to register * @__count: Numbers of members in array * * Helper macro for PHY drivers which do not do anything special in module * init/exit. Each module may only use this macro once, and calling it * replaces module_init() and module_exit(). */ #define phy_module_driver(__phy_drivers, __count) \ static int __init phy_module_init(void) \ { \ return phy_drivers_register(__phy_drivers, __count, THIS_MODULE); \ } \ module_init(phy_module_init); \ static void __exit phy_module_exit(void) \ { \ phy_drivers_unregister(__phy_drivers, __count); \ } \ module_exit(phy_module_exit) #define module_phy_driver(__phy_drivers) \ phy_module_driver(__phy_drivers, ARRAY_SIZE(__phy_drivers)) bool phy_driver_is_genphy(struct phy_device *phydev); bool phy_driver_is_genphy_10g(struct phy_device *phydev); #endif /* __PHY_H */
1 6 1 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #ifndef _NET_BATMAN_ADV_ORIGINATOR_H_ #define _NET_BATMAN_ADV_ORIGINATOR_H_ #include "main.h" #include <linux/compiler.h> #include <linux/if_ether.h> #include <linux/jhash.h> #include <linux/kref.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/types.h> bool batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); void batadv_purge_orig_ref(struct batadv_priv *bat_priv); void batadv_orig_node_release(struct kref *ref); struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const u8 *addr); struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); void batadv_hardif_neigh_release(struct kref *ref); struct batadv_neigh_node * batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node, struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); void batadv_neigh_node_release(struct kref *ref); struct batadv_neigh_node * batadv_orig_router_get(struct batadv_orig_node *orig_node, const struct batadv_hard_iface *if_outgoing); struct batadv_neigh_node * batadv_orig_to_router(struct batadv_priv *bat_priv, u8 *orig_addr, struct batadv_hard_iface *if_outgoing); struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing); struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing); void batadv_neigh_ifinfo_release(struct kref *ref); int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb); struct batadv_orig_ifinfo * batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing); struct batadv_orig_ifinfo * batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing); void batadv_orig_ifinfo_release(struct kref *ref); int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb); struct batadv_orig_node_vlan * batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, unsigned short vid); struct batadv_orig_node_vlan * batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, unsigned short vid); void batadv_orig_node_vlan_release(struct kref *ref); /** * batadv_choose_orig() - Return the index of the orig entry in the hash table * @data: mac address of the originator node * @size: the size of the hash table * * Return: the hash index where the object represented by @data should be * stored at. */ static inline u32 batadv_choose_orig(const void *data, u32 size) { u32 hash = 0; hash = jhash(data, ETH_ALEN, hash); return hash % size; } struct batadv_orig_node * batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data); /** * batadv_orig_node_vlan_put() - decrement the refcounter and possibly release * the originator-vlan object * @orig_vlan: the originator-vlan object to release */ static inline void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan) { if (!orig_vlan) return; kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release); } /** * batadv_neigh_ifinfo_put() - decrement the refcounter and possibly release * the neigh_ifinfo * @neigh_ifinfo: the neigh_ifinfo object to release */ static inline void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo) { if (!neigh_ifinfo) return; kref_put(&neigh_ifinfo->refcount, batadv_neigh_ifinfo_release); } /** * batadv_hardif_neigh_put() - decrement the hardif neighbors refcounter * and possibly release it * @hardif_neigh: hardif neigh neighbor to free */ static inline void batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh) { if (!hardif_neigh) return; kref_put(&hardif_neigh->refcount, batadv_hardif_neigh_release); } /** * batadv_neigh_node_put() - decrement the neighbors refcounter and possibly * release it * @neigh_node: neigh neighbor to free */ static inline void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) { if (!neigh_node) return; kref_put(&neigh_node->refcount, batadv_neigh_node_release); } /** * batadv_orig_ifinfo_put() - decrement the refcounter and possibly release * the orig_ifinfo * @orig_ifinfo: the orig_ifinfo object to release */ static inline void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo) { if (!orig_ifinfo) return; kref_put(&orig_ifinfo->refcount, batadv_orig_ifinfo_release); } /** * batadv_orig_node_put() - decrement the orig node refcounter and possibly * release it * @orig_node: the orig node to free */ static inline void batadv_orig_node_put(struct batadv_orig_node *orig_node) { if (!orig_node) return; kref_put(&orig_node->refcount, batadv_orig_node_release); } #endif /* _NET_BATMAN_ADV_ORIGINATOR_H_ */
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 // SPDX-License-Identifier: GPL-2.0-only /* * ebt_pkttype * * Authors: * Bart De Schuymer <bdschuym@pandora.be> * * April, 2003 * */ #include <linux/module.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_pkttype.h> static bool ebt_pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct ebt_pkttype_info *info = par->matchinfo; return (skb->pkt_type == info->pkt_type) ^ info->invert; } static int ebt_pkttype_mt_check(const struct xt_mtchk_param *par) { const struct ebt_pkttype_info *info = par->matchinfo; if (info->invert != 0 && info->invert != 1) return -EINVAL; /* Allow any pkt_type value */ return 0; } static struct xt_match ebt_pkttype_mt_reg __read_mostly = { .name = "pkttype", .revision = 0, .family = NFPROTO_BRIDGE, .match = ebt_pkttype_mt, .checkentry = ebt_pkttype_mt_check, .matchsize = sizeof(struct ebt_pkttype_info), .me = THIS_MODULE, }; static int __init ebt_pkttype_init(void) { return xt_register_match(&ebt_pkttype_mt_reg); } static void __exit ebt_pkttype_fini(void) { xt_unregister_match(&ebt_pkttype_mt_reg); } module_init(ebt_pkttype_init); module_exit(ebt_pkttype_fini); MODULE_DESCRIPTION("Ebtables: Link layer packet type match"); MODULE_LICENSE("GPL");
182 20 20 13 17 13 15 16 16 15 27 7 28 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * NET Generic infrastructure for Network protocols. * * Definitions for request_sock * * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * From code originally in include/net/tcp.h */ #ifndef _REQUEST_SOCK_H #define _REQUEST_SOCK_H #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/bug.h> #include <linux/refcount.h> #include <net/sock.h> #include <net/rstreason.h> struct request_sock; struct sk_buff; struct dst_entry; struct proto; struct request_sock_ops { int family; unsigned int obj_size; struct kmem_cache *slab; char *slab_name; int (*rtx_syn_ack)(const struct sock *sk, struct request_sock *req); void (*send_ack)(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); void (*send_reset)(const struct sock *sk, struct sk_buff *skb, enum sk_rst_reason reason); void (*destructor)(struct request_sock *req); void (*syn_ack_timeout)(const struct request_sock *req); }; int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req); struct saved_syn { u32 mac_hdrlen; u32 network_hdrlen; u32 tcp_hdrlen; u8 data[]; }; /* struct request_sock - mini sock to represent a connection request */ struct request_sock { struct sock_common __req_common; #define rsk_refcnt __req_common.skc_refcnt #define rsk_hash __req_common.skc_hash #define rsk_listener __req_common.skc_listener #define rsk_window_clamp __req_common.skc_window_clamp #define rsk_rcv_wnd __req_common.skc_rcv_wnd struct request_sock *dl_next; u16 mss; u8 num_retrans; /* number of retransmits */ u8 syncookie:1; /* True if * 1) tcpopts needs to be encoded in * TS of SYN+ACK * 2) ACK is validated by BPF kfunc. */ u8 num_timeout:7; /* number of timeouts */ u32 ts_recent; struct timer_list rsk_timer; const struct request_sock_ops *rsk_ops; struct sock *sk; struct saved_syn *saved_syn; u32 secid; u32 peer_secid; u32 timeout; }; static inline struct request_sock *inet_reqsk(const struct sock *sk) { return (struct request_sock *)sk; } static inline struct sock *req_to_sk(struct request_sock *req) { return (struct sock *)req; } /** * skb_steal_sock - steal a socket from an sk_buff * @skb: sk_buff to steal the socket from * @refcounted: is set to true if the socket is reference-counted * @prefetched: is set to true if the socket was assigned from bpf */ static inline struct sock *skb_steal_sock(struct sk_buff *skb, bool *refcounted, bool *prefetched) { struct sock *sk = skb->sk; if (!sk) { *prefetched = false; *refcounted = false; return NULL; } *prefetched = skb_sk_is_prefetched(skb); if (*prefetched) { #if IS_ENABLED(CONFIG_SYN_COOKIES) if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { struct request_sock *req = inet_reqsk(sk); *refcounted = false; sk = req->rsk_listener; req->rsk_listener = NULL; return sk; } #endif *refcounted = sk_is_refcounted(sk); } else { *refcounted = true; } skb->destructor = NULL; skb->sk = NULL; return sk; } static inline void __reqsk_free(struct request_sock *req) { req->rsk_ops->destructor(req); if (req->rsk_listener) sock_put(req->rsk_listener); kfree(req->saved_syn); kmem_cache_free(req->rsk_ops->slab, req); } static inline void reqsk_free(struct request_sock *req) { DEBUG_NET_WARN_ON_ONCE(refcount_read(&req->rsk_refcnt) != 0); __reqsk_free(req); } static inline void reqsk_put(struct request_sock *req) { if (refcount_dec_and_test(&req->rsk_refcnt)) __reqsk_free(req); } /* * For a TCP Fast Open listener - * lock - protects the access to all the reqsk, which is co-owned by * the listener and the child socket. * qlen - pending TFO requests (still in TCP_SYN_RECV). * max_qlen - max TFO reqs allowed before TFO is disabled. * * XXX (TFO) - ideally these fields can be made as part of "listen_sock" * structure above. But there is some implementation difficulty due to * listen_sock being part of request_sock_queue hence will be freed when * a listener is stopped. But TFO related fields may continue to be * accessed even after a listener is closed, until its sk_refcnt drops * to 0 implying no more outstanding TFO reqs. One solution is to keep * listen_opt around until sk_refcnt drops to 0. But there is some other * complexity that needs to be resolved. E.g., a listener can be disabled * temporarily through shutdown()->tcp_disconnect(), and re-enabled later. */ struct fastopen_queue { struct request_sock *rskq_rst_head; /* Keep track of past TFO */ struct request_sock *rskq_rst_tail; /* requests that caused RST. * This is part of the defense * against spoofing attack. */ spinlock_t lock; int qlen; /* # of pending (TCP_SYN_RECV) reqs */ int max_qlen; /* != 0 iff TFO is currently enabled */ struct tcp_fastopen_context __rcu *ctx; /* cipher context for cookie */ }; /** struct request_sock_queue - queue of request_socks * * @rskq_accept_head - FIFO head of established children * @rskq_accept_tail - FIFO tail of established children * @rskq_defer_accept - User waits for some data after accept() * */ struct request_sock_queue { spinlock_t rskq_lock; u8 rskq_defer_accept; u32 synflood_warned; atomic_t qlen; atomic_t young; struct request_sock *rskq_accept_head; struct request_sock *rskq_accept_tail; struct fastopen_queue fastopenq; /* Check max_qlen != 0 to determine * if TFO is enabled. */ }; void reqsk_queue_alloc(struct request_sock_queue *queue); void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, bool reset); static inline bool reqsk_queue_empty(const struct request_sock_queue *queue) { return READ_ONCE(queue->rskq_accept_head) == NULL; } static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue *queue, struct sock *parent) { struct request_sock *req; spin_lock_bh(&queue->rskq_lock); req = queue->rskq_accept_head; if (req) { sk_acceptq_removed(parent); WRITE_ONCE(queue->rskq_accept_head, req->dl_next); if (queue->rskq_accept_head == NULL) queue->rskq_accept_tail = NULL; } spin_unlock_bh(&queue->rskq_lock); return req; } static inline void reqsk_queue_removed(struct request_sock_queue *queue, const struct request_sock *req) { if (req->num_timeout == 0) atomic_dec(&queue->young); atomic_dec(&queue->qlen); } static inline void reqsk_queue_added(struct request_sock_queue *queue) { atomic_inc(&queue->young); atomic_inc(&queue->qlen); } static inline int reqsk_queue_len(const struct request_sock_queue *queue) { return atomic_read(&queue->qlen); } static inline int reqsk_queue_len_young(const struct request_sock_queue *queue) { return atomic_read(&queue->young); } /* RFC 7323 2.3 Using the Window Scale Option * The window field (SEG.WND) of every outgoing segment, with the * exception of <SYN> segments, MUST be right-shifted by * Rcv.Wind.Shift bits. * * This means the SEG.WND carried in SYNACK can not exceed 65535. * We use this property to harden TCP stack while in NEW_SYN_RECV state. */ static inline u32 tcp_synack_window(const struct request_sock *req) { return min(req->rsk_rcv_wnd, 65535U); } #endif /* _REQUEST_SOCK_H */
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 /* SPDX-License-Identifier: GPL-2.0-only */ /* * File: pep.h * * Phonet Pipe End Point sockets definitions * * Copyright (C) 2008 Nokia Corporation. */ #ifndef NET_PHONET_PEP_H #define NET_PHONET_PEP_H #include <linux/skbuff.h> #include <net/phonet/phonet.h> struct pep_sock { struct pn_sock pn_sk; /* XXX: union-ify listening vs connected stuff ? */ /* Listening socket stuff: */ struct hlist_head hlist; /* Connected socket stuff: */ struct sock *listener; struct sk_buff_head ctrlreq_queue; #define PNPIPE_CTRLREQ_MAX 10 atomic_t tx_credits; int ifindex; u16 peer_type; /* peer type/subtype */ u8 pipe_handle; u8 rx_credits; u8 rx_fc; /* RX flow control */ u8 tx_fc; /* TX flow control */ u8 init_enable; /* auto-enable at creation */ u8 aligned; }; static inline struct pep_sock *pep_sk(struct sock *sk) { return (struct pep_sock *)sk; } extern const struct proto_ops phonet_stream_ops; /* Pipe protocol definitions */ struct pnpipehdr { u8 utid; /* transaction ID */ u8 message_id; u8 pipe_handle; union { u8 state_after_connect; /* connect request */ u8 state_after_reset; /* reset request */ u8 error_code; /* any response */ u8 pep_type; /* status indication */ u8 data0; /* anything else */ }; u8 data[]; }; #define other_pep_type data[0] static inline struct pnpipehdr *pnp_hdr(struct sk_buff *skb) { return (struct pnpipehdr *)skb_transport_header(skb); } #define MAX_PNPIPE_HEADER (MAX_PHONET_HEADER + 4) enum { PNS_PIPE_CREATE_REQ = 0x00, PNS_PIPE_CREATE_RESP, PNS_PIPE_REMOVE_REQ, PNS_PIPE_REMOVE_RESP, PNS_PIPE_DATA = 0x20, PNS_PIPE_ALIGNED_DATA, PNS_PEP_CONNECT_REQ = 0x40, PNS_PEP_CONNECT_RESP, PNS_PEP_DISCONNECT_REQ, PNS_PEP_DISCONNECT_RESP, PNS_PEP_RESET_REQ, PNS_PEP_RESET_RESP, PNS_PEP_ENABLE_REQ, PNS_PEP_ENABLE_RESP, PNS_PEP_CTRL_REQ, PNS_PEP_CTRL_RESP, PNS_PEP_DISABLE_REQ = 0x4C, PNS_PEP_DISABLE_RESP, PNS_PEP_STATUS_IND = 0x60, PNS_PIPE_CREATED_IND, PNS_PIPE_RESET_IND = 0x63, PNS_PIPE_ENABLED_IND, PNS_PIPE_REDIRECTED_IND, PNS_PIPE_DISABLED_IND = 0x66, }; #define PN_PIPE_INVALID_HANDLE 0xff #define PN_PEP_TYPE_COMMON 0x00 /* Phonet pipe status indication */ enum { PN_PEP_IND_FLOW_CONTROL, PN_PEP_IND_ID_MCFC_GRANT_CREDITS, }; /* Phonet pipe error codes */ enum { PN_PIPE_NO_ERROR, PN_PIPE_ERR_INVALID_PARAM, PN_PIPE_ERR_INVALID_HANDLE, PN_PIPE_ERR_INVALID_CTRL_ID, PN_PIPE_ERR_NOT_ALLOWED, PN_PIPE_ERR_PEP_IN_USE, PN_PIPE_ERR_OVERLOAD, PN_PIPE_ERR_DEV_DISCONNECTED, PN_PIPE_ERR_TIMEOUT, PN_PIPE_ERR_ALL_PIPES_IN_USE, PN_PIPE_ERR_GENERAL, PN_PIPE_ERR_NOT_SUPPORTED, }; /* Phonet pipe states */ enum { PN_PIPE_DISABLE, PN_PIPE_ENABLE, }; /* Phonet pipe sub-block types */ enum { PN_PIPE_SB_CREATE_REQ_PEP_SUB_TYPE, PN_PIPE_SB_CONNECT_REQ_PEP_SUB_TYPE, PN_PIPE_SB_REDIRECT_REQ_PEP_SUB_TYPE, PN_PIPE_SB_NEGOTIATED_FC, PN_PIPE_SB_REQUIRED_FC_TX, PN_PIPE_SB_PREFERRED_FC_RX, PN_PIPE_SB_ALIGNED_DATA, }; /* Phonet pipe flow control models */ enum { PN_NO_FLOW_CONTROL, PN_LEGACY_FLOW_CONTROL, PN_ONE_CREDIT_FLOW_CONTROL, PN_MULTI_CREDIT_FLOW_CONTROL, PN_MAX_FLOW_CONTROL, }; #define pn_flow_safe(fc) ((fc) >> 1) /* Phonet pipe flow control states */ enum { PEP_IND_EMPTY, PEP_IND_BUSY, PEP_IND_READY, }; #endif
886 879 1 1 1 1 8 1 4 3 6 1 7 1 6 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 // SPDX-License-Identifier: GPL-2.0-or-later /* * "TEE" target extension for Xtables * Copyright © Sebastian Claßen, 2007 * Jan Engelhardt, 2007-2010 * * based on ipt_ROUTE.c from Cédric de Launois * <delaunois@info.ucl.be> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/route.h> #include <linux/netfilter/x_tables.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/route.h> #include <net/netfilter/ipv4/nf_dup_ipv4.h> #include <net/netfilter/ipv6/nf_dup_ipv6.h> #include <linux/netfilter/xt_TEE.h> struct xt_tee_priv { struct list_head list; struct xt_tee_tginfo *tginfo; int oif; }; static unsigned int tee_net_id __read_mostly; static const union nf_inet_addr tee_zero_address; struct tee_net { struct list_head priv_list; /* lock protects the priv_list */ struct mutex lock; }; static unsigned int tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; int oif = info->priv ? info->priv->oif : 0; nf_dup_ipv4(xt_net(par), skb, xt_hooknum(par), &info->gw.in, oif); return XT_CONTINUE; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; int oif = info->priv ? info->priv->oif : 0; nf_dup_ipv6(xt_net(par), skb, xt_hooknum(par), &info->gw.in6, oif); return XT_CONTINUE; } #endif static int tee_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct tee_net *tn = net_generic(net, tee_net_id); struct xt_tee_priv *priv; mutex_lock(&tn->lock); list_for_each_entry(priv, &tn->priv_list, list) { switch (event) { case NETDEV_REGISTER: if (!strcmp(dev->name, priv->tginfo->oif)) priv->oif = dev->ifindex; break; case NETDEV_UNREGISTER: if (dev->ifindex == priv->oif) priv->oif = -1; break; case NETDEV_CHANGENAME: if (!strcmp(dev->name, priv->tginfo->oif)) priv->oif = dev->ifindex; else if (dev->ifindex == priv->oif) priv->oif = -1; break; } } mutex_unlock(&tn->lock); return NOTIFY_DONE; } static int tee_tg_check(const struct xt_tgchk_param *par) { struct tee_net *tn = net_generic(par->net, tee_net_id); struct xt_tee_tginfo *info = par->targinfo; struct xt_tee_priv *priv; /* 0.0.0.0 and :: not allowed */ if (memcmp(&info->gw, &tee_zero_address, sizeof(tee_zero_address)) == 0) return -EINVAL; if (info->oif[0]) { struct net_device *dev; if (info->oif[sizeof(info->oif)-1] != '\0') return -EINVAL; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (priv == NULL) return -ENOMEM; priv->tginfo = info; priv->oif = -1; info->priv = priv; dev = dev_get_by_name(par->net, info->oif); if (dev) { priv->oif = dev->ifindex; dev_put(dev); } mutex_lock(&tn->lock); list_add(&priv->list, &tn->priv_list); mutex_unlock(&tn->lock); } else info->priv = NULL; static_key_slow_inc(&xt_tee_enabled); return 0; } static void tee_tg_destroy(const struct xt_tgdtor_param *par) { struct tee_net *tn = net_generic(par->net, tee_net_id); struct xt_tee_tginfo *info = par->targinfo; if (info->priv) { mutex_lock(&tn->lock); list_del(&info->priv->list); mutex_unlock(&tn->lock); kfree(info->priv); } static_key_slow_dec(&xt_tee_enabled); } static struct xt_target tee_tg_reg[] __read_mostly = { { .name = "TEE", .revision = 1, .family = NFPROTO_IPV4, .target = tee_tg4, .targetsize = sizeof(struct xt_tee_tginfo), .usersize = offsetof(struct xt_tee_tginfo, priv), .checkentry = tee_tg_check, .destroy = tee_tg_destroy, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "TEE", .revision = 1, .family = NFPROTO_IPV6, .target = tee_tg6, .targetsize = sizeof(struct xt_tee_tginfo), .usersize = offsetof(struct xt_tee_tginfo, priv), .checkentry = tee_tg_check, .destroy = tee_tg_destroy, .me = THIS_MODULE, }, #endif }; static int __net_init tee_net_init(struct net *net) { struct tee_net *tn = net_generic(net, tee_net_id); INIT_LIST_HEAD(&tn->priv_list); mutex_init(&tn->lock); return 0; } static struct pernet_operations tee_net_ops = { .init = tee_net_init, .id = &tee_net_id, .size = sizeof(struct tee_net), }; static struct notifier_block tee_netdev_notifier = { .notifier_call = tee_netdev_event, }; static int __init tee_tg_init(void) { int ret; ret = register_pernet_subsys(&tee_net_ops); if (ret < 0) return ret; ret = xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); if (ret < 0) goto cleanup_subsys; ret = register_netdevice_notifier(&tee_netdev_notifier); if (ret < 0) goto unregister_targets; return 0; unregister_targets: xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); cleanup_subsys: unregister_pernet_subsys(&tee_net_ops); return ret; } static void __exit tee_tg_exit(void) { unregister_netdevice_notifier(&tee_netdev_notifier); xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); unregister_pernet_subsys(&tee_net_ops); } module_init(tee_tg_init); module_exit(tee_tg_exit); MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>"); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: Reroute packet copy"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_TEE"); MODULE_ALIAS("ip6t_TEE");
8 1 1 1 2 2 1 1 2 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 // SPDX-License-Identifier: GPL-2.0-or-later /* * Description: EBTables 802.1Q match extension kernelspace module. * Authors: Nick Fedchik <nick@fedchik.org.ua> * Bart De Schuymer <bdschuym@pandora.be> */ #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_vlan.h> #define MODULE_VERS "0.6" MODULE_AUTHOR("Nick Fedchik <nick@fedchik.org.ua>"); MODULE_DESCRIPTION("Ebtables: 802.1Q VLAN tag match"); MODULE_LICENSE("GPL"); #define GET_BITMASK(_BIT_MASK_) info->bitmask & _BIT_MASK_ #define EXIT_ON_MISMATCH(_MATCH_,_MASK_) {if (!((info->_MATCH_ == _MATCH_)^!!(info->invflags & _MASK_))) return false; } static bool ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct ebt_vlan_info *info = par->matchinfo; unsigned short TCI; /* Whole TCI, given from parsed frame */ unsigned short id; /* VLAN ID, given from frame TCI */ unsigned char prio; /* user_priority, given from frame TCI */ /* VLAN encapsulated Type/Length field, given from orig frame */ __be16 encap; if (skb_vlan_tag_present(skb)) { TCI = skb_vlan_tag_get(skb); encap = skb->protocol; } else { const struct vlan_hdr *fp; struct vlan_hdr _frame; fp = skb_header_pointer(skb, 0, sizeof(_frame), &_frame); if (fp == NULL) return false; TCI = ntohs(fp->h_vlan_TCI); encap = fp->h_vlan_encapsulated_proto; } /* Tag Control Information (TCI) consists of the following elements: * - User_priority. The user_priority field is three bits in length, * interpreted as a binary number. * - Canonical Format Indicator (CFI). The Canonical Format Indicator * (CFI) is a single bit flag value. Currently ignored. * - VLAN Identifier (VID). The VID is encoded as * an unsigned binary number. */ id = TCI & VLAN_VID_MASK; prio = (TCI >> 13) & 0x7; /* Checking VLAN Identifier (VID) */ if (GET_BITMASK(EBT_VLAN_ID)) EXIT_ON_MISMATCH(id, EBT_VLAN_ID); /* Checking user_priority */ if (GET_BITMASK(EBT_VLAN_PRIO)) EXIT_ON_MISMATCH(prio, EBT_VLAN_PRIO); /* Checking Encapsulated Proto (Length/Type) field */ if (GET_BITMASK(EBT_VLAN_ENCAP)) EXIT_ON_MISMATCH(encap, EBT_VLAN_ENCAP); return true; } static int ebt_vlan_mt_check(const struct xt_mtchk_param *par) { struct ebt_vlan_info *info = par->matchinfo; const struct ebt_entry *e = par->entryinfo; /* Is it 802.1Q frame checked? */ if (e->ethproto != htons(ETH_P_8021Q)) { pr_debug("passed entry proto %2.4X is not 802.1Q (8100)\n", ntohs(e->ethproto)); return -EINVAL; } /* Check for bitmask range * True if even one bit is out of mask */ if (info->bitmask & ~EBT_VLAN_MASK) { pr_debug("bitmask %2X is out of mask (%2X)\n", info->bitmask, EBT_VLAN_MASK); return -EINVAL; } /* Check for inversion flags range */ if (info->invflags & ~EBT_VLAN_MASK) { pr_debug("inversion flags %2X is out of mask (%2X)\n", info->invflags, EBT_VLAN_MASK); return -EINVAL; } /* Reserved VLAN ID (VID) values * ----------------------------- * 0 - The null VLAN ID. * 1 - The default Port VID (PVID) * 0x0FFF - Reserved for implementation use. * if_vlan.h: VLAN_N_VID 4096. */ if (GET_BITMASK(EBT_VLAN_ID)) { if (!!info->id) { /* if id!=0 => check vid range */ if (info->id > VLAN_N_VID) { pr_debug("id %d is out of range (1-4096)\n", info->id); return -EINVAL; } /* Note: This is valid VLAN-tagged frame point. * Any value of user_priority are acceptable, * but should be ignored according to 802.1Q Std. * So we just drop the prio flag. */ info->bitmask &= ~EBT_VLAN_PRIO; } /* Else, id=0 (null VLAN ID) => user_priority range (any?) */ } if (GET_BITMASK(EBT_VLAN_PRIO)) { if ((unsigned char) info->prio > 7) { pr_debug("prio %d is out of range (0-7)\n", info->prio); return -EINVAL; } } /* Check for encapsulated proto range - it is possible to be * any value for u_short range. * if_ether.h: ETH_ZLEN 60 - Min. octets in frame sans FCS */ if (GET_BITMASK(EBT_VLAN_ENCAP)) { if ((unsigned short) ntohs(info->encap) < ETH_ZLEN) { pr_debug("encap frame length %d is less than " "minimal\n", ntohs(info->encap)); return -EINVAL; } } return 0; } static struct xt_match ebt_vlan_mt_reg __read_mostly = { .name = "vlan", .revision = 0, .family = NFPROTO_BRIDGE, .match = ebt_vlan_mt, .checkentry = ebt_vlan_mt_check, .matchsize = sizeof(struct ebt_vlan_info), .me = THIS_MODULE, }; static int __init ebt_vlan_init(void) { pr_debug("ebtables 802.1Q extension module v" MODULE_VERS "\n"); return xt_register_match(&ebt_vlan_mt_reg); } static void __exit ebt_vlan_fini(void) { xt_unregister_match(&ebt_vlan_mt_reg); } module_init(ebt_vlan_init); module_exit(ebt_vlan_fini);
3 1 2 4 2 4 8 2 9 2 29 6 12 13 2 10 9 6 6 6 1 6 1 1 1 1 6 5 6 4 2 1 5 5 3 1 2 3 21 21 7 7 2 11 9 4 2 4 1 1 5 6 11 4 4 4 4 1 1 12 1 10 9 6 4 1 3 2 1 1 2 1 3 1 8 4 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2023 Isovalent */ #include <linux/bpf.h> #include <linux/bpf_mprog.h> static int bpf_mprog_link(struct bpf_tuple *tuple, u32 id_or_fd, u32 flags, enum bpf_prog_type type) { struct bpf_link *link = ERR_PTR(-EINVAL); bool id = flags & BPF_F_ID; if (id) link = bpf_link_by_id(id_or_fd); else if (id_or_fd) link = bpf_link_get_from_fd(id_or_fd); if (IS_ERR(link)) return PTR_ERR(link); if (type && link->prog->type != type) { bpf_link_put(link); return -EINVAL; } tuple->link = link; tuple->prog = link->prog; return 0; } static int bpf_mprog_prog(struct bpf_tuple *tuple, u32 id_or_fd, u32 flags, enum bpf_prog_type type) { struct bpf_prog *prog = ERR_PTR(-EINVAL); bool id = flags & BPF_F_ID; if (id) prog = bpf_prog_by_id(id_or_fd); else if (id_or_fd) prog = bpf_prog_get(id_or_fd); if (IS_ERR(prog)) return PTR_ERR(prog); if (type && prog->type != type) { bpf_prog_put(prog); return -EINVAL; } tuple->link = NULL; tuple->prog = prog; return 0; } static int bpf_mprog_tuple_relative(struct bpf_tuple *tuple, u32 id_or_fd, u32 flags, enum bpf_prog_type type) { bool link = flags & BPF_F_LINK; bool id = flags & BPF_F_ID; memset(tuple, 0, sizeof(*tuple)); if (link) return bpf_mprog_link(tuple, id_or_fd, flags, type); /* If no relevant flag is set and no id_or_fd was passed, then * tuple link/prog is just NULLed. This is the case when before/ * after selects first/last position without passing fd. */ if (!id && !id_or_fd) return 0; return bpf_mprog_prog(tuple, id_or_fd, flags, type); } static void bpf_mprog_tuple_put(struct bpf_tuple *tuple) { if (tuple->link) bpf_link_put(tuple->link); else if (tuple->prog) bpf_prog_put(tuple->prog); } /* The bpf_mprog_{replace,delete}() operate on exact idx position with the * one exception that for deletion we support delete from front/back. In * case of front idx is -1, in case of back idx is bpf_mprog_total(entry). * Adjustment to first and last entry is trivial. The bpf_mprog_insert() * we have to deal with the following cases: * * idx + before: * * Insert P4 before P3: idx for old array is 1, idx for new array is 2, * hence we adjust target idx for the new array, so that memmove copies * P1 and P2 to the new entry, and we insert P4 into idx 2. Inserting * before P1 would have old idx -1 and new idx 0. * * +--+--+--+ +--+--+--+--+ +--+--+--+--+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3| * +--+--+--+ +--+--+--+--+ +--+--+--+--+ * * idx + after: * * Insert P4 after P2: idx for old array is 2, idx for new array is 2. * Again, memmove copies P1 and P2 to the new entry, and we insert P4 * into idx 2. Inserting after P3 would have both old/new idx at 4 aka * bpf_mprog_total(entry). * * +--+--+--+ +--+--+--+--+ +--+--+--+--+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3| * +--+--+--+ +--+--+--+--+ +--+--+--+--+ */ static int bpf_mprog_replace(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_tuple *ntuple, int idx) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; struct bpf_prog *oprog; bpf_mprog_read(entry, idx, &fp, &cp); oprog = READ_ONCE(fp->prog); bpf_mprog_write(fp, cp, ntuple); if (!ntuple->link) { WARN_ON_ONCE(cp->link); bpf_prog_put(oprog); } *entry_new = entry; return 0; } static int bpf_mprog_insert(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_tuple *ntuple, int idx, u32 flags) { int total = bpf_mprog_total(entry); struct bpf_mprog_entry *peer; struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; peer = bpf_mprog_peer(entry); bpf_mprog_entry_copy(peer, entry); if (idx == total) goto insert; else if (flags & BPF_F_BEFORE) idx += 1; bpf_mprog_entry_grow(peer, idx); insert: bpf_mprog_read(peer, idx, &fp, &cp); bpf_mprog_write(fp, cp, ntuple); bpf_mprog_inc(peer); *entry_new = peer; return 0; } static int bpf_mprog_delete(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_tuple *dtuple, int idx) { int total = bpf_mprog_total(entry); struct bpf_mprog_entry *peer; peer = bpf_mprog_peer(entry); bpf_mprog_entry_copy(peer, entry); if (idx == -1) idx = 0; else if (idx == total) idx = total - 1; bpf_mprog_entry_shrink(peer, idx); bpf_mprog_dec(peer); bpf_mprog_mark_for_release(peer, dtuple); *entry_new = peer; return 0; } /* In bpf_mprog_pos_*() we evaluate the target position for the BPF * program/link that needs to be replaced, inserted or deleted for * each "rule" independently. If all rules agree on that position * or existing element, then enact replacement, addition or deletion. * If this is not the case, then the request cannot be satisfied and * we bail out with an error. */ static int bpf_mprog_pos_exact(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; int i; for (i = 0; i < bpf_mprog_total(entry); i++) { bpf_mprog_read(entry, i, &fp, &cp); if (tuple->prog == READ_ONCE(fp->prog)) return tuple->link == cp->link ? i : -EBUSY; } return -ENOENT; } static int bpf_mprog_pos_before(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; int i; for (i = 0; i < bpf_mprog_total(entry); i++) { bpf_mprog_read(entry, i, &fp, &cp); if (tuple->prog == READ_ONCE(fp->prog) && (!tuple->link || tuple->link == cp->link)) return i - 1; } return tuple->prog ? -ENOENT : -1; } static int bpf_mprog_pos_after(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; int i; for (i = 0; i < bpf_mprog_total(entry); i++) { bpf_mprog_read(entry, i, &fp, &cp); if (tuple->prog == READ_ONCE(fp->prog) && (!tuple->link || tuple->link == cp->link)) return i + 1; } return tuple->prog ? -ENOENT : bpf_mprog_total(entry); } int bpf_mprog_attach(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_prog *prog_new, struct bpf_link *link, struct bpf_prog *prog_old, u32 flags, u32 id_or_fd, u64 revision) { struct bpf_tuple rtuple, ntuple = { .prog = prog_new, .link = link, }, otuple = { .prog = prog_old, .link = link, }; int ret, idx = -ERANGE, tidx; if (revision && revision != bpf_mprog_revision(entry)) return -ESTALE; if (bpf_mprog_exists(entry, prog_new)) return -EEXIST; ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, flags & ~BPF_F_REPLACE, prog_new->type); if (ret) return ret; if (flags & BPF_F_REPLACE) { tidx = bpf_mprog_pos_exact(entry, &otuple); if (tidx < 0) { ret = tidx; goto out; } idx = tidx; } else if (bpf_mprog_total(entry) == bpf_mprog_max()) { ret = -ERANGE; goto out; } if (flags & BPF_F_BEFORE) { tidx = bpf_mprog_pos_before(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < -1 ? tidx : -ERANGE; goto out; } idx = tidx; } if (flags & BPF_F_AFTER) { tidx = bpf_mprog_pos_after(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < 0 ? tidx : -ERANGE; goto out; } idx = tidx; } if (idx < -1) { if (rtuple.prog || flags) { ret = -EINVAL; goto out; } idx = bpf_mprog_total(entry); flags = BPF_F_AFTER; } if (idx >= bpf_mprog_max()) { ret = -ERANGE; goto out; } if (flags & BPF_F_REPLACE) ret = bpf_mprog_replace(entry, entry_new, &ntuple, idx); else ret = bpf_mprog_insert(entry, entry_new, &ntuple, idx, flags); out: bpf_mprog_tuple_put(&rtuple); return ret; } static int bpf_mprog_fetch(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple, int idx) { int total = bpf_mprog_total(entry); struct bpf_mprog_cp *cp; struct bpf_mprog_fp *fp; struct bpf_prog *prog; struct bpf_link *link; if (idx == -1) idx = 0; else if (idx == total) idx = total - 1; bpf_mprog_read(entry, idx, &fp, &cp); prog = READ_ONCE(fp->prog); link = cp->link; /* The deletion request can either be without filled tuple in which * case it gets populated here based on idx, or with filled tuple * where the only thing we end up doing is the WARN_ON_ONCE() assert. * If we hit a BPF link at the given index, it must not be removed * from opts path. */ if (link && !tuple->link) return -EBUSY; WARN_ON_ONCE(tuple->prog && tuple->prog != prog); WARN_ON_ONCE(tuple->link && tuple->link != link); tuple->prog = prog; tuple->link = link; return 0; } int bpf_mprog_detach(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_prog *prog, struct bpf_link *link, u32 flags, u32 id_or_fd, u64 revision) { struct bpf_tuple rtuple, dtuple = { .prog = prog, .link = link, }; int ret, idx = -ERANGE, tidx; if (flags & BPF_F_REPLACE) return -EINVAL; if (revision && revision != bpf_mprog_revision(entry)) return -ESTALE; if (!bpf_mprog_total(entry)) return -ENOENT; ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, flags, prog ? prog->type : BPF_PROG_TYPE_UNSPEC); if (ret) return ret; if (dtuple.prog) { tidx = bpf_mprog_pos_exact(entry, &dtuple); if (tidx < 0) { ret = tidx; goto out; } idx = tidx; } if (flags & BPF_F_BEFORE) { tidx = bpf_mprog_pos_before(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < -1 ? tidx : -ERANGE; goto out; } idx = tidx; } if (flags & BPF_F_AFTER) { tidx = bpf_mprog_pos_after(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < 0 ? tidx : -ERANGE; goto out; } idx = tidx; } if (idx < -1) { if (rtuple.prog || flags) { ret = -EINVAL; goto out; } idx = bpf_mprog_total(entry); flags = BPF_F_AFTER; } if (idx >= bpf_mprog_max()) { ret = -ERANGE; goto out; } ret = bpf_mprog_fetch(entry, &dtuple, idx); if (ret) goto out; ret = bpf_mprog_delete(entry, entry_new, &dtuple, idx); out: bpf_mprog_tuple_put(&rtuple); return ret; } int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr, struct bpf_mprog_entry *entry) { u32 __user *uprog_flags, *ulink_flags; u32 __user *uprog_id, *ulink_id; struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; struct bpf_prog *prog; const u32 flags = 0; u32 id, count = 0; u64 revision = 1; int i, ret = 0; if (attr->query.query_flags || attr->query.attach_flags) return -EINVAL; if (entry) { revision = bpf_mprog_revision(entry); count = bpf_mprog_total(entry); } if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) return -EFAULT; if (copy_to_user(&uattr->query.count, &count, sizeof(count))) return -EFAULT; uprog_id = u64_to_user_ptr(attr->query.prog_ids); uprog_flags = u64_to_user_ptr(attr->query.prog_attach_flags); ulink_id = u64_to_user_ptr(attr->query.link_ids); ulink_flags = u64_to_user_ptr(attr->query.link_attach_flags); if (attr->query.count == 0 || !uprog_id || !count) return 0; if (attr->query.count < count) { count = attr->query.count; ret = -ENOSPC; } for (i = 0; i < bpf_mprog_max(); i++) { bpf_mprog_read(entry, i, &fp, &cp); prog = READ_ONCE(fp->prog); if (!prog) break; id = prog->aux->id; if (copy_to_user(uprog_id + i, &id, sizeof(id))) return -EFAULT; if (uprog_flags && copy_to_user(uprog_flags + i, &flags, sizeof(flags))) return -EFAULT; id = cp->link ? cp->link->id : 0; if (ulink_id && copy_to_user(ulink_id + i, &id, sizeof(id))) return -EFAULT; if (ulink_flags && copy_to_user(ulink_flags + i, &flags, sizeof(flags))) return -EFAULT; if (i + 1 == count) break; } return ret; }
1 1 1 1 1 1 1 1 1 44 44 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Authors: Sjur Brendeland * Daniel Martensson */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/fs.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/if_ether.h> #include <linux/ip.h> #include <linux/sched.h> #include <linux/sockios.h> #include <linux/caif/if_caif.h> #include <net/rtnetlink.h> #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/caif_dev.h> /* GPRS PDP connection has MTU to 1500 */ #define GPRS_PDP_MTU 1500 /* 5 sec. connect timeout */ #define CONNECT_TIMEOUT (5 * HZ) #define CAIF_NET_DEFAULT_QUEUE_LEN 500 #define UNDEF_CONNID 0xffffffff /*This list is protected by the rtnl lock. */ static LIST_HEAD(chnl_net_list); MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol GPRS network device"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("caif"); enum caif_states { CAIF_CONNECTED = 1, CAIF_CONNECTING, CAIF_DISCONNECTED, CAIF_SHUTDOWN }; struct chnl_net { struct cflayer chnl; struct caif_connect_request conn_req; struct list_head list_field; struct net_device *netdev; wait_queue_head_t netmgmt_wq; /* Flow status to remember and control the transmission. */ bool flowenabled; enum caif_states state; }; static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) { struct sk_buff *skb; struct chnl_net *priv; int pktlen; const u8 *ip_version; u8 buf; priv = container_of(layr, struct chnl_net, chnl); skb = (struct sk_buff *) cfpkt_tonative(pkt); /* Get length of CAIF packet. */ pktlen = skb->len; /* Pass some minimum information and * send the packet to the net stack. */ skb->dev = priv->netdev; /* check the version of IP */ ip_version = skb_header_pointer(skb, 0, 1, &buf); if (!ip_version) { kfree_skb(skb); return -EINVAL; } switch (*ip_version >> 4) { case 4: skb->protocol = htons(ETH_P_IP); break; case 6: skb->protocol = htons(ETH_P_IPV6); break; default: kfree_skb(skb); priv->netdev->stats.rx_errors++; return -EINVAL; } /* If we change the header in loop mode, the checksum is corrupted. */ if (priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP) skb->ip_summed = CHECKSUM_UNNECESSARY; else skb->ip_summed = CHECKSUM_NONE; netif_rx(skb); /* Update statistics. */ priv->netdev->stats.rx_packets++; priv->netdev->stats.rx_bytes += pktlen; return 0; } static int delete_device(struct chnl_net *dev) { ASSERT_RTNL(); if (dev->netdev) unregister_netdevice(dev->netdev); return 0; } static void close_work(struct work_struct *work) { struct chnl_net *dev = NULL; struct list_head *list_node; struct list_head *_tmp; rtnl_lock(); list_for_each_safe(list_node, _tmp, &chnl_net_list) { dev = list_entry(list_node, struct chnl_net, list_field); if (dev->state == CAIF_SHUTDOWN) dev_close(dev->netdev); } rtnl_unlock(); } static DECLARE_WORK(close_worker, close_work); static void chnl_hold(struct cflayer *lyr) { struct chnl_net *priv = container_of(lyr, struct chnl_net, chnl); dev_hold(priv->netdev); } static void chnl_put(struct cflayer *lyr) { struct chnl_net *priv = container_of(lyr, struct chnl_net, chnl); dev_put(priv->netdev); } static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow, int phyid) { struct chnl_net *priv = container_of(layr, struct chnl_net, chnl); pr_debug("NET flowctrl func called flow: %s\n", flow == CAIF_CTRLCMD_FLOW_ON_IND ? "ON" : flow == CAIF_CTRLCMD_INIT_RSP ? "INIT" : flow == CAIF_CTRLCMD_FLOW_OFF_IND ? "OFF" : flow == CAIF_CTRLCMD_DEINIT_RSP ? "CLOSE/DEINIT" : flow == CAIF_CTRLCMD_INIT_FAIL_RSP ? "OPEN_FAIL" : flow == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND ? "REMOTE_SHUTDOWN" : "UNKNOWN CTRL COMMAND"); switch (flow) { case CAIF_CTRLCMD_FLOW_OFF_IND: priv->flowenabled = false; netif_stop_queue(priv->netdev); break; case CAIF_CTRLCMD_DEINIT_RSP: priv->state = CAIF_DISCONNECTED; break; case CAIF_CTRLCMD_INIT_FAIL_RSP: priv->state = CAIF_DISCONNECTED; wake_up_interruptible(&priv->netmgmt_wq); break; case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: priv->state = CAIF_SHUTDOWN; netif_tx_disable(priv->netdev); schedule_work(&close_worker); break; case CAIF_CTRLCMD_FLOW_ON_IND: priv->flowenabled = true; netif_wake_queue(priv->netdev); break; case CAIF_CTRLCMD_INIT_RSP: caif_client_register_refcnt(&priv->chnl, chnl_hold, chnl_put); priv->state = CAIF_CONNECTED; priv->flowenabled = true; netif_wake_queue(priv->netdev); wake_up_interruptible(&priv->netmgmt_wq); break; default: break; } } static netdev_tx_t chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct chnl_net *priv; struct cfpkt *pkt = NULL; int len; int result = -1; /* Get our private data. */ priv = netdev_priv(dev); if (skb->len > priv->netdev->mtu) { pr_warn("Size of skb exceeded MTU\n"); kfree_skb(skb); dev->stats.tx_errors++; return NETDEV_TX_OK; } if (!priv->flowenabled) { pr_debug("dropping packets flow off\n"); kfree_skb(skb); dev->stats.tx_dropped++; return NETDEV_TX_OK; } if (priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP) swap(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); /* Store original SKB length. */ len = skb->len; pkt = cfpkt_fromnative(CAIF_DIR_OUT, (void *) skb); /* Send the packet down the stack. */ result = priv->chnl.dn->transmit(priv->chnl.dn, pkt); if (result) { dev->stats.tx_dropped++; return NETDEV_TX_OK; } /* Update statistics. */ dev->stats.tx_packets++; dev->stats.tx_bytes += len; return NETDEV_TX_OK; } static int chnl_net_open(struct net_device *dev) { struct chnl_net *priv = NULL; int result = -1; int llifindex, headroom, tailroom, mtu; struct net_device *lldev; ASSERT_RTNL(); priv = netdev_priv(dev); if (!priv) { pr_debug("chnl_net_open: no priv\n"); return -ENODEV; } if (priv->state != CAIF_CONNECTING) { priv->state = CAIF_CONNECTING; result = caif_connect_client(dev_net(dev), &priv->conn_req, &priv->chnl, &llifindex, &headroom, &tailroom); if (result != 0) { pr_debug("err: " "Unable to register and open device," " Err:%d\n", result); goto error; } lldev = __dev_get_by_index(dev_net(dev), llifindex); if (lldev == NULL) { pr_debug("no interface?\n"); result = -ENODEV; goto error; } dev->needed_tailroom = tailroom + lldev->needed_tailroom; dev->hard_header_len = headroom + lldev->hard_header_len + lldev->needed_tailroom; /* * MTU, head-room etc is not know before we have a * CAIF link layer device available. MTU calculation may * override initial RTNL configuration. * MTU is minimum of current mtu, link layer mtu pluss * CAIF head and tail, and PDP GPRS contexts max MTU. */ mtu = min_t(int, dev->mtu, lldev->mtu - (headroom + tailroom)); mtu = min_t(int, GPRS_PDP_MTU, mtu); dev_set_mtu(dev, mtu); if (mtu < 100) { pr_warn("CAIF Interface MTU too small (%d)\n", mtu); result = -ENODEV; goto error; } } rtnl_unlock(); /* Release RTNL lock during connect wait */ result = wait_event_interruptible_timeout(priv->netmgmt_wq, priv->state != CAIF_CONNECTING, CONNECT_TIMEOUT); rtnl_lock(); if (result == -ERESTARTSYS) { pr_debug("wait_event_interruptible woken by a signal\n"); result = -ERESTARTSYS; goto error; } if (result == 0) { pr_debug("connect timeout\n"); result = -ETIMEDOUT; goto error; } if (priv->state != CAIF_CONNECTED) { pr_debug("connect failed\n"); result = -ECONNREFUSED; goto error; } pr_debug("CAIF Netdevice connected\n"); return 0; error: caif_disconnect_client(dev_net(dev), &priv->chnl); priv->state = CAIF_DISCONNECTED; pr_debug("state disconnected\n"); return result; } static int chnl_net_stop(struct net_device *dev) { struct chnl_net *priv; ASSERT_RTNL(); priv = netdev_priv(dev); priv->state = CAIF_DISCONNECTED; caif_disconnect_client(dev_net(dev), &priv->chnl); return 0; } static int chnl_net_init(struct net_device *dev) { struct chnl_net *priv; ASSERT_RTNL(); priv = netdev_priv(dev); INIT_LIST_HEAD(&priv->list_field); return 0; } static void chnl_net_uninit(struct net_device *dev) { struct chnl_net *priv; ASSERT_RTNL(); priv = netdev_priv(dev); list_del_init(&priv->list_field); } static const struct net_device_ops netdev_ops = { .ndo_open = chnl_net_open, .ndo_stop = chnl_net_stop, .ndo_init = chnl_net_init, .ndo_uninit = chnl_net_uninit, .ndo_start_xmit = chnl_net_start_xmit, }; static void chnl_net_destructor(struct net_device *dev) { struct chnl_net *priv = netdev_priv(dev); caif_free_client(&priv->chnl); } static void ipcaif_net_setup(struct net_device *dev) { struct chnl_net *priv; dev->netdev_ops = &netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = chnl_net_destructor; dev->flags |= IFF_NOARP; dev->flags |= IFF_POINTOPOINT; dev->mtu = GPRS_PDP_MTU; dev->tx_queue_len = CAIF_NET_DEFAULT_QUEUE_LEN; priv = netdev_priv(dev); priv->chnl.receive = chnl_recv_cb; priv->chnl.ctrlcmd = chnl_flowctrl_cb; priv->netdev = dev; priv->conn_req.protocol = CAIFPROTO_DATAGRAM; priv->conn_req.link_selector = CAIF_LINK_HIGH_BANDW; priv->conn_req.priority = CAIF_PRIO_LOW; /* Insert illegal value */ priv->conn_req.sockaddr.u.dgm.connection_id = UNDEF_CONNID; priv->flowenabled = false; init_waitqueue_head(&priv->netmgmt_wq); } static int ipcaif_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct chnl_net *priv; u8 loop; priv = netdev_priv(dev); if (nla_put_u32(skb, IFLA_CAIF_IPV4_CONNID, priv->conn_req.sockaddr.u.dgm.connection_id) || nla_put_u32(skb, IFLA_CAIF_IPV6_CONNID, priv->conn_req.sockaddr.u.dgm.connection_id)) goto nla_put_failure; loop = priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP; if (nla_put_u8(skb, IFLA_CAIF_LOOPBACK, loop)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static void caif_netlink_parms(struct nlattr *data[], struct caif_connect_request *conn_req) { if (!data) { pr_warn("no params data found\n"); return; } if (data[IFLA_CAIF_IPV4_CONNID]) conn_req->sockaddr.u.dgm.connection_id = nla_get_u32(data[IFLA_CAIF_IPV4_CONNID]); if (data[IFLA_CAIF_IPV6_CONNID]) conn_req->sockaddr.u.dgm.connection_id = nla_get_u32(data[IFLA_CAIF_IPV6_CONNID]); if (data[IFLA_CAIF_LOOPBACK]) { if (nla_get_u8(data[IFLA_CAIF_LOOPBACK])) conn_req->protocol = CAIFPROTO_DATAGRAM_LOOP; else conn_req->protocol = CAIFPROTO_DATAGRAM; } } static int ipcaif_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { int ret; struct chnl_net *caifdev; ASSERT_RTNL(); caifdev = netdev_priv(dev); caif_netlink_parms(data, &caifdev->conn_req); ret = register_netdevice(dev); if (ret) pr_warn("device rtml registration failed\n"); else list_add(&caifdev->list_field, &chnl_net_list); /* Use ifindex as connection id, and use loopback channel default. */ if (caifdev->conn_req.sockaddr.u.dgm.connection_id == UNDEF_CONNID) { caifdev->conn_req.sockaddr.u.dgm.connection_id = dev->ifindex; caifdev->conn_req.protocol = CAIFPROTO_DATAGRAM_LOOP; } return ret; } static int ipcaif_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct chnl_net *caifdev; ASSERT_RTNL(); caifdev = netdev_priv(dev); caif_netlink_parms(data, &caifdev->conn_req); netdev_state_change(dev); return 0; } static size_t ipcaif_get_size(const struct net_device *dev) { return /* IFLA_CAIF_IPV4_CONNID */ nla_total_size(4) + /* IFLA_CAIF_IPV6_CONNID */ nla_total_size(4) + /* IFLA_CAIF_LOOPBACK */ nla_total_size(2) + 0; } static const struct nla_policy ipcaif_policy[IFLA_CAIF_MAX + 1] = { [IFLA_CAIF_IPV4_CONNID] = { .type = NLA_U32 }, [IFLA_CAIF_IPV6_CONNID] = { .type = NLA_U32 }, [IFLA_CAIF_LOOPBACK] = { .type = NLA_U8 } }; static struct rtnl_link_ops ipcaif_link_ops __read_mostly = { .kind = "caif", .priv_size = sizeof(struct chnl_net), .setup = ipcaif_net_setup, .maxtype = IFLA_CAIF_MAX, .policy = ipcaif_policy, .newlink = ipcaif_newlink, .changelink = ipcaif_changelink, .get_size = ipcaif_get_size, .fill_info = ipcaif_fill_info, }; static int __init chnl_init_module(void) { return rtnl_link_register(&ipcaif_link_ops); } static void __exit chnl_exit_module(void) { struct chnl_net *dev = NULL; struct list_head *list_node; struct list_head *_tmp; rtnl_link_unregister(&ipcaif_link_ops); rtnl_lock(); list_for_each_safe(list_node, _tmp, &chnl_net_list) { dev = list_entry(list_node, struct chnl_net, list_field); list_del_init(list_node); delete_device(dev); } rtnl_unlock(); } module_init(chnl_init_module); module_exit(chnl_exit_module);
51 53 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 // SPDX-License-Identifier: GPL-2.0-only /* * kernel/freezer.c - Function to freeze a process * * Originally from kernel/power/process.c */ #include <linux/interrupt.h> #include <linux/suspend.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/freezer.h> #include <linux/kthread.h> /* total number of freezing conditions in effect */ DEFINE_STATIC_KEY_FALSE(freezer_active); EXPORT_SYMBOL(freezer_active); /* * indicate whether PM freezing is in effect, protected by * system_transition_mutex */ bool pm_freezing; bool pm_nosig_freezing; /* protects freezing and frozen transitions */ static DEFINE_SPINLOCK(freezer_lock); /** * freezing_slow_path - slow path for testing whether a task needs to be frozen * @p: task to be tested * * This function is called by freezing() if freezer_active isn't zero * and tests whether @p needs to enter and stay in frozen state. Can be * called under any context. The freezers are responsible for ensuring the * target tasks see the updated state. */ bool freezing_slow_path(struct task_struct *p) { if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; if (test_tsk_thread_flag(p, TIF_MEMDIE)) return false; if (pm_nosig_freezing || cgroup_freezing(p)) return true; if (pm_freezing && !(p->flags & PF_KTHREAD)) return true; return false; } EXPORT_SYMBOL(freezing_slow_path); bool frozen(struct task_struct *p) { return READ_ONCE(p->__state) & TASK_FROZEN; } /* Refrigerator is place where frozen processes are stored :-). */ bool __refrigerator(bool check_kthr_stop) { unsigned int state = get_current_state(); bool was_frozen = false; pr_debug("%s entered refrigerator\n", current->comm); WARN_ON_ONCE(state && !(state & TASK_NORMAL)); for (;;) { bool freeze; raw_spin_lock_irq(&current->pi_lock); WRITE_ONCE(current->__state, TASK_FROZEN); /* unstale saved_state so that __thaw_task() will wake us up */ current->saved_state = TASK_RUNNING; raw_spin_unlock_irq(&current->pi_lock); spin_lock_irq(&freezer_lock); freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop()); spin_unlock_irq(&freezer_lock); if (!freeze) break; was_frozen = true; schedule(); } __set_current_state(TASK_RUNNING); pr_debug("%s left refrigerator\n", current->comm); return was_frozen; } EXPORT_SYMBOL(__refrigerator); static void fake_signal_wake_up(struct task_struct *p) { unsigned long flags; if (lock_task_sighand(p, &flags)) { signal_wake_up(p, 0); unlock_task_sighand(p, &flags); } } static int __set_task_frozen(struct task_struct *p, void *arg) { unsigned int state = READ_ONCE(p->__state); /* * Allow freezing the sched_delayed tasks; they will not execute until * ttwu() fixes them up, so it is safe to swap their state now, instead * of waiting for them to get fully dequeued. */ if (task_is_runnable(p)) return 0; if (p != current && task_curr(p)) return 0; if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED))) return 0; /* * Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they * can suffer spurious wakeups. */ if (state & TASK_FREEZABLE) WARN_ON_ONCE(!(state & TASK_NORMAL)); #ifdef CONFIG_LOCKDEP /* * It's dangerous to freeze with locks held; there be dragons there. */ if (!(state & __TASK_FREEZABLE_UNSAFE)) WARN_ON_ONCE(debug_locks && p->lockdep_depth); #endif p->saved_state = p->__state; WRITE_ONCE(p->__state, TASK_FROZEN); return TASK_FROZEN; } static bool __freeze_task(struct task_struct *p) { /* TASK_FREEZABLE|TASK_STOPPED|TASK_TRACED -> TASK_FROZEN */ return task_call_func(p, __set_task_frozen, NULL); } /** * freeze_task - send a freeze request to given task * @p: task to send the request to * * If @p is freezing, the freeze request is sent either by sending a fake * signal (if it's not a kernel thread) or waking it up (if it's a kernel * thread). * * RETURNS: * %false, if @p is not freezing or already frozen; %true, otherwise */ bool freeze_task(struct task_struct *p) { unsigned long flags; spin_lock_irqsave(&freezer_lock, flags); if (!freezing(p) || frozen(p) || __freeze_task(p)) { spin_unlock_irqrestore(&freezer_lock, flags); return false; } if (!(p->flags & PF_KTHREAD)) fake_signal_wake_up(p); else wake_up_state(p, TASK_NORMAL); spin_unlock_irqrestore(&freezer_lock, flags); return true; } /* * Restore the saved_state before the task entered freezer. For typical task * in the __refrigerator(), saved_state == TASK_RUNNING so nothing happens * here. For tasks which were TASK_NORMAL | TASK_FREEZABLE, their initial state * is restored unless they got an expected wakeup (see ttwu_state_match()). * Returns 1 if the task state was restored. */ static int __restore_freezer_state(struct task_struct *p, void *arg) { unsigned int state = p->saved_state; if (state != TASK_RUNNING) { WRITE_ONCE(p->__state, state); p->saved_state = TASK_RUNNING; return 1; } return 0; } void __thaw_task(struct task_struct *p) { unsigned long flags; spin_lock_irqsave(&freezer_lock, flags); if (WARN_ON_ONCE(freezing(p))) goto unlock; if (!frozen(p) || task_call_func(p, __restore_freezer_state, NULL)) goto unlock; wake_up_state(p, TASK_FROZEN); unlock: spin_unlock_irqrestore(&freezer_lock, flags); } /** * set_freezable - make %current freezable * * Mark %current freezable and enter refrigerator if necessary. */ bool set_freezable(void) { might_sleep(); /* * Modify flags while holding freezer_lock. This ensures the * freezer notices that we aren't frozen yet or the freezing * condition is visible to try_to_freeze() below. */ spin_lock_irq(&freezer_lock); current->flags &= ~PF_NOFREEZE; spin_unlock_irq(&freezer_lock); return try_to_freeze(); } EXPORT_SYMBOL(set_freezable);
3 2 1 1 1 1 1 1 1 1 1 3 3 3 3 3 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 // SPDX-License-Identifier: GPL-2.0-only #include <linux/net_tstamp.h> #include "netlink.h" #include "common.h" #include "bitset.h" struct tsinfo_req_info { struct ethnl_req_info base; }; struct tsinfo_reply_data { struct ethnl_reply_data base; struct kernel_ethtool_ts_info ts_info; struct ethtool_ts_stats stats; }; #define TSINFO_REPDATA(__reply_base) \ container_of(__reply_base, struct tsinfo_reply_data, base) #define ETHTOOL_TS_STAT_CNT \ (__ETHTOOL_A_TS_STAT_CNT - (ETHTOOL_A_TS_STAT_UNSPEC + 1)) const struct nla_policy ethnl_tsinfo_get_policy[] = { [ETHTOOL_A_TSINFO_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), }; static int tsinfo_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; if (req_base->flags & ETHTOOL_FLAG_STATS) { ethtool_stats_init((u64 *)&data->stats, sizeof(data->stats) / sizeof(u64)); if (dev->ethtool_ops->get_ts_stats) dev->ethtool_ops->get_ts_stats(dev, &data->stats); } ret = __ethtool_get_ts_info(dev, &data->ts_info); ethnl_ops_complete(dev); return ret; } static int tsinfo_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct kernel_ethtool_ts_info *ts_info = &data->ts_info; int len = 0; int ret; BUILD_BUG_ON(__SOF_TIMESTAMPING_CNT > 32); BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32); BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32); if (ts_info->so_timestamping) { ret = ethnl_bitset32_size(&ts_info->so_timestamping, NULL, __SOF_TIMESTAMPING_CNT, sof_timestamping_names, compact); if (ret < 0) return ret; len += ret; /* _TSINFO_TIMESTAMPING */ } if (ts_info->tx_types) { ret = ethnl_bitset32_size(&ts_info->tx_types, NULL, __HWTSTAMP_TX_CNT, ts_tx_type_names, compact); if (ret < 0) return ret; len += ret; /* _TSINFO_TX_TYPES */ } if (ts_info->rx_filters) { ret = ethnl_bitset32_size(&ts_info->rx_filters, NULL, __HWTSTAMP_FILTER_CNT, ts_rx_filter_names, compact); if (ret < 0) return ret; len += ret; /* _TSINFO_RX_FILTERS */ } if (ts_info->phc_index >= 0) len += nla_total_size(sizeof(u32)); /* _TSINFO_PHC_INDEX */ if (req_base->flags & ETHTOOL_FLAG_STATS) len += nla_total_size(0) + /* _TSINFO_STATS */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_TS_STAT_CNT; return len; } static int tsinfo_put_stat(struct sk_buff *skb, u64 val, u16 attrtype) { if (val == ETHTOOL_STAT_NOT_SET) return 0; if (nla_put_uint(skb, attrtype, val)) return -EMSGSIZE; return 0; } static int tsinfo_put_stats(struct sk_buff *skb, const struct ethtool_ts_stats *stats) { struct nlattr *nest; nest = nla_nest_start(skb, ETHTOOL_A_TSINFO_STATS); if (!nest) return -EMSGSIZE; if (tsinfo_put_stat(skb, stats->tx_stats.pkts, ETHTOOL_A_TS_STAT_TX_PKTS) || tsinfo_put_stat(skb, stats->tx_stats.lost, ETHTOOL_A_TS_STAT_TX_LOST) || tsinfo_put_stat(skb, stats->tx_stats.err, ETHTOOL_A_TS_STAT_TX_ERR)) goto err_cancel; nla_nest_end(skb, nest); return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int tsinfo_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct kernel_ethtool_ts_info *ts_info = &data->ts_info; int ret; if (ts_info->so_timestamping) { ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_TIMESTAMPING, &ts_info->so_timestamping, NULL, __SOF_TIMESTAMPING_CNT, sof_timestamping_names, compact); if (ret < 0) return ret; } if (ts_info->tx_types) { ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_TX_TYPES, &ts_info->tx_types, NULL, __HWTSTAMP_TX_CNT, ts_tx_type_names, compact); if (ret < 0) return ret; } if (ts_info->rx_filters) { ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_RX_FILTERS, &ts_info->rx_filters, NULL, __HWTSTAMP_FILTER_CNT, ts_rx_filter_names, compact); if (ret < 0) return ret; } if (ts_info->phc_index >= 0 && nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX, ts_info->phc_index)) return -EMSGSIZE; if (req_base->flags & ETHTOOL_FLAG_STATS && tsinfo_put_stats(skb, &data->stats)) return -EMSGSIZE; return 0; } const struct ethnl_request_ops ethnl_tsinfo_request_ops = { .request_cmd = ETHTOOL_MSG_TSINFO_GET, .reply_cmd = ETHTOOL_MSG_TSINFO_GET_REPLY, .hdr_attr = ETHTOOL_A_TSINFO_HEADER, .req_info_size = sizeof(struct tsinfo_req_info), .reply_data_size = sizeof(struct tsinfo_reply_data), .prepare_data = tsinfo_prepare_data, .reply_size = tsinfo_reply_size, .fill_reply = tsinfo_fill_reply, };
3 2 1 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 13 2 2 7 8 8 2 2 3 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 // SPDX-License-Identifier: GPL-2.0 /* * gendisk handling * * Portions Copyright (C) 2020 Christoph Hellwig */ #include <linux/module.h> #include <linux/ctype.h> #include <linux/fs.h> #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/major.h> #include <linux/mutex.h> #include <linux/idr.h> #include <linux/log2.h> #include <linux/pm_runtime.h> #include <linux/badblocks.h> #include <linux/part_stat.h> #include <linux/blktrace_api.h> #include "blk-throttle.h" #include "blk.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" #include "blk-cgroup.h" static struct kobject *block_depr; /* * Unique, monotonically increasing sequential number associated with block * devices instances (i.e. incremented each time a device is attached). * Associating uevents with block devices in userspace is difficult and racy: * the uevent netlink socket is lossy, and on slow and overloaded systems has * a very high latency. * Block devices do not have exclusive owners in userspace, any process can set * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0 * can be reused again and again). * A userspace process setting up a block device and watching for its events * cannot thus reliably tell whether an event relates to the device it just set * up or another earlier instance with the same name. * This sequential number allows userspace processes to solve this problem, and * uniquely associate an uevent to the lifetime to a device. */ static atomic64_t diskseq; /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) static DEFINE_IDA(ext_devt_ida); void set_capacity(struct gendisk *disk, sector_t sectors) { bdev_set_nr_sectors(disk->part0, sectors); } EXPORT_SYMBOL(set_capacity); /* * Set disk capacity and notify if the size is not currently zero and will not * be set to zero. Returns true if a uevent was sent, otherwise false. */ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) { sector_t capacity = get_capacity(disk); char *envp[] = { "RESIZE=1", NULL }; set_capacity(disk, size); /* * Only print a message and send a uevent if the gendisk is user visible * and alive. This avoids spamming the log and udev when setting the * initial capacity during probing. */ if (size == capacity || !disk_live(disk) || (disk->flags & GENHD_FL_HIDDEN)) return false; pr_info("%s: detected capacity change from %lld to %lld\n", disk->disk_name, capacity, size); /* * Historically we did not send a uevent for changes to/from an empty * device. */ if (!capacity || !size) return false; kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); return true; } EXPORT_SYMBOL_GPL(set_capacity_and_notify); static void part_stat_read_all(struct block_device *part, struct disk_stats *stat) { int cpu; memset(stat, 0, sizeof(struct disk_stats)); for_each_possible_cpu(cpu) { struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu); int group; for (group = 0; group < NR_STAT_GROUPS; group++) { stat->nsecs[group] += ptr->nsecs[group]; stat->sectors[group] += ptr->sectors[group]; stat->ios[group] += ptr->ios[group]; stat->merges[group] += ptr->merges[group]; } stat->io_ticks += ptr->io_ticks; } } unsigned int part_in_flight(struct block_device *part) { unsigned int inflight = 0; int cpu; for_each_possible_cpu(cpu) { inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) + part_stat_local_read_cpu(part, in_flight[1], cpu); } if ((int)inflight < 0) inflight = 0; return inflight; } static void part_in_flight_rw(struct block_device *part, unsigned int inflight[2]) { int cpu; inflight[0] = 0; inflight[1] = 0; for_each_possible_cpu(cpu) { inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu); inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu); } if ((int)inflight[0] < 0) inflight[0] = 0; if ((int)inflight[1] < 0) inflight[1] = 0; } /* * Can be deleted altogether. Later. * */ #define BLKDEV_MAJOR_HASH_SIZE 255 static struct blk_major_name { struct blk_major_name *next; int major; char name[16]; #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD void (*probe)(dev_t devt); #endif } *major_names[BLKDEV_MAJOR_HASH_SIZE]; static DEFINE_MUTEX(major_names_lock); static DEFINE_SPINLOCK(major_names_spinlock); /* index in the above - for now: assume no multimajor ranges */ static inline int major_to_index(unsigned major) { return major % BLKDEV_MAJOR_HASH_SIZE; } #ifdef CONFIG_PROC_FS void blkdev_show(struct seq_file *seqf, off_t offset) { struct blk_major_name *dp; spin_lock(&major_names_spinlock); for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next) if (dp->major == offset) seq_printf(seqf, "%3d %s\n", dp->major, dp->name); spin_unlock(&major_names_spinlock); } #endif /* CONFIG_PROC_FS */ /** * __register_blkdev - register a new block device * * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If * @major = 0, try to allocate any unused major number. * @name: the name of the new block device as a zero terminated string * @probe: pre-devtmpfs / pre-udev callback used to create disks when their * pre-created device node is accessed. When a probe call uses * add_disk() and it fails the driver must cleanup resources. This * interface may soon be removed. * * The @name must be unique within the system. * * The return value depends on the @major input parameter: * * - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1] * then the function returns zero on success, or a negative error code * - if any unused major number was requested with @major = 0 parameter * then the return value is the allocated major number in range * [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise * * See Documentation/admin-guide/devices.txt for the list of allocated * major numbers. * * Use register_blkdev instead for any new code. */ int __register_blkdev(unsigned int major, const char *name, void (*probe)(dev_t devt)) { struct blk_major_name **n, *p; int index, ret = 0; mutex_lock(&major_names_lock); /* temporary */ if (major == 0) { for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) { if (major_names[index] == NULL) break; } if (index == 0) { printk("%s: failed to get major for %s\n", __func__, name); ret = -EBUSY; goto out; } major = index; ret = major; } if (major >= BLKDEV_MAJOR_MAX) { pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n", __func__, major, BLKDEV_MAJOR_MAX-1, name); ret = -EINVAL; goto out; } p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL); if (p == NULL) { ret = -ENOMEM; goto out; } p->major = major; #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD p->probe = probe; #endif strscpy(p->name, name, sizeof(p->name)); p->next = NULL; index = major_to_index(major); spin_lock(&major_names_spinlock); for (n = &major_names[index]; *n; n = &(*n)->next) { if ((*n)->major == major) break; } if (!*n) *n = p; else ret = -EBUSY; spin_unlock(&major_names_spinlock); if (ret < 0) { printk("register_blkdev: cannot get major %u for %s\n", major, name); kfree(p); } out: mutex_unlock(&major_names_lock); return ret; } EXPORT_SYMBOL(__register_blkdev); void unregister_blkdev(unsigned int major, const char *name) { struct blk_major_name **n; struct blk_major_name *p = NULL; int index = major_to_index(major); mutex_lock(&major_names_lock); spin_lock(&major_names_spinlock); for (n = &major_names[index]; *n; n = &(*n)->next) if ((*n)->major == major) break; if (!*n || strcmp((*n)->name, name)) { WARN_ON(1); } else { p = *n; *n = p->next; } spin_unlock(&major_names_spinlock); mutex_unlock(&major_names_lock); kfree(p); } EXPORT_SYMBOL(unregister_blkdev); int blk_alloc_ext_minor(void) { int idx; idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT - 1, GFP_KERNEL); if (idx == -ENOSPC) return -EBUSY; return idx; } void blk_free_ext_minor(unsigned int minor) { ida_free(&ext_devt_ida, minor); } void disk_uevent(struct gendisk *disk, enum kobject_action action) { struct block_device *part; unsigned long idx; rcu_read_lock(); xa_for_each(&disk->part_tbl, idx, part) { if (bdev_is_partition(part) && !bdev_nr_sectors(part)) continue; if (!kobject_get_unless_zero(&part->bd_device.kobj)) continue; rcu_read_unlock(); kobject_uevent(bdev_kobj(part), action); put_device(&part->bd_device); rcu_read_lock(); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(disk_uevent); int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) { struct file *file; int ret = 0; if (!disk_has_partscan(disk)) return -EINVAL; if (disk->open_partitions) return -EBUSY; /* * If the device is opened exclusively by current thread already, it's * safe to scan partitons, otherwise, use bd_prepare_to_claim() to * synchronize with other exclusive openers and other partition * scanners. */ if (!(mode & BLK_OPEN_EXCL)) { ret = bd_prepare_to_claim(disk->part0, disk_scan_partitions, NULL); if (ret) return ret; } set_bit(GD_NEED_PART_SCAN, &disk->state); file = bdev_file_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL, NULL); if (IS_ERR(file)) ret = PTR_ERR(file); else fput(file); /* * If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set, * and this will cause that re-assemble partitioned raid device will * creat partition for underlying disk. */ clear_bit(GD_NEED_PART_SCAN, &disk->state); if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(disk->part0, disk_scan_partitions); return ret; } /** * add_disk_fwnode - add disk information to kernel list with fwnode * @parent: parent device for the disk * @disk: per-device partitioning information * @groups: Additional per-device sysfs groups * @fwnode: attached disk fwnode * * This function registers the partitioning information in @disk * with the kernel. Also attach a fwnode to the disk device. */ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, const struct attribute_group **groups, struct fwnode_handle *fwnode) { struct device *ddev = disk_to_dev(disk); int ret; /* Only makes sense for bio-based to set ->poll_bio */ if (queue_is_mq(disk->queue) && disk->fops->poll_bio) return -EINVAL; /* * The disk queue should now be all set with enough information about * the device for the elevator code to pick an adequate default * elevator if one is needed, that is, for devices requesting queue * registration. */ elevator_init_mq(disk->queue); /* Mark bdev as having a submit_bio, if needed */ if (disk->fops->submit_bio) bdev_set_flag(disk->part0, BD_HAS_SUBMIT_BIO); /* * If the driver provides an explicit major number it also must provide * the number of minors numbers supported, and those will be used to * setup the gendisk. * Otherwise just allocate the device numbers for both the whole device * and all partitions from the extended dev_t space. */ ret = -EINVAL; if (disk->major) { if (WARN_ON(!disk->minors)) goto out_exit_elevator; if (disk->minors > DISK_MAX_PARTS) { pr_err("block: can't allocate more than %d partitions\n", DISK_MAX_PARTS); disk->minors = DISK_MAX_PARTS; } if (disk->first_minor > MINORMASK || disk->minors > MINORMASK + 1 || disk->first_minor + disk->minors > MINORMASK + 1) goto out_exit_elevator; } else { if (WARN_ON(disk->minors)) goto out_exit_elevator; ret = blk_alloc_ext_minor(); if (ret < 0) goto out_exit_elevator; disk->major = BLOCK_EXT_MAJOR; disk->first_minor = ret; } /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); ddev->parent = parent; ddev->groups = groups; dev_set_name(ddev, "%s", disk->disk_name); if (fwnode) device_set_node(ddev, fwnode); if (!(disk->flags & GENHD_FL_HIDDEN)) ddev->devt = MKDEV(disk->major, disk->first_minor); ret = device_add(ddev); if (ret) goto out_free_ext_minor; ret = disk_alloc_events(disk); if (ret) goto out_device_del; ret = sysfs_create_link(block_depr, &ddev->kobj, kobject_name(&ddev->kobj)); if (ret) goto out_device_del; /* * avoid probable deadlock caused by allocating memory with * GFP_KERNEL in runtime_resume callback of its all ancestor * devices */ pm_runtime_set_memalloc_noio(ddev, true); disk->part0->bd_holder_dir = kobject_create_and_add("holders", &ddev->kobj); if (!disk->part0->bd_holder_dir) { ret = -ENOMEM; goto out_del_block_link; } disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); if (!disk->slave_dir) { ret = -ENOMEM; goto out_put_holder_dir; } ret = blk_register_queue(disk); if (ret) goto out_put_slave_dir; if (!(disk->flags & GENHD_FL_HIDDEN)) { ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); if (ret) goto out_unregister_queue; bdi_set_owner(disk->bdi, ddev); ret = sysfs_create_link(&ddev->kobj, &disk->bdi->dev->kobj, "bdi"); if (ret) goto out_unregister_bdi; /* Make sure the first partition scan will be proceed */ if (get_capacity(disk) && disk_has_partscan(disk)) set_bit(GD_NEED_PART_SCAN, &disk->state); bdev_add(disk->part0, ddev->devt); if (get_capacity(disk)) disk_scan_partitions(disk, BLK_OPEN_READ); /* * Announce the disk and partitions after all partitions are * created. (for hidden disks uevents remain suppressed forever) */ dev_set_uevent_suppress(ddev, 0); disk_uevent(disk, KOBJ_ADD); } else { /* * Even if the block_device for a hidden gendisk is not * registered, it needs to have a valid bd_dev so that the * freeing of the dynamic major works. */ disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor); } blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); disk_add_events(disk); set_bit(GD_ADDED, &disk->state); return 0; out_unregister_bdi: if (!(disk->flags & GENHD_FL_HIDDEN)) bdi_unregister(disk->bdi); out_unregister_queue: blk_unregister_queue(disk); rq_qos_exit(disk->queue); out_put_slave_dir: kobject_put(disk->slave_dir); disk->slave_dir = NULL; out_put_holder_dir: kobject_put(disk->part0->bd_holder_dir); out_del_block_link: sysfs_remove_link(block_depr, dev_name(ddev)); pm_runtime_set_memalloc_noio(ddev, false); out_device_del: device_del(ddev); out_free_ext_minor: if (disk->major == BLOCK_EXT_MAJOR) blk_free_ext_minor(disk->first_minor); out_exit_elevator: if (disk->queue->elevator) elevator_exit(disk->queue); return ret; } EXPORT_SYMBOL_GPL(add_disk_fwnode); /** * device_add_disk - add disk information to kernel list * @parent: parent device for the disk * @disk: per-device partitioning information * @groups: Additional per-device sysfs groups * * This function registers the partitioning information in @disk * with the kernel. */ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { return add_disk_fwnode(parent, disk, groups, NULL); } EXPORT_SYMBOL(device_add_disk); static void blk_report_disk_dead(struct gendisk *disk, bool surprise) { struct block_device *bdev; unsigned long idx; /* * On surprise disk removal, bdev_mark_dead() may call into file * systems below. Make it clear that we're expecting to not hold * disk->open_mutex. */ lockdep_assert_not_held(&disk->open_mutex); rcu_read_lock(); xa_for_each(&disk->part_tbl, idx, bdev) { if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) continue; rcu_read_unlock(); bdev_mark_dead(bdev, surprise); put_device(&bdev->bd_device); rcu_read_lock(); } rcu_read_unlock(); } static bool __blk_mark_disk_dead(struct gendisk *disk) { /* * Fail any new I/O. */ if (test_and_set_bit(GD_DEAD, &disk->state)) return false; if (test_bit(GD_OWNS_QUEUE, &disk->state)) blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue); /* * Stop buffered writers from dirtying pages that can't be written out. */ set_capacity(disk, 0); /* * Prevent new I/O from crossing bio_queue_enter(). */ return blk_queue_start_drain(disk->queue); } /** * blk_mark_disk_dead - mark a disk as dead * @disk: disk to mark as dead * * Mark as disk as dead (e.g. surprise removed) and don't accept any new I/O * to this disk. */ void blk_mark_disk_dead(struct gendisk *disk) { __blk_mark_disk_dead(disk); blk_report_disk_dead(disk, true); } EXPORT_SYMBOL_GPL(blk_mark_disk_dead); /** * del_gendisk - remove the gendisk * @disk: the struct gendisk to remove * * Removes the gendisk and all its associated resources. This deletes the * partitions associated with the gendisk, and unregisters the associated * request_queue. * * This is the counter to the respective __device_add_disk() call. * * The final removal of the struct gendisk happens when its refcount reaches 0 * with put_disk(), which should be called after del_gendisk(), if * __device_add_disk() was used. * * Drivers exist which depend on the release of the gendisk to be synchronous, * it should not be deferred. * * Context: can sleep */ void del_gendisk(struct gendisk *disk) { struct request_queue *q = disk->queue; struct block_device *part; unsigned long idx; bool start_drain, queue_dying; might_sleep(); if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN))) return; disk_del_events(disk); /* * Prevent new openers by unlinked the bdev inode. */ mutex_lock(&disk->open_mutex); xa_for_each(&disk->part_tbl, idx, part) bdev_unhash(part); mutex_unlock(&disk->open_mutex); /* * Tell the file system to write back all dirty data and shut down if * it hasn't been notified earlier. */ if (!test_bit(GD_DEAD, &disk->state)) blk_report_disk_dead(disk, false); /* * Drop all partitions now that the disk is marked dead. */ mutex_lock(&disk->open_mutex); start_drain = __blk_mark_disk_dead(disk); queue_dying = blk_queue_dying(q); if (start_drain) blk_freeze_acquire_lock(q, true, queue_dying); xa_for_each_start(&disk->part_tbl, idx, part, 1) drop_partition(part); mutex_unlock(&disk->open_mutex); if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); /* * Unregister bdi before releasing device numbers (as they can * get reused and we'd get clashes in sysfs). */ bdi_unregister(disk->bdi); } blk_unregister_queue(disk); kobject_put(disk->part0->bd_holder_dir); kobject_put(disk->slave_dir); disk->slave_dir = NULL; part_stat_set_all(disk->part0, 0); disk->part0->bd_stamp = 0; sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); blk_mq_freeze_queue_wait(q); blk_throtl_cancel_bios(disk); blk_sync_queue(q); blk_flush_integrity(); if (queue_is_mq(q)) blk_mq_cancel_work_sync(q); blk_mq_quiesce_queue(q); if (q->elevator) { mutex_lock(&q->sysfs_lock); elevator_exit(q); mutex_unlock(&q->sysfs_lock); } rq_qos_exit(q); blk_mq_unquiesce_queue(q); /* * If the disk does not own the queue, allow using passthrough requests * again. Else leave the queue frozen to fail all I/O. */ if (!test_bit(GD_OWNS_QUEUE, &disk->state)) __blk_mq_unfreeze_queue(q, true); else if (queue_is_mq(q)) blk_mq_exit_queue(q); if (start_drain) blk_unfreeze_release_lock(q, true, queue_dying); } EXPORT_SYMBOL(del_gendisk); /** * invalidate_disk - invalidate the disk * @disk: the struct gendisk to invalidate * * A helper to invalidates the disk. It will clean the disk's associated * buffer/page caches and reset its internal states so that the disk * can be reused by the drivers. * * Context: can sleep */ void invalidate_disk(struct gendisk *disk) { struct block_device *bdev = disk->part0; invalidate_bdev(bdev); bdev->bd_mapping->wb_err = 0; set_capacity(disk, 0); } EXPORT_SYMBOL(invalidate_disk); /* sysfs access to bad-blocks list. */ static ssize_t disk_badblocks_show(struct device *dev, struct device_attribute *attr, char *page) { struct gendisk *disk = dev_to_disk(dev); if (!disk->bb) return sysfs_emit(page, "\n"); return badblocks_show(disk->bb, page, 0); } static ssize_t disk_badblocks_store(struct device *dev, struct device_attribute *attr, const char *page, size_t len) { struct gendisk *disk = dev_to_disk(dev); if (!disk->bb) return -ENXIO; return badblocks_store(disk->bb, page, len, 0); } #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD void blk_request_module(dev_t devt) { unsigned int major = MAJOR(devt); struct blk_major_name **n; mutex_lock(&major_names_lock); for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) { if ((*n)->major == major && (*n)->probe) { (*n)->probe(devt); mutex_unlock(&major_names_lock); return; } } mutex_unlock(&major_names_lock); if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) /* Make old-style 2.4 aliases work */ request_module("block-major-%d", MAJOR(devt)); } #endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */ #ifdef CONFIG_PROC_FS /* iterator */ static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos) { loff_t skip = *pos; struct class_dev_iter *iter; struct device *dev; iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return ERR_PTR(-ENOMEM); seqf->private = iter; class_dev_iter_init(iter, &block_class, NULL, &disk_type); do { dev = class_dev_iter_next(iter); if (!dev) return NULL; } while (skip--); return dev_to_disk(dev); } static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos) { struct device *dev; (*pos)++; dev = class_dev_iter_next(seqf->private); if (dev) return dev_to_disk(dev); return NULL; } static void disk_seqf_stop(struct seq_file *seqf, void *v) { struct class_dev_iter *iter = seqf->private; /* stop is called even after start failed :-( */ if (iter) { class_dev_iter_exit(iter); kfree(iter); seqf->private = NULL; } } static void *show_partition_start(struct seq_file *seqf, loff_t *pos) { void *p; p = disk_seqf_start(seqf, pos); if (!IS_ERR_OR_NULL(p) && !*pos) seq_puts(seqf, "major minor #blocks name\n\n"); return p; } static int show_partition(struct seq_file *seqf, void *v) { struct gendisk *sgp = v; struct block_device *part; unsigned long idx; if (!get_capacity(sgp) || (sgp->flags & GENHD_FL_HIDDEN)) return 0; rcu_read_lock(); xa_for_each(&sgp->part_tbl, idx, part) { if (!bdev_nr_sectors(part)) continue; seq_printf(seqf, "%4d %7d %10llu %pg\n", MAJOR(part->bd_dev), MINOR(part->bd_dev), bdev_nr_sectors(part) >> 1, part); } rcu_read_unlock(); return 0; } static const struct seq_operations partitions_op = { .start = show_partition_start, .next = disk_seqf_next, .stop = disk_seqf_stop, .show = show_partition }; #endif static int __init genhd_device_init(void) { int error; error = class_register(&block_class); if (unlikely(error)) return error; blk_dev_init(); register_blkdev(BLOCK_EXT_MAJOR, "blkext"); /* create top-level block dir */ block_depr = kobject_create_and_add("block", NULL); return 0; } subsys_initcall(genhd_device_init); static ssize_t disk_range_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", disk->minors); } static ssize_t disk_ext_range_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS); } static ssize_t disk_removable_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); } static ssize_t disk_hidden_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_HIDDEN ? 1 : 0)); } static ssize_t disk_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", get_disk_ro(disk) ? 1 : 0); } ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev))); } ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); struct disk_stats stat; unsigned int inflight; inflight = part_in_flight(bdev); if (inflight) { part_stat_lock(); update_io_ticks(bdev, jiffies, true); part_stat_unlock(); } part_stat_read_all(bdev, &stat); return sysfs_emit(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " "%8u %8u %8u " "%8lu %8lu %8llu %8u " "%8lu %8u" "\n", stat.ios[STAT_READ], stat.merges[STAT_READ], (unsigned long long)stat.sectors[STAT_READ], (unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC), stat.ios[STAT_WRITE], stat.merges[STAT_WRITE], (unsigned long long)stat.sectors[STAT_WRITE], (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC), inflight, jiffies_to_msecs(stat.io_ticks), (unsigned int)div_u64(stat.nsecs[STAT_READ] + stat.nsecs[STAT_WRITE] + stat.nsecs[STAT_DISCARD] + stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC), stat.ios[STAT_DISCARD], stat.merges[STAT_DISCARD], (unsigned long long)stat.sectors[STAT_DISCARD], (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC), stat.ios[STAT_FLUSH], (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); } ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); struct request_queue *q = bdev_get_queue(bdev); unsigned int inflight[2]; if (queue_is_mq(q)) blk_mq_in_flight_rw(q, bdev, inflight); else part_in_flight_rw(bdev, inflight); return sysfs_emit(buf, "%8u %8u\n", inflight[0], inflight[1]); } static ssize_t disk_capability_show(struct device *dev, struct device_attribute *attr, char *buf) { dev_warn_once(dev, "the capability attribute has been deprecated.\n"); return sysfs_emit(buf, "0\n"); } static ssize_t disk_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t disk_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t diskseq_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%llu\n", disk->diskseq); } static ssize_t partscan_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", disk_has_partscan(dev_to_disk(dev))); } static DEVICE_ATTR(range, 0444, disk_range_show, NULL); static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL); static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL); static DEVICE_ATTR(size, 0444, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL); static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL); static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); static DEVICE_ATTR(partscan, 0444, partscan_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", bdev_test_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL)); } ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int i; if (count > 0 && sscanf(buf, "%d", &i) > 0) { if (i) bdev_set_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL); else bdev_clear_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL); } return count; } static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); #endif /* CONFIG_FAIL_MAKE_REQUEST */ #ifdef CONFIG_FAIL_IO_TIMEOUT static struct device_attribute dev_attr_fail_timeout = __ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store); #endif static struct attribute *disk_attrs[] = { &dev_attr_range.attr, &dev_attr_ext_range.attr, &dev_attr_removable.attr, &dev_attr_hidden.attr, &dev_attr_ro.attr, &dev_attr_size.attr, &dev_attr_alignment_offset.attr, &dev_attr_discard_alignment.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, &dev_attr_badblocks.attr, &dev_attr_events.attr, &dev_attr_events_async.attr, &dev_attr_events_poll_msecs.attr, &dev_attr_diskseq.attr, &dev_attr_partscan.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif #ifdef CONFIG_FAIL_IO_TIMEOUT &dev_attr_fail_timeout.attr, #endif NULL }; static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, typeof(*dev), kobj); struct gendisk *disk = dev_to_disk(dev); if (a == &dev_attr_badblocks.attr && !disk->bb) return 0; return a->mode; } static struct attribute_group disk_attr_group = { .attrs = disk_attrs, .is_visible = disk_visible, }; static const struct attribute_group *disk_attr_groups[] = { &disk_attr_group, #ifdef CONFIG_BLK_DEV_IO_TRACE &blk_trace_attr_group, #endif #ifdef CONFIG_BLK_DEV_INTEGRITY &blk_integrity_attr_group, #endif NULL }; /** * disk_release - releases all allocated resources of the gendisk * @dev: the device representing this disk * * This function releases all allocated resources of the gendisk. * * Drivers which used __device_add_disk() have a gendisk with a request_queue * assigned. Since the request_queue sits on top of the gendisk for these * drivers we also call blk_put_queue() for them, and we expect the * request_queue refcount to reach 0 at this point, and so the request_queue * will also be freed prior to the disk. * * Context: can sleep */ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); might_sleep(); WARN_ON_ONCE(disk_live(disk)); blk_trace_remove(disk->queue); /* * To undo the all initialization from blk_mq_init_allocated_queue in * case of a probe failure where add_disk is never called we have to * call blk_mq_exit_queue here. We can't do this for the more common * teardown case (yet) as the tagset can be gone by the time the disk * is released once it was added. */ if (queue_is_mq(disk->queue) && test_bit(GD_OWNS_QUEUE, &disk->state) && !test_bit(GD_ADDED, &disk->state)) blk_mq_exit_queue(disk->queue); blkcg_exit_disk(disk); bioset_exit(&disk->bio_split); disk_release_events(disk); kfree(disk->random); disk_free_zone_resources(disk); xa_destroy(&disk->part_tbl); disk->queue->disk = NULL; blk_put_queue(disk->queue); if (test_bit(GD_ADDED, &disk->state) && disk->fops->free_disk) disk->fops->free_disk(disk); bdev_drop(disk->part0); /* frees the disk */ } static int block_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct gendisk *disk = dev_to_disk(dev); return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq); } const struct class block_class = { .name = "block", .dev_uevent = block_uevent, }; static char *block_devnode(const struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid) { struct gendisk *disk = dev_to_disk(dev); if (disk->fops->devnode) return disk->fops->devnode(disk, mode); return NULL; } const struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, .devnode = block_devnode, }; #ifdef CONFIG_PROC_FS /* * aggregate disk stat collector. Uses the same stats that the sysfs * entries do, above, but makes them available through one seq_file. * * The output looks suspiciously like /proc/partitions with a bunch of * extra fields. */ static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; struct block_device *hd; unsigned int inflight; struct disk_stats stat; unsigned long idx; /* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) seq_puts(seqf, "major minor name" " rio rmerge rsect ruse wio wmerge " "wsect wuse running use aveq" "\n\n"); */ rcu_read_lock(); xa_for_each(&gp->part_tbl, idx, hd) { if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) continue; inflight = part_in_flight(hd); if (inflight) { part_stat_lock(); update_io_ticks(hd, jiffies, true); part_stat_unlock(); } part_stat_read_all(hd, &stat); seq_put_decimal_ull_width(seqf, "", MAJOR(hd->bd_dev), 4); seq_put_decimal_ull_width(seqf, " ", MINOR(hd->bd_dev), 7); seq_printf(seqf, " %pg", hd); seq_put_decimal_ull(seqf, " ", stat.ios[STAT_READ]); seq_put_decimal_ull(seqf, " ", stat.merges[STAT_READ]); seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_READ]); seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC)); seq_put_decimal_ull(seqf, " ", stat.ios[STAT_WRITE]); seq_put_decimal_ull(seqf, " ", stat.merges[STAT_WRITE]); seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_WRITE]); seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC)); seq_put_decimal_ull(seqf, " ", inflight); seq_put_decimal_ull(seqf, " ", jiffies_to_msecs(stat.io_ticks)); seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ] + stat.nsecs[STAT_WRITE] + stat.nsecs[STAT_DISCARD] + stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); seq_put_decimal_ull(seqf, " ", stat.ios[STAT_DISCARD]); seq_put_decimal_ull(seqf, " ", stat.merges[STAT_DISCARD]); seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_DISCARD]); seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC)); seq_put_decimal_ull(seqf, " ", stat.ios[STAT_FLUSH]); seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); seq_putc(seqf, '\n'); } rcu_read_unlock(); return 0; } static const struct seq_operations diskstats_op = { .start = disk_seqf_start, .next = disk_seqf_next, .stop = disk_seqf_stop, .show = diskstats_show }; static int __init proc_genhd_init(void) { proc_create_seq("diskstats", 0, NULL, &diskstats_op); proc_create_seq("partitions", 0, NULL, &partitions_op); return 0; } module_init(proc_genhd_init); #endif /* CONFIG_PROC_FS */ dev_t part_devt(struct gendisk *disk, u8 partno) { struct block_device *part; dev_t devt = 0; rcu_read_lock(); part = xa_load(&disk->part_tbl, partno); if (part) devt = part->bd_dev; rcu_read_unlock(); return devt; } struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, struct lock_class_key *lkclass) { struct gendisk *disk; disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (!disk) return NULL; if (bioset_init(&disk->bio_split, BIO_POOL_SIZE, 0, 0)) goto out_free_disk; disk->bdi = bdi_alloc(node_id); if (!disk->bdi) goto out_free_bioset; /* bdev_alloc() might need the queue, set before the first call */ disk->queue = q; disk->part0 = bdev_alloc(disk, 0); if (!disk->part0) goto out_free_bdi; disk->node_id = node_id; mutex_init(&disk->open_mutex); xa_init(&disk->part_tbl); if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) goto out_destroy_part_tbl; if (blkcg_init_disk(disk)) goto out_erase_part0; disk_init_zone_resources(disk); rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); inc_diskseq(disk); q->disk = disk; lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&disk->slave_bdevs); #endif return disk; out_erase_part0: xa_erase(&disk->part_tbl, 0); out_destroy_part_tbl: xa_destroy(&disk->part_tbl); disk->part0->bd_disk = NULL; bdev_drop(disk->part0); out_free_bdi: bdi_put(disk->bdi); out_free_bioset: bioset_exit(&disk->bio_split); out_free_disk: kfree(disk); return NULL; } struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, struct lock_class_key *lkclass) { struct queue_limits default_lim = { }; struct request_queue *q; struct gendisk *disk; q = blk_alloc_queue(lim ? lim : &default_lim, node); if (IS_ERR(q)) return ERR_CAST(q); disk = __alloc_disk_node(q, node, lkclass); if (!disk) { blk_put_queue(q); return ERR_PTR(-ENOMEM); } set_bit(GD_OWNS_QUEUE, &disk->state); return disk; } EXPORT_SYMBOL(__blk_alloc_disk); /** * put_disk - decrements the gendisk refcount * @disk: the struct gendisk to decrement the refcount for * * This decrements the refcount for the struct gendisk. When this reaches 0 * we'll have disk_release() called. * * Note: for blk-mq disk put_disk must be called before freeing the tag_set * when handling probe errors (that is before add_disk() is called). * * Context: Any context, but the last reference must not be dropped from * atomic context. */ void put_disk(struct gendisk *disk) { if (disk) put_device(disk_to_dev(disk)); } EXPORT_SYMBOL(put_disk); static void set_disk_ro_uevent(struct gendisk *gd, int ro) { char event[] = "DISK_RO=1"; char *envp[] = { event, NULL }; if (!ro) event[8] = '0'; kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); } /** * set_disk_ro - set a gendisk read-only * @disk: gendisk to operate on * @read_only: %true to set the disk read-only, %false set the disk read/write * * This function is used to indicate whether a given disk device should have its * read-only flag set. set_disk_ro() is typically used by device drivers to * indicate whether the underlying physical device is write-protected. */ void set_disk_ro(struct gendisk *disk, bool read_only) { if (read_only) { if (test_and_set_bit(GD_READ_ONLY, &disk->state)) return; } else { if (!test_and_clear_bit(GD_READ_ONLY, &disk->state)) return; } set_disk_ro_uevent(disk, read_only); } EXPORT_SYMBOL(set_disk_ro); void inc_diskseq(struct gendisk *disk) { disk->diskseq = atomic64_inc_return(&diskseq); }
1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/slab.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/tcp_states.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <net/rose.h> static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose); /* * This routine purges all of the queues of frames. */ void rose_clear_queues(struct sock *sk) { skb_queue_purge(&sk->sk_write_queue); skb_queue_purge(&rose_sk(sk)->ack_queue); } /* * This routine purges the input queue of those frames that have been * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the * SDL diagram. */ void rose_frames_acked(struct sock *sk, unsigned short nr) { struct sk_buff *skb; struct rose_sock *rose = rose_sk(sk); /* * Remove all the ack-ed frames from the ack queue. */ if (rose->va != nr) { while (skb_peek(&rose->ack_queue) != NULL && rose->va != nr) { skb = skb_dequeue(&rose->ack_queue); kfree_skb(skb); rose->va = (rose->va + 1) % ROSE_MODULUS; } } } void rose_requeue_frames(struct sock *sk) { struct sk_buff *skb, *skb_prev = NULL; /* * Requeue all the un-ack-ed frames on the output queue to be picked * up by rose_kick. This arrangement handles the possibility of an * empty output queue. */ while ((skb = skb_dequeue(&rose_sk(sk)->ack_queue)) != NULL) { if (skb_prev == NULL) skb_queue_head(&sk->sk_write_queue, skb); else skb_append(skb_prev, skb, &sk->sk_write_queue); skb_prev = skb; } } /* * Validate that the value of nr is between va and vs. Return true or * false for testing. */ int rose_validate_nr(struct sock *sk, unsigned short nr) { struct rose_sock *rose = rose_sk(sk); unsigned short vc = rose->va; while (vc != rose->vs) { if (nr == vc) return 1; vc = (vc + 1) % ROSE_MODULUS; } return nr == rose->vs; } /* * This routine is called when the packet layer internally generates a * control frame. */ void rose_write_internal(struct sock *sk, int frametype) { struct rose_sock *rose = rose_sk(sk); struct sk_buff *skb; unsigned char *dptr; unsigned char lci1, lci2; int maxfaclen = 0; int len, faclen; int reserve; reserve = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + 1; len = ROSE_MIN_LEN; switch (frametype) { case ROSE_CALL_REQUEST: len += 1 + ROSE_ADDR_LEN + ROSE_ADDR_LEN; maxfaclen = 256; break; case ROSE_CALL_ACCEPTED: case ROSE_CLEAR_REQUEST: case ROSE_RESET_REQUEST: len += 2; break; } skb = alloc_skb(reserve + len + maxfaclen, GFP_ATOMIC); if (!skb) return; /* * Space for AX.25 header and PID. */ skb_reserve(skb, reserve); dptr = skb_put(skb, len); lci1 = (rose->lci >> 8) & 0x0F; lci2 = (rose->lci >> 0) & 0xFF; switch (frametype) { case ROSE_CALL_REQUEST: *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr++ = frametype; *dptr++ = ROSE_CALL_REQ_ADDR_LEN_VAL; memcpy(dptr, &rose->dest_addr, ROSE_ADDR_LEN); dptr += ROSE_ADDR_LEN; memcpy(dptr, &rose->source_addr, ROSE_ADDR_LEN); dptr += ROSE_ADDR_LEN; faclen = rose_create_facilities(dptr, rose); skb_put(skb, faclen); dptr += faclen; break; case ROSE_CALL_ACCEPTED: *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr++ = frametype; *dptr++ = 0x00; /* Address length */ *dptr++ = 0; /* Facilities length */ break; case ROSE_CLEAR_REQUEST: *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr++ = frametype; *dptr++ = rose->cause; *dptr++ = rose->diagnostic; break; case ROSE_RESET_REQUEST: *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr++ = frametype; *dptr++ = ROSE_DTE_ORIGINATED; *dptr++ = 0; break; case ROSE_RR: case ROSE_RNR: *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr = frametype; *dptr++ |= (rose->vr << 5) & 0xE0; break; case ROSE_CLEAR_CONFIRMATION: case ROSE_RESET_CONFIRMATION: *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr++ = frametype; break; default: printk(KERN_ERR "ROSE: rose_write_internal - invalid frametype %02X\n", frametype); kfree_skb(skb); return; } rose_transmit_link(skb, rose->neighbour); } int rose_decode(struct sk_buff *skb, int *ns, int *nr, int *q, int *d, int *m) { unsigned char *frame; frame = skb->data; *ns = *nr = *q = *d = *m = 0; switch (frame[2]) { case ROSE_CALL_REQUEST: case ROSE_CALL_ACCEPTED: case ROSE_CLEAR_REQUEST: case ROSE_CLEAR_CONFIRMATION: case ROSE_RESET_REQUEST: case ROSE_RESET_CONFIRMATION: return frame[2]; default: break; } if ((frame[2] & 0x1F) == ROSE_RR || (frame[2] & 0x1F) == ROSE_RNR) { *nr = (frame[2] >> 5) & 0x07; return frame[2] & 0x1F; } if ((frame[2] & 0x01) == ROSE_DATA) { *q = (frame[0] & ROSE_Q_BIT) == ROSE_Q_BIT; *d = (frame[0] & ROSE_D_BIT) == ROSE_D_BIT; *m = (frame[2] & ROSE_M_BIT) == ROSE_M_BIT; *nr = (frame[2] >> 5) & 0x07; *ns = (frame[2] >> 1) & 0x07; return ROSE_DATA; } return ROSE_ILLEGAL; } static int rose_parse_national(unsigned char *p, struct rose_facilities_struct *facilities, int len) { unsigned char *pt; unsigned char l, lg, n = 0; int fac_national_digis_received = 0; do { switch (*p & 0xC0) { case 0x00: if (len < 2) return -1; p += 2; n += 2; len -= 2; break; case 0x40: if (len < 3) return -1; if (*p == FAC_NATIONAL_RAND) facilities->rand = ((p[1] << 8) & 0xFF00) + ((p[2] << 0) & 0x00FF); p += 3; n += 3; len -= 3; break; case 0x80: if (len < 4) return -1; p += 4; n += 4; len -= 4; break; case 0xC0: if (len < 2) return -1; l = p[1]; if (len < 2 + l) return -1; if (*p == FAC_NATIONAL_DEST_DIGI) { if (!fac_national_digis_received) { if (l < AX25_ADDR_LEN) return -1; memcpy(&facilities->source_digis[0], p + 2, AX25_ADDR_LEN); facilities->source_ndigis = 1; } } else if (*p == FAC_NATIONAL_SRC_DIGI) { if (!fac_national_digis_received) { if (l < AX25_ADDR_LEN) return -1; memcpy(&facilities->dest_digis[0], p + 2, AX25_ADDR_LEN); facilities->dest_ndigis = 1; } } else if (*p == FAC_NATIONAL_FAIL_CALL) { if (l < AX25_ADDR_LEN) return -1; memcpy(&facilities->fail_call, p + 2, AX25_ADDR_LEN); } else if (*p == FAC_NATIONAL_FAIL_ADD) { if (l < 1 + ROSE_ADDR_LEN) return -1; memcpy(&facilities->fail_addr, p + 3, ROSE_ADDR_LEN); } else if (*p == FAC_NATIONAL_DIGIS) { if (l % AX25_ADDR_LEN) return -1; fac_national_digis_received = 1; facilities->source_ndigis = 0; facilities->dest_ndigis = 0; for (pt = p + 2, lg = 0 ; lg < l ; pt += AX25_ADDR_LEN, lg += AX25_ADDR_LEN) { if (pt[6] & AX25_HBIT) { if (facilities->dest_ndigis >= ROSE_MAX_DIGIS) return -1; memcpy(&facilities->dest_digis[facilities->dest_ndigis++], pt, AX25_ADDR_LEN); } else { if (facilities->source_ndigis >= ROSE_MAX_DIGIS) return -1; memcpy(&facilities->source_digis[facilities->source_ndigis++], pt, AX25_ADDR_LEN); } } } p += l + 2; n += l + 2; len -= l + 2; break; } } while (*p != 0x00 && len > 0); return n; } static int rose_parse_ccitt(unsigned char *p, struct rose_facilities_struct *facilities, int len) { unsigned char l, n = 0; char callsign[11]; do { switch (*p & 0xC0) { case 0x00: if (len < 2) return -1; p += 2; n += 2; len -= 2; break; case 0x40: if (len < 3) return -1; p += 3; n += 3; len -= 3; break; case 0x80: if (len < 4) return -1; p += 4; n += 4; len -= 4; break; case 0xC0: if (len < 2) return -1; l = p[1]; /* Prevent overflows*/ if (l < 10 || l > 20) return -1; if (*p == FAC_CCITT_DEST_NSAP) { memcpy(&facilities->source_addr, p + 7, ROSE_ADDR_LEN); memcpy(callsign, p + 12, l - 10); callsign[l - 10] = '\0'; asc2ax(&facilities->source_call, callsign); } if (*p == FAC_CCITT_SRC_NSAP) { memcpy(&facilities->dest_addr, p + 7, ROSE_ADDR_LEN); memcpy(callsign, p + 12, l - 10); callsign[l - 10] = '\0'; asc2ax(&facilities->dest_call, callsign); } p += l + 2; n += l + 2; len -= l + 2; break; } } while (*p != 0x00 && len > 0); return n; } int rose_parse_facilities(unsigned char *p, unsigned packet_len, struct rose_facilities_struct *facilities) { int facilities_len, len; facilities_len = *p++; if (facilities_len == 0 || (unsigned int)facilities_len > packet_len) return 0; while (facilities_len >= 3 && *p == 0x00) { facilities_len--; p++; switch (*p) { case FAC_NATIONAL: /* National */ len = rose_parse_national(p + 1, facilities, facilities_len - 1); break; case FAC_CCITT: /* CCITT */ len = rose_parse_ccitt(p + 1, facilities, facilities_len - 1); break; default: printk(KERN_DEBUG "ROSE: rose_parse_facilities - unknown facilities family %02X\n", *p); len = 1; break; } if (len < 0) return 0; if (WARN_ON(len >= facilities_len)) return 0; facilities_len -= len + 1; p += len + 1; } return facilities_len == 0; } static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose) { unsigned char *p = buffer + 1; char *callsign; char buf[11]; int len, nb; /* National Facilities */ if (rose->rand != 0 || rose->source_ndigis == 1 || rose->dest_ndigis == 1) { *p++ = 0x00; *p++ = FAC_NATIONAL; if (rose->rand != 0) { *p++ = FAC_NATIONAL_RAND; *p++ = (rose->rand >> 8) & 0xFF; *p++ = (rose->rand >> 0) & 0xFF; } /* Sent before older facilities */ if ((rose->source_ndigis > 0) || (rose->dest_ndigis > 0)) { int maxdigi = 0; *p++ = FAC_NATIONAL_DIGIS; *p++ = AX25_ADDR_LEN * (rose->source_ndigis + rose->dest_ndigis); for (nb = 0 ; nb < rose->source_ndigis ; nb++) { if (++maxdigi >= ROSE_MAX_DIGIS) break; memcpy(p, &rose->source_digis[nb], AX25_ADDR_LEN); p[6] |= AX25_HBIT; p += AX25_ADDR_LEN; } for (nb = 0 ; nb < rose->dest_ndigis ; nb++) { if (++maxdigi >= ROSE_MAX_DIGIS) break; memcpy(p, &rose->dest_digis[nb], AX25_ADDR_LEN); p[6] &= ~AX25_HBIT; p += AX25_ADDR_LEN; } } /* For compatibility */ if (rose->source_ndigis > 0) { *p++ = FAC_NATIONAL_SRC_DIGI; *p++ = AX25_ADDR_LEN; memcpy(p, &rose->source_digis[0], AX25_ADDR_LEN); p += AX25_ADDR_LEN; } /* For compatibility */ if (rose->dest_ndigis > 0) { *p++ = FAC_NATIONAL_DEST_DIGI; *p++ = AX25_ADDR_LEN; memcpy(p, &rose->dest_digis[0], AX25_ADDR_LEN); p += AX25_ADDR_LEN; } } *p++ = 0x00; *p++ = FAC_CCITT; *p++ = FAC_CCITT_DEST_NSAP; callsign = ax2asc(buf, &rose->dest_call); *p++ = strlen(callsign) + 10; *p++ = (strlen(callsign) + 9) * 2; /* ??? */ *p++ = 0x47; *p++ = 0x00; *p++ = 0x11; *p++ = ROSE_ADDR_LEN * 2; memcpy(p, &rose->dest_addr, ROSE_ADDR_LEN); p += ROSE_ADDR_LEN; memcpy(p, callsign, strlen(callsign)); p += strlen(callsign); *p++ = FAC_CCITT_SRC_NSAP; callsign = ax2asc(buf, &rose->source_call); *p++ = strlen(callsign) + 10; *p++ = (strlen(callsign) + 9) * 2; /* ??? */ *p++ = 0x47; *p++ = 0x00; *p++ = 0x11; *p++ = ROSE_ADDR_LEN * 2; memcpy(p, &rose->source_addr, ROSE_ADDR_LEN); p += ROSE_ADDR_LEN; memcpy(p, callsign, strlen(callsign)); p += strlen(callsign); len = p - buffer; buffer[0] = len - 1; return len; } void rose_disconnect(struct sock *sk, int reason, int cause, int diagnostic) { struct rose_sock *rose = rose_sk(sk); rose_stop_timer(sk); rose_stop_idletimer(sk); rose_clear_queues(sk); rose->lci = 0; rose->state = ROSE_STATE_0; if (cause != -1) rose->cause = cause; if (diagnostic != -1) rose->diagnostic = diagnostic; sk->sk_state = TCP_CLOSE; sk->sk_err = reason; sk->sk_shutdown |= SEND_SHUTDOWN; if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); } }
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ #include <linux/skbuff.h> #include <linux/delay.h> #include <linux/sched.h> #include <linux/vmalloc.h> #include <rdma/uverbs_ioctl.h> #include "rxe.h" #include "rxe_loc.h" #include "rxe_queue.h" #include "rxe_task.h" static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap, int has_srq) { if (cap->max_send_wr > rxe->attr.max_qp_wr) { rxe_dbg_dev(rxe, "invalid send wr = %u > %d\n", cap->max_send_wr, rxe->attr.max_qp_wr); goto err1; } if (cap->max_send_sge > rxe->attr.max_send_sge) { rxe_dbg_dev(rxe, "invalid send sge = %u > %d\n", cap->max_send_sge, rxe->attr.max_send_sge); goto err1; } if (!has_srq) { if (cap->max_recv_wr > rxe->attr.max_qp_wr) { rxe_dbg_dev(rxe, "invalid recv wr = %u > %d\n", cap->max_recv_wr, rxe->attr.max_qp_wr); goto err1; } if (cap->max_recv_sge > rxe->attr.max_recv_sge) { rxe_dbg_dev(rxe, "invalid recv sge = %u > %d\n", cap->max_recv_sge, rxe->attr.max_recv_sge); goto err1; } } if (cap->max_inline_data > rxe->max_inline_data) { rxe_dbg_dev(rxe, "invalid max inline data = %u > %d\n", cap->max_inline_data, rxe->max_inline_data); goto err1; } return 0; err1: return -EINVAL; } int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init) { struct ib_qp_cap *cap = &init->cap; struct rxe_port *port; int port_num = init->port_num; switch (init->qp_type) { case IB_QPT_GSI: case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_UD: break; default: return -EOPNOTSUPP; } if (!init->recv_cq || !init->send_cq) { rxe_dbg_dev(rxe, "missing cq\n"); goto err1; } if (rxe_qp_chk_cap(rxe, cap, !!init->srq)) goto err1; if (init->qp_type == IB_QPT_GSI) { if (!rdma_is_port_valid(&rxe->ib_dev, port_num)) { rxe_dbg_dev(rxe, "invalid port = %d\n", port_num); goto err1; } port = &rxe->port; if (init->qp_type == IB_QPT_GSI && port->qp_gsi_index) { rxe_dbg_dev(rxe, "GSI QP exists for port %d\n", port_num); goto err1; } } return 0; err1: return -EINVAL; } static int alloc_rd_atomic_resources(struct rxe_qp *qp, unsigned int n) { qp->resp.res_head = 0; qp->resp.res_tail = 0; qp->resp.resources = kcalloc(n, sizeof(struct resp_res), GFP_KERNEL); if (!qp->resp.resources) return -ENOMEM; return 0; } static void free_rd_atomic_resources(struct rxe_qp *qp) { if (qp->resp.resources) { int i; for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { struct resp_res *res = &qp->resp.resources[i]; free_rd_atomic_resource(res); } kfree(qp->resp.resources); qp->resp.resources = NULL; } } void free_rd_atomic_resource(struct resp_res *res) { res->type = 0; } static void cleanup_rd_atomic_resources(struct rxe_qp *qp) { int i; struct resp_res *res; if (qp->resp.resources) { for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { res = &qp->resp.resources[i]; free_rd_atomic_resource(res); } } } static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp, struct ib_qp_init_attr *init) { struct rxe_port *port; u32 qpn; qp->sq_sig_type = init->sq_sig_type; qp->attr.path_mtu = 1; qp->mtu = ib_mtu_enum_to_int(qp->attr.path_mtu); qpn = qp->elem.index; port = &rxe->port; switch (init->qp_type) { case IB_QPT_GSI: qp->ibqp.qp_num = 1; port->qp_gsi_index = qpn; qp->attr.port_num = init->port_num; break; default: qp->ibqp.qp_num = qpn; break; } spin_lock_init(&qp->state_lock); spin_lock_init(&qp->sq.sq_lock); spin_lock_init(&qp->rq.producer_lock); spin_lock_init(&qp->rq.consumer_lock); skb_queue_head_init(&qp->req_pkts); skb_queue_head_init(&qp->resp_pkts); atomic_set(&qp->ssn, 0); atomic_set(&qp->skb_out, 0); } static int rxe_init_sq(struct rxe_qp *qp, struct ib_qp_init_attr *init, struct ib_udata *udata, struct rxe_create_qp_resp __user *uresp) { struct rxe_dev *rxe = to_rdev(qp->ibqp.device); int wqe_size; int err; qp->sq.max_wr = init->cap.max_send_wr; wqe_size = max_t(int, init->cap.max_send_sge * sizeof(struct ib_sge), init->cap.max_inline_data); qp->sq.max_sge = wqe_size / sizeof(struct ib_sge); qp->sq.max_inline = wqe_size; wqe_size += sizeof(struct rxe_send_wqe); qp->sq.queue = rxe_queue_init(rxe, &qp->sq.max_wr, wqe_size, QUEUE_TYPE_FROM_CLIENT); if (!qp->sq.queue) { rxe_err_qp(qp, "Unable to allocate send queue\n"); err = -ENOMEM; goto err_out; } /* prepare info for caller to mmap send queue if user space qp */ err = do_mmap_info(rxe, uresp ? &uresp->sq_mi : NULL, udata, qp->sq.queue->buf, qp->sq.queue->buf_size, &qp->sq.queue->ip); if (err) { rxe_err_qp(qp, "do_mmap_info failed, err = %d\n", err); goto err_free; } /* return actual capabilities to caller which may be larger * than requested */ init->cap.max_send_wr = qp->sq.max_wr; init->cap.max_send_sge = qp->sq.max_sge; init->cap.max_inline_data = qp->sq.max_inline; return 0; err_free: vfree(qp->sq.queue->buf); kfree(qp->sq.queue); qp->sq.queue = NULL; err_out: return err; } static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, struct ib_qp_init_attr *init, struct ib_udata *udata, struct rxe_create_qp_resp __user *uresp) { int err; /* if we don't finish qp create make sure queue is valid */ skb_queue_head_init(&qp->req_pkts); err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk); if (err < 0) return err; qp->sk->sk->sk_user_data = (void *)(uintptr_t)qp->elem.index; /* pick a source UDP port number for this QP based on * the source QPN. this spreads traffic for different QPs * across different NIC RX queues (while using a single * flow for a given QP to maintain packet order). * the port number must be in the Dynamic Ports range * (0xc000 - 0xffff). */ qp->src_port = RXE_ROCE_V2_SPORT + (hash_32(qp_num(qp), 14) & 0x3fff); err = rxe_init_sq(qp, init, udata, uresp); if (err) return err; qp->req.wqe_index = queue_get_producer(qp->sq.queue, QUEUE_TYPE_FROM_CLIENT); qp->req.opcode = -1; qp->comp.opcode = -1; rxe_init_task(&qp->send_task, qp, rxe_sender); qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */ if (init->qp_type == IB_QPT_RC) { timer_setup(&qp->rnr_nak_timer, rnr_nak_timer, 0); timer_setup(&qp->retrans_timer, retransmit_timer, 0); } return 0; } static int rxe_init_rq(struct rxe_qp *qp, struct ib_qp_init_attr *init, struct ib_udata *udata, struct rxe_create_qp_resp __user *uresp) { struct rxe_dev *rxe = to_rdev(qp->ibqp.device); int wqe_size; int err; qp->rq.max_wr = init->cap.max_recv_wr; qp->rq.max_sge = init->cap.max_recv_sge; wqe_size = sizeof(struct rxe_recv_wqe) + qp->rq.max_sge*sizeof(struct ib_sge); qp->rq.queue = rxe_queue_init(rxe, &qp->rq.max_wr, wqe_size, QUEUE_TYPE_FROM_CLIENT); if (!qp->rq.queue) { rxe_err_qp(qp, "Unable to allocate recv queue\n"); err = -ENOMEM; goto err_out; } /* prepare info for caller to mmap recv queue if user space qp */ err = do_mmap_info(rxe, uresp ? &uresp->rq_mi : NULL, udata, qp->rq.queue->buf, qp->rq.queue->buf_size, &qp->rq.queue->ip); if (err) { rxe_err_qp(qp, "do_mmap_info failed, err = %d\n", err); goto err_free; } /* return actual capabilities to caller which may be larger * than requested */ init->cap.max_recv_wr = qp->rq.max_wr; return 0; err_free: vfree(qp->rq.queue->buf); kfree(qp->rq.queue); qp->rq.queue = NULL; err_out: return err; } static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, struct ib_qp_init_attr *init, struct ib_udata *udata, struct rxe_create_qp_resp __user *uresp) { int err; /* if we don't finish qp create make sure queue is valid */ skb_queue_head_init(&qp->resp_pkts); if (!qp->srq) { err = rxe_init_rq(qp, init, udata, uresp); if (err) return err; } rxe_init_task(&qp->recv_task, qp, rxe_receiver); qp->resp.opcode = OPCODE_NONE; qp->resp.msn = 0; return 0; } /* called by the create qp verb */ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, struct ib_qp_init_attr *init, struct rxe_create_qp_resp __user *uresp, struct ib_pd *ibpd, struct ib_udata *udata) { int err; struct rxe_cq *rcq = to_rcq(init->recv_cq); struct rxe_cq *scq = to_rcq(init->send_cq); struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL; unsigned long flags; rxe_get(pd); rxe_get(rcq); rxe_get(scq); if (srq) rxe_get(srq); qp->pd = pd; qp->rcq = rcq; qp->scq = scq; qp->srq = srq; atomic_inc(&rcq->num_wq); atomic_inc(&scq->num_wq); rxe_qp_init_misc(rxe, qp, init); err = rxe_qp_init_req(rxe, qp, init, udata, uresp); if (err) goto err1; err = rxe_qp_init_resp(rxe, qp, init, udata, uresp); if (err) goto err2; spin_lock_irqsave(&qp->state_lock, flags); qp->attr.qp_state = IB_QPS_RESET; qp->valid = 1; spin_unlock_irqrestore(&qp->state_lock, flags); return 0; err2: rxe_queue_cleanup(qp->sq.queue); qp->sq.queue = NULL; err1: atomic_dec(&rcq->num_wq); atomic_dec(&scq->num_wq); qp->pd = NULL; qp->rcq = NULL; qp->scq = NULL; qp->srq = NULL; if (srq) rxe_put(srq); rxe_put(scq); rxe_put(rcq); rxe_put(pd); return err; } /* called by the query qp verb */ int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init) { init->event_handler = qp->ibqp.event_handler; init->qp_context = qp->ibqp.qp_context; init->send_cq = qp->ibqp.send_cq; init->recv_cq = qp->ibqp.recv_cq; init->srq = qp->ibqp.srq; init->cap.max_send_wr = qp->sq.max_wr; init->cap.max_send_sge = qp->sq.max_sge; init->cap.max_inline_data = qp->sq.max_inline; if (!qp->srq) { init->cap.max_recv_wr = qp->rq.max_wr; init->cap.max_recv_sge = qp->rq.max_sge; } init->sq_sig_type = qp->sq_sig_type; init->qp_type = qp->ibqp.qp_type; init->port_num = 1; return 0; } int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) { if (mask & IB_QP_PORT) { if (!rdma_is_port_valid(&rxe->ib_dev, attr->port_num)) { rxe_dbg_qp(qp, "invalid port %d\n", attr->port_num); goto err1; } } if (mask & IB_QP_CAP && rxe_qp_chk_cap(rxe, &attr->cap, !!qp->srq)) goto err1; if (mask & IB_QP_ACCESS_FLAGS) { if (!(qp_type(qp) == IB_QPT_RC || qp_type(qp) == IB_QPT_UC)) goto err1; if (attr->qp_access_flags & ~RXE_ACCESS_SUPPORTED_QP) goto err1; } if (mask & IB_QP_AV && rxe_av_chk_attr(qp, &attr->ah_attr)) goto err1; if (mask & IB_QP_ALT_PATH) { if (rxe_av_chk_attr(qp, &attr->alt_ah_attr)) goto err1; if (!rdma_is_port_valid(&rxe->ib_dev, attr->alt_port_num)) { rxe_dbg_qp(qp, "invalid alt port %d\n", attr->alt_port_num); goto err1; } if (attr->alt_timeout > 31) { rxe_dbg_qp(qp, "invalid alt timeout %d > 31\n", attr->alt_timeout); goto err1; } } if (mask & IB_QP_PATH_MTU) { struct rxe_port *port = &rxe->port; enum ib_mtu max_mtu = port->attr.max_mtu; enum ib_mtu mtu = attr->path_mtu; if (mtu > max_mtu) { rxe_dbg_qp(qp, "invalid mtu (%d) > (%d)\n", ib_mtu_enum_to_int(mtu), ib_mtu_enum_to_int(max_mtu)); goto err1; } } if (mask & IB_QP_MAX_QP_RD_ATOMIC) { if (attr->max_rd_atomic > rxe->attr.max_qp_rd_atom) { rxe_dbg_qp(qp, "invalid max_rd_atomic %d > %d\n", attr->max_rd_atomic, rxe->attr.max_qp_rd_atom); goto err1; } } if (mask & IB_QP_TIMEOUT) { if (attr->timeout > 31) { rxe_dbg_qp(qp, "invalid timeout %d > 31\n", attr->timeout); goto err1; } } return 0; err1: return -EINVAL; } /* move the qp to the reset state */ static void rxe_qp_reset(struct rxe_qp *qp) { /* stop tasks from running */ rxe_disable_task(&qp->recv_task); rxe_disable_task(&qp->send_task); /* drain work and packet queuesc */ rxe_sender(qp); rxe_receiver(qp); if (qp->rq.queue) rxe_queue_reset(qp->rq.queue); if (qp->sq.queue) rxe_queue_reset(qp->sq.queue); /* cleanup attributes */ atomic_set(&qp->ssn, 0); qp->req.opcode = -1; qp->req.need_retry = 0; qp->req.wait_for_rnr_timer = 0; qp->req.noack_pkts = 0; qp->resp.msn = 0; qp->resp.opcode = -1; qp->resp.drop_msg = 0; qp->resp.goto_error = 0; qp->resp.sent_psn_nak = 0; if (qp->resp.mr) { rxe_put(qp->resp.mr); qp->resp.mr = NULL; } cleanup_rd_atomic_resources(qp); /* reenable tasks */ rxe_enable_task(&qp->recv_task); rxe_enable_task(&qp->send_task); } /* move the qp to the error state */ void rxe_qp_error(struct rxe_qp *qp) { unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); qp->attr.qp_state = IB_QPS_ERR; /* drain work and packet queues */ rxe_sched_task(&qp->recv_task); rxe_sched_task(&qp->send_task); spin_unlock_irqrestore(&qp->state_lock, flags); } static void rxe_qp_sqd(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) { unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); qp->attr.sq_draining = 1; rxe_sched_task(&qp->send_task); spin_unlock_irqrestore(&qp->state_lock, flags); } /* caller should hold qp->state_lock */ static int __qp_chk_state(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) { enum ib_qp_state cur_state; enum ib_qp_state new_state; cur_state = (mask & IB_QP_CUR_STATE) ? attr->cur_qp_state : qp->attr.qp_state; new_state = (mask & IB_QP_STATE) ? attr->qp_state : cur_state; if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask)) return -EINVAL; if (mask & IB_QP_STATE && cur_state == IB_QPS_SQD) { if (qp->attr.sq_draining && new_state != IB_QPS_ERR) return -EINVAL; } return 0; } static const char *const qps2str[] = { [IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR", [IB_QPS_RTS] = "RTS", [IB_QPS_SQD] = "SQD", [IB_QPS_SQE] = "SQE", [IB_QPS_ERR] = "ERR", }; /* called by the modify qp verb */ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, struct ib_udata *udata) { int err; if (mask & IB_QP_CUR_STATE) qp->attr.cur_qp_state = attr->qp_state; if (mask & IB_QP_STATE) { unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); err = __qp_chk_state(qp, attr, mask); if (!err) { qp->attr.qp_state = attr->qp_state; rxe_dbg_qp(qp, "state -> %s\n", qps2str[attr->qp_state]); } spin_unlock_irqrestore(&qp->state_lock, flags); if (err) return err; switch (attr->qp_state) { case IB_QPS_RESET: rxe_qp_reset(qp); break; case IB_QPS_SQD: rxe_qp_sqd(qp, attr, mask); break; case IB_QPS_ERR: rxe_qp_error(qp); break; default: break; } } if (mask & IB_QP_MAX_QP_RD_ATOMIC) { int max_rd_atomic = attr->max_rd_atomic ? roundup_pow_of_two(attr->max_rd_atomic) : 0; qp->attr.max_rd_atomic = max_rd_atomic; atomic_set(&qp->req.rd_atomic, max_rd_atomic); } if (mask & IB_QP_MAX_DEST_RD_ATOMIC) { int max_dest_rd_atomic = attr->max_dest_rd_atomic ? roundup_pow_of_two(attr->max_dest_rd_atomic) : 0; qp->attr.max_dest_rd_atomic = max_dest_rd_atomic; free_rd_atomic_resources(qp); err = alloc_rd_atomic_resources(qp, max_dest_rd_atomic); if (err) return err; } if (mask & IB_QP_EN_SQD_ASYNC_NOTIFY) qp->attr.en_sqd_async_notify = attr->en_sqd_async_notify; if (mask & IB_QP_ACCESS_FLAGS) qp->attr.qp_access_flags = attr->qp_access_flags; if (mask & IB_QP_PKEY_INDEX) qp->attr.pkey_index = attr->pkey_index; if (mask & IB_QP_PORT) qp->attr.port_num = attr->port_num; if (mask & IB_QP_QKEY) qp->attr.qkey = attr->qkey; if (mask & IB_QP_AV) rxe_init_av(&attr->ah_attr, &qp->pri_av); if (mask & IB_QP_ALT_PATH) { rxe_init_av(&attr->alt_ah_attr, &qp->alt_av); qp->attr.alt_port_num = attr->alt_port_num; qp->attr.alt_pkey_index = attr->alt_pkey_index; qp->attr.alt_timeout = attr->alt_timeout; } if (mask & IB_QP_PATH_MTU) { qp->attr.path_mtu = attr->path_mtu; qp->mtu = ib_mtu_enum_to_int(attr->path_mtu); } if (mask & IB_QP_TIMEOUT) { qp->attr.timeout = attr->timeout; if (attr->timeout == 0) { qp->qp_timeout_jiffies = 0; } else { /* According to the spec, timeout = 4.096 * 2 ^ attr->timeout [us] */ int j = nsecs_to_jiffies(4096ULL << attr->timeout); qp->qp_timeout_jiffies = j ? j : 1; } } if (mask & IB_QP_RETRY_CNT) { qp->attr.retry_cnt = attr->retry_cnt; qp->comp.retry_cnt = attr->retry_cnt; rxe_dbg_qp(qp, "set retry count = %d\n", attr->retry_cnt); } if (mask & IB_QP_RNR_RETRY) { qp->attr.rnr_retry = attr->rnr_retry; qp->comp.rnr_retry = attr->rnr_retry; rxe_dbg_qp(qp, "set rnr retry count = %d\n", attr->rnr_retry); } if (mask & IB_QP_RQ_PSN) { qp->attr.rq_psn = (attr->rq_psn & BTH_PSN_MASK); qp->resp.psn = qp->attr.rq_psn; rxe_dbg_qp(qp, "set resp psn = 0x%x\n", qp->resp.psn); } if (mask & IB_QP_MIN_RNR_TIMER) { qp->attr.min_rnr_timer = attr->min_rnr_timer; rxe_dbg_qp(qp, "set min rnr timer = 0x%x\n", attr->min_rnr_timer); } if (mask & IB_QP_SQ_PSN) { qp->attr.sq_psn = (attr->sq_psn & BTH_PSN_MASK); qp->req.psn = qp->attr.sq_psn; qp->comp.psn = qp->attr.sq_psn; rxe_dbg_qp(qp, "set req psn = 0x%x\n", qp->req.psn); } if (mask & IB_QP_PATH_MIG_STATE) qp->attr.path_mig_state = attr->path_mig_state; if (mask & IB_QP_DEST_QPN) qp->attr.dest_qp_num = attr->dest_qp_num; return 0; } /* called by the query qp verb */ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) { unsigned long flags; *attr = qp->attr; attr->rq_psn = qp->resp.psn; attr->sq_psn = qp->req.psn; attr->cap.max_send_wr = qp->sq.max_wr; attr->cap.max_send_sge = qp->sq.max_sge; attr->cap.max_inline_data = qp->sq.max_inline; if (!qp->srq) { attr->cap.max_recv_wr = qp->rq.max_wr; attr->cap.max_recv_sge = qp->rq.max_sge; } rxe_av_to_attr(&qp->pri_av, &attr->ah_attr); rxe_av_to_attr(&qp->alt_av, &attr->alt_ah_attr); /* Applications that get this state typically spin on it. * Yield the processor */ spin_lock_irqsave(&qp->state_lock, flags); attr->cur_qp_state = qp_state(qp); if (qp->attr.sq_draining) { spin_unlock_irqrestore(&qp->state_lock, flags); cond_resched(); } else { spin_unlock_irqrestore(&qp->state_lock, flags); } return 0; } int rxe_qp_chk_destroy(struct rxe_qp *qp) { /* See IBA o10-2.2.3 * An attempt to destroy a QP while attached to a mcast group * will fail immediately. */ if (atomic_read(&qp->mcg_num)) { rxe_dbg_qp(qp, "Attempt to destroy while attached to multicast group\n"); return -EBUSY; } return 0; } /* called when the last reference to the qp is dropped */ static void rxe_qp_do_cleanup(struct work_struct *work) { struct rxe_qp *qp = container_of(work, typeof(*qp), cleanup_work.work); unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); qp->valid = 0; spin_unlock_irqrestore(&qp->state_lock, flags); qp->qp_timeout_jiffies = 0; if (qp_type(qp) == IB_QPT_RC) { del_timer_sync(&qp->retrans_timer); del_timer_sync(&qp->rnr_nak_timer); } if (qp->recv_task.func) rxe_cleanup_task(&qp->recv_task); if (qp->send_task.func) rxe_cleanup_task(&qp->send_task); /* flush out any receive wr's or pending requests */ rxe_sender(qp); rxe_receiver(qp); if (qp->sq.queue) rxe_queue_cleanup(qp->sq.queue); if (qp->srq) rxe_put(qp->srq); if (qp->rq.queue) rxe_queue_cleanup(qp->rq.queue); if (qp->scq) { atomic_dec(&qp->scq->num_wq); rxe_put(qp->scq); } if (qp->rcq) { atomic_dec(&qp->rcq->num_wq); rxe_put(qp->rcq); } if (qp->pd) rxe_put(qp->pd); if (qp->resp.mr) rxe_put(qp->resp.mr); free_rd_atomic_resources(qp); if (qp->sk) { if (qp_type(qp) == IB_QPT_RC) sk_dst_reset(qp->sk->sk); kernel_sock_shutdown(qp->sk, SHUT_RDWR); sock_release(qp->sk); } } /* called when the last reference to the qp is dropped */ void rxe_qp_cleanup(struct rxe_pool_elem *elem) { struct rxe_qp *qp = container_of(elem, typeof(*qp), elem); execute_in_process_context(rxe_qp_do_cleanup, &qp->cleanup_work); }
127 42 1637 737 863 266 243 2 16 1 51 22 27 35 1640 1647 1654 1616 62 5 5 174 174 143 2541 1083 2618 2521 2541 2517 1068 1083 1076 242 984 487 1082 773 1888 1637 1543 117 1913 1570 175 74 1643 1899 1913 1650 1577 148 1 246 249 242 26 1577 244 1582 1655 1544 555 1591 230 1 883 146 192 193 731 732 1563 1350 782 773 1555 633 638 606 55 31 628 624 141 631 623 629 597 55 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 // SPDX-License-Identifier: GPL-2.0 /* * Kernel internal timers * * Copyright (C) 1991, 1992 Linus Torvalds * * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. * * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 * "A Kernel Model for Precision Timekeeping" by Dave Mills * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to * serialize accesses to xtime/lost_ticks). * Copyright (C) 1998 Andrea Arcangeli * 1999-03-10 Improved NTP compatibility by Ulrich Windl * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love * 2000-10-05 Implemented scalable SMP per-CPU timer handling. * Copyright (C) 2000, 2001, 2002 Ingo Molnar * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar */ #include <linux/kernel_stat.h> #include <linux/export.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/pid_namespace.h> #include <linux/notifier.h> #include <linux/thread_info.h> #include <linux/time.h> #include <linux/jiffies.h> #include <linux/posix-timers.h> #include <linux/cpu.h> #include <linux/syscalls.h> #include <linux/delay.h> #include <linux/tick.h> #include <linux/kallsyms.h> #include <linux/irq_work.h> #include <linux/sched/sysctl.h> #include <linux/sched/nohz.h> #include <linux/sched/debug.h> #include <linux/slab.h> #include <linux/compat.h> #include <linux/random.h> #include <linux/sysctl.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/div64.h> #include <asm/timex.h> #include <asm/io.h> #include "tick-internal.h" #include "timer_migration.h" #define CREATE_TRACE_POINTS #include <trace/events/timer.h> __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); /* * The timer wheel has LVL_DEPTH array levels. Each level provides an array of * LVL_SIZE buckets. Each level is driven by its own clock and therefore each * level has a different granularity. * * The level granularity is: LVL_CLK_DIV ^ level * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) * * The array level of a newly armed timer depends on the relative expiry * time. The farther the expiry time is away the higher the array level and * therefore the granularity becomes. * * Contrary to the original timer wheel implementation, which aims for 'exact' * expiry of the timers, this implementation removes the need for recascading * the timers into the lower array levels. The previous 'classic' timer wheel * implementation of the kernel already violated the 'exact' expiry by adding * slack to the expiry time to provide batched expiration. The granularity * levels provide implicit batching. * * This is an optimization of the original timer wheel implementation for the * majority of the timer wheel use cases: timeouts. The vast majority of * timeout timers (networking, disk I/O ...) are canceled before expiry. If * the timeout expires it indicates that normal operation is disturbed, so it * does not matter much whether the timeout comes with a slight delay. * * The only exception to this are networking timers with a small expiry * time. They rely on the granularity. Those fit into the first wheel level, * which has HZ granularity. * * We don't have cascading anymore. timers with a expiry time above the * capacity of the last wheel level are force expired at the maximum timeout * value of the last wheel level. From data sampling we know that the maximum * value observed is 5 days (network connection tracking), so this should not * be an issue. * * The currently chosen array constants values are a good compromise between * array size and granularity. * * This results in the following granularity and range levels: * * HZ 1000 steps * Level Offset Granularity Range * 0 0 1 ms 0 ms - 63 ms * 1 64 8 ms 64 ms - 511 ms * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s) * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s) * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m) * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m) * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h) * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d) * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d) * * HZ 300 * Level Offset Granularity Range * 0 0 3 ms 0 ms - 210 ms * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s) * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s) * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m) * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m) * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h) * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h) * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d) * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d) * * HZ 250 * Level Offset Granularity Range * 0 0 4 ms 0 ms - 255 ms * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s) * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s) * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m) * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m) * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h) * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d) * * HZ 100 * Level Offset Granularity Range * 0 0 10 ms 0 ms - 630 ms * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s) * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s) * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m) * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m) * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h) * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d) * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d) */ /* Clock divisor for the next level */ #define LVL_CLK_SHIFT 3 #define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT) #define LVL_CLK_MASK (LVL_CLK_DIV - 1) #define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT) #define LVL_GRAN(n) (1UL << LVL_SHIFT(n)) /* * The time start value for each level to select the bucket at enqueue * time. We start from the last possible delta of the previous level * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()). */ #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) /* Size of each clock level */ #define LVL_BITS 6 #define LVL_SIZE (1UL << LVL_BITS) #define LVL_MASK (LVL_SIZE - 1) #define LVL_OFFS(n) ((n) * LVL_SIZE) /* Level depth */ #if HZ > 100 # define LVL_DEPTH 9 # else # define LVL_DEPTH 8 #endif /* The cutoff (max. capacity of the wheel) */ #define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH)) #define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1)) /* * The resulting wheel size. If NOHZ is configured we allocate two * wheels so we have a separate storage for the deferrable timers. */ #define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH) #ifdef CONFIG_NO_HZ_COMMON /* * If multiple bases need to be locked, use the base ordering for lock * nesting, i.e. lowest number first. */ # define NR_BASES 3 # define BASE_LOCAL 0 # define BASE_GLOBAL 1 # define BASE_DEF 2 #else # define NR_BASES 1 # define BASE_LOCAL 0 # define BASE_GLOBAL 0 # define BASE_DEF 0 #endif /** * struct timer_base - Per CPU timer base (number of base depends on config) * @lock: Lock protecting the timer_base * @running_timer: When expiring timers, the lock is dropped. To make * sure not to race against deleting/modifying a * currently running timer, the pointer is set to the * timer, which expires at the moment. If no timer is * running, the pointer is NULL. * @expiry_lock: PREEMPT_RT only: Lock is taken in softirq around * timer expiry callback execution and when trying to * delete a running timer and it wasn't successful in * the first glance. It prevents priority inversion * when callback was preempted on a remote CPU and a * caller tries to delete the running timer. It also * prevents a life lock, when the task which tries to * delete a timer preempted the softirq thread which * is running the timer callback function. * @timer_waiters: PREEMPT_RT only: Tells, if there is a waiter * waiting for the end of the timer callback function * execution. * @clk: clock of the timer base; is updated before enqueue * of a timer; during expiry, it is 1 offset ahead of * jiffies to avoid endless requeuing to current * jiffies * @next_expiry: expiry value of the first timer; it is updated when * finding the next timer and during enqueue; the * value is not valid, when next_expiry_recalc is set * @cpu: Number of CPU the timer base belongs to * @next_expiry_recalc: States, whether a recalculation of next_expiry is * required. Value is set true, when a timer was * deleted. * @is_idle: Is set, when timer_base is idle. It is triggered by NOHZ * code. This state is only used in standard * base. Deferrable timers, which are enqueued remotely * never wake up an idle CPU. So no matter of supporting it * for this base. * @timers_pending: Is set, when a timer is pending in the base. It is only * reliable when next_expiry_recalc is not set. * @pending_map: bitmap of the timer wheel; each bit reflects a * bucket of the wheel. When a bit is set, at least a * single timer is enqueued in the related bucket. * @vectors: Array of lists; Each array member reflects a bucket * of the timer wheel. The list contains all timers * which are enqueued into a specific bucket. */ struct timer_base { raw_spinlock_t lock; struct timer_list *running_timer; #ifdef CONFIG_PREEMPT_RT spinlock_t expiry_lock; atomic_t timer_waiters; #endif unsigned long clk; unsigned long next_expiry; unsigned int cpu; bool next_expiry_recalc; bool is_idle; bool timers_pending; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); #ifdef CONFIG_NO_HZ_COMMON static DEFINE_STATIC_KEY_FALSE(timers_nohz_active); static DEFINE_MUTEX(timer_keys_mutex); static void timer_update_keys(struct work_struct *work); static DECLARE_WORK(timer_update_work, timer_update_keys); #ifdef CONFIG_SMP static unsigned int sysctl_timer_migration = 1; DEFINE_STATIC_KEY_FALSE(timers_migration_enabled); static void timers_update_migration(void) { if (sysctl_timer_migration && tick_nohz_active) static_branch_enable(&timers_migration_enabled); else static_branch_disable(&timers_migration_enabled); } #ifdef CONFIG_SYSCTL static int timer_migration_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; mutex_lock(&timer_keys_mutex); ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) timers_update_migration(); mutex_unlock(&timer_keys_mutex); return ret; } static struct ctl_table timer_sysctl[] = { { .procname = "timer_migration", .data = &sysctl_timer_migration, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = timer_migration_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; static int __init timer_sysctl_init(void) { register_sysctl("kernel", timer_sysctl); return 0; } device_initcall(timer_sysctl_init); #endif /* CONFIG_SYSCTL */ #else /* CONFIG_SMP */ static inline void timers_update_migration(void) { } #endif /* !CONFIG_SMP */ static void timer_update_keys(struct work_struct *work) { mutex_lock(&timer_keys_mutex); timers_update_migration(); static_branch_enable(&timers_nohz_active); mutex_unlock(&timer_keys_mutex); } void timers_update_nohz(void) { schedule_work(&timer_update_work); } static inline bool is_timers_nohz_active(void) { return static_branch_unlikely(&timers_nohz_active); } #else static inline bool is_timers_nohz_active(void) { return false; } #endif /* NO_HZ_COMMON */ static unsigned long round_jiffies_common(unsigned long j, int cpu, bool force_up) { int rem; unsigned long original = j; /* * We don't want all cpus firing their timers at once hitting the * same lock or cachelines, so we skew each extra cpu with an extra * 3 jiffies. This 3 jiffies came originally from the mm/ code which * already did this. * The skew is done by adding 3*cpunr, then round, then subtract this * extra offset again. */ j += cpu * 3; rem = j % HZ; /* * If the target jiffy is just after a whole second (which can happen * due to delays of the timer irq, long irq off times etc etc) then * we should round down to the whole second, not up. Use 1/4th second * as cutoff for this rounding as an extreme upper bound for this. * But never round down if @force_up is set. */ if (rem < HZ/4 && !force_up) /* round down */ j = j - rem; else /* round up */ j = j - rem + HZ; /* now that we have rounded, subtract the extra skew again */ j -= cpu * 3; /* * Make sure j is still in the future. Otherwise return the * unmodified value. */ return time_is_after_jiffies(j) ? j : original; } /** * __round_jiffies - function to round jiffies to a full second * @j: the time in (absolute) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * * __round_jiffies() rounds an absolute time in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The exact rounding is skewed for each processor to avoid all * processors firing at the exact same time, which could lead * to lock contention or spurious cache line bouncing. * * The return value is the rounded version of the @j parameter. */ unsigned long __round_jiffies(unsigned long j, int cpu) { return round_jiffies_common(j, cpu, false); } EXPORT_SYMBOL_GPL(__round_jiffies); /** * __round_jiffies_relative - function to round jiffies to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * * __round_jiffies_relative() rounds a time delta in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The exact rounding is skewed for each processor to avoid all * processors firing at the exact same time, which could lead * to lock contention or spurious cache line bouncing. * * The return value is the rounded version of the @j parameter. */ unsigned long __round_jiffies_relative(unsigned long j, int cpu) { unsigned long j0 = jiffies; /* Use j0 because jiffies might change while we run */ return round_jiffies_common(j + j0, cpu, false) - j0; } EXPORT_SYMBOL_GPL(__round_jiffies_relative); /** * round_jiffies - function to round jiffies to a full second * @j: the time in (absolute) jiffies that should be rounded * * round_jiffies() rounds an absolute time in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The return value is the rounded version of the @j parameter. */ unsigned long round_jiffies(unsigned long j) { return round_jiffies_common(j, raw_smp_processor_id(), false); } EXPORT_SYMBOL_GPL(round_jiffies); /** * round_jiffies_relative - function to round jiffies to a full second * @j: the time in (relative) jiffies that should be rounded * * round_jiffies_relative() rounds a time delta in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The return value is the rounded version of the @j parameter. */ unsigned long round_jiffies_relative(unsigned long j) { return __round_jiffies_relative(j, raw_smp_processor_id()); } EXPORT_SYMBOL_GPL(round_jiffies_relative); /** * __round_jiffies_up - function to round jiffies up to a full second * @j: the time in (absolute) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * * This is the same as __round_jiffies() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. */ unsigned long __round_jiffies_up(unsigned long j, int cpu) { return round_jiffies_common(j, cpu, true); } EXPORT_SYMBOL_GPL(__round_jiffies_up); /** * __round_jiffies_up_relative - function to round jiffies up to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * * This is the same as __round_jiffies_relative() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. */ unsigned long __round_jiffies_up_relative(unsigned long j, int cpu) { unsigned long j0 = jiffies; /* Use j0 because jiffies might change while we run */ return round_jiffies_common(j + j0, cpu, true) - j0; } EXPORT_SYMBOL_GPL(__round_jiffies_up_relative); /** * round_jiffies_up - function to round jiffies up to a full second * @j: the time in (absolute) jiffies that should be rounded * * This is the same as round_jiffies() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. */ unsigned long round_jiffies_up(unsigned long j) { return round_jiffies_common(j, raw_smp_processor_id(), true); } EXPORT_SYMBOL_GPL(round_jiffies_up); /** * round_jiffies_up_relative - function to round jiffies up to a full second * @j: the time in (relative) jiffies that should be rounded * * This is the same as round_jiffies_relative() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. */ unsigned long round_jiffies_up_relative(unsigned long j) { return __round_jiffies_up_relative(j, raw_smp_processor_id()); } EXPORT_SYMBOL_GPL(round_jiffies_up_relative); static inline unsigned int timer_get_idx(struct timer_list *timer) { return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT; } static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) { timer->flags = (timer->flags & ~TIMER_ARRAYMASK) | idx << TIMER_ARRAYSHIFT; } /* * Helper function to calculate the array index for a given expiry * time. */ static inline unsigned calc_index(unsigned long expires, unsigned lvl, unsigned long *bucket_expiry) { /* * The timer wheel has to guarantee that a timer does not fire * early. Early expiry can happen due to: * - Timer is armed at the edge of a tick * - Truncation of the expiry time in the outer wheel levels * * Round up with level granularity to prevent this. */ expires = (expires >> LVL_SHIFT(lvl)) + 1; *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); } static int calc_wheel_index(unsigned long expires, unsigned long clk, unsigned long *bucket_expiry) { unsigned long delta = expires - clk; unsigned int idx; if (delta < LVL_START(1)) { idx = calc_index(expires, 0, bucket_expiry); } else if (delta < LVL_START(2)) { idx = calc_index(expires, 1, bucket_expiry); } else if (delta < LVL_START(3)) { idx = calc_index(expires, 2, bucket_expiry); } else if (delta < LVL_START(4)) { idx = calc_index(expires, 3, bucket_expiry); } else if (delta < LVL_START(5)) { idx = calc_index(expires, 4, bucket_expiry); } else if (delta < LVL_START(6)) { idx = calc_index(expires, 5, bucket_expiry); } else if (delta < LVL_START(7)) { idx = calc_index(expires, 6, bucket_expiry); } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { idx = calc_index(expires, 7, bucket_expiry); } else if ((long) delta < 0) { idx = clk & LVL_MASK; *bucket_expiry = clk; } else { /* * Force expire obscene large timeouts to expire at the * capacity limit of the wheel. */ if (delta >= WHEEL_TIMEOUT_CUTOFF) expires = clk + WHEEL_TIMEOUT_MAX; idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry); } return idx; } static void trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) { /* * Deferrable timers do not prevent the CPU from entering dynticks and * are not taken into account on the idle/nohz_full path. An IPI when a * new deferrable timer is enqueued will wake up the remote CPU but * nothing will be done with the deferrable timer base. Therefore skip * the remote IPI for deferrable timers completely. */ if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE) return; /* * We might have to IPI the remote CPU if the base is idle and the * timer is pinned. If it is a non pinned timer, it is only queued * on the remote CPU, when timer was running during queueing. Then * everything is handled by remote CPU anyway. If the other CPU is * on the way to idle then it can't set base->is_idle as we hold * the base lock: */ if (base->is_idle) { WARN_ON_ONCE(!(timer->flags & TIMER_PINNED || tick_nohz_full_cpu(base->cpu))); wake_up_nohz_cpu(base->cpu); } } /* * Enqueue the timer into the hash bucket, mark it pending in * the bitmap, store the index in the timer flags then wake up * the target CPU if needed. */ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, unsigned int idx, unsigned long bucket_expiry) { hlist_add_head(&timer->entry, base->vectors + idx); __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); trace_timer_start(timer, bucket_expiry); /* * Check whether this is the new first expiring timer. The * effective expiry time of the timer is required here * (bucket_expiry) instead of timer->expires. */ if (time_before(bucket_expiry, base->next_expiry)) { /* * Set the next expiry time and kick the CPU so it * can reevaluate the wheel: */ WRITE_ONCE(base->next_expiry, bucket_expiry); base->timers_pending = true; base->next_expiry_recalc = false; trigger_dyntick_cpu(base, timer); } } static void internal_add_timer(struct timer_base *base, struct timer_list *timer) { unsigned long bucket_expiry; unsigned int idx; idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry); enqueue_timer(base, timer, idx, bucket_expiry); } #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static const struct debug_obj_descr timer_debug_descr; struct timer_hint { void (*function)(struct timer_list *t); long offset; }; #define TIMER_HINT(fn, container, timr, hintfn) \ { \ .function = fn, \ .offset = offsetof(container, hintfn) - \ offsetof(container, timr) \ } static const struct timer_hint timer_hints[] = { TIMER_HINT(delayed_work_timer_fn, struct delayed_work, timer, work.func), TIMER_HINT(kthread_delayed_work_timer_fn, struct kthread_delayed_work, timer, work.func), }; static void *timer_debug_hint(void *addr) { struct timer_list *timer = addr; int i; for (i = 0; i < ARRAY_SIZE(timer_hints); i++) { if (timer_hints[i].function == timer->function) { void (**fn)(void) = addr + timer_hints[i].offset; return *fn; } } return timer->function; } static bool timer_is_static_object(void *addr) { struct timer_list *timer = addr; return (timer->entry.pprev == NULL && timer->entry.next == TIMER_ENTRY_STATIC); } /* * timer_fixup_init is called when: * - an active object is initialized */ static bool timer_fixup_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: del_timer_sync(timer); debug_object_init(timer, &timer_debug_descr); return true; default: return false; } } /* Stub timer callback for improperly used timers. */ static void stub_timer(struct timer_list *unused) { WARN_ON(1); } /* * timer_fixup_activate is called when: * - an active object is activated * - an unknown non-static object is activated */ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_NOTAVAILABLE: timer_setup(timer, stub_timer, 0); return true; case ODEBUG_STATE_ACTIVE: WARN_ON(1); fallthrough; default: return false; } } /* * timer_fixup_free is called when: * - an active object is freed */ static bool timer_fixup_free(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: del_timer_sync(timer); debug_object_free(timer, &timer_debug_descr); return true; default: return false; } } /* * timer_fixup_assert_init is called when: * - an untracked/uninit-ed object is found */ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_NOTAVAILABLE: timer_setup(timer, stub_timer, 0); return true; default: return false; } } static const struct debug_obj_descr timer_debug_descr = { .name = "timer_list", .debug_hint = timer_debug_hint, .is_static_object = timer_is_static_object, .fixup_init = timer_fixup_init, .fixup_activate = timer_fixup_activate, .fixup_free = timer_fixup_free, .fixup_assert_init = timer_fixup_assert_init, }; static inline void debug_timer_init(struct timer_list *timer) { debug_object_init(timer, &timer_debug_descr); } static inline void debug_timer_activate(struct timer_list *timer) { debug_object_activate(timer, &timer_debug_descr); } static inline void debug_timer_deactivate(struct timer_list *timer) { debug_object_deactivate(timer, &timer_debug_descr); } static inline void debug_timer_assert_init(struct timer_list *timer) { debug_object_assert_init(timer, &timer_debug_descr); } static void do_init_timer(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key); void init_timer_on_stack_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_object_init_on_stack(timer, &timer_debug_descr); do_init_timer(timer, func, flags, name, key); } EXPORT_SYMBOL_GPL(init_timer_on_stack_key); void destroy_timer_on_stack(struct timer_list *timer) { debug_object_free(timer, &timer_debug_descr); } EXPORT_SYMBOL_GPL(destroy_timer_on_stack); #else static inline void debug_timer_init(struct timer_list *timer) { } static inline void debug_timer_activate(struct timer_list *timer) { } static inline void debug_timer_deactivate(struct timer_list *timer) { } static inline void debug_timer_assert_init(struct timer_list *timer) { } #endif static inline void debug_init(struct timer_list *timer) { debug_timer_init(timer); trace_timer_init(timer); } static inline void debug_deactivate(struct timer_list *timer) { debug_timer_deactivate(timer); trace_timer_cancel(timer); } static inline void debug_assert_init(struct timer_list *timer) { debug_timer_assert_init(timer); } static void do_init_timer(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { timer->entry.pprev = NULL; timer->function = func; if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS)) flags &= TIMER_INIT_FLAGS; timer->flags = flags | raw_smp_processor_id(); lockdep_init_map(&timer->lockdep_map, name, key, 0); } /** * init_timer_key - initialize a timer * @timer: the timer to be initialized * @func: timer callback function * @flags: timer flags * @name: name of the timer * @key: lockdep class key of the fake lock used for tracking timer * sync lock dependencies * * init_timer_key() must be done to a timer prior to calling *any* of the * other timer functions. */ void init_timer_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_init(timer); do_init_timer(timer, func, flags, name, key); } EXPORT_SYMBOL(init_timer_key); static inline void detach_timer(struct timer_list *timer, bool clear_pending) { struct hlist_node *entry = &timer->entry; debug_deactivate(timer); __hlist_del(entry); if (clear_pending) entry->pprev = NULL; entry->next = LIST_POISON2; } static int detach_if_pending(struct timer_list *timer, struct timer_base *base, bool clear_pending) { unsigned idx = timer_get_idx(timer); if (!timer_pending(timer)) return 0; if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) { __clear_bit(idx, base->pending_map); base->next_expiry_recalc = true; } detach_timer(timer, clear_pending); return 1; } static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) { int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; struct timer_base *base; base = per_cpu_ptr(&timer_bases[index], cpu); /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); return base; } static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) { int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; struct timer_base *base; base = this_cpu_ptr(&timer_bases[index]); /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) base = this_cpu_ptr(&timer_bases[BASE_DEF]); return base; } static inline struct timer_base *get_timer_base(u32 tflags) { return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); } static inline void __forward_timer_base(struct timer_base *base, unsigned long basej) { /* * Check whether we can forward the base. We can only do that when * @basej is past base->clk otherwise we might rewind base->clk. */ if (time_before_eq(basej, base->clk)) return; /* * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ if (time_after(base->next_expiry, basej)) { base->clk = basej; } else { if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk))) return; base->clk = base->next_expiry; } } static inline void forward_timer_base(struct timer_base *base) { __forward_timer_base(base, READ_ONCE(jiffies)); } /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means * that all timers which are tied to this base are locked, and the base itself * is locked too. * * So __run_timers/migrate_timers can safely modify all timers which could * be found in the base->vectors array. * * When a timer is migrating then the TIMER_MIGRATING flag is set and we need * to wait until the migration is done. */ static struct timer_base *lock_timer_base(struct timer_list *timer, unsigned long *flags) __acquires(timer->base->lock) { for (;;) { struct timer_base *base; u32 tf; /* * We need to use READ_ONCE() here, otherwise the compiler * might re-read @tf between the check for TIMER_MIGRATING * and spin_lock(). */ tf = READ_ONCE(timer->flags); if (!(tf & TIMER_MIGRATING)) { base = get_timer_base(tf); raw_spin_lock_irqsave(&base->lock, *flags); if (timer->flags == tf) return base; raw_spin_unlock_irqrestore(&base->lock, *flags); } cpu_relax(); } } #define MOD_TIMER_PENDING_ONLY 0x01 #define MOD_TIMER_REDUCE 0x02 #define MOD_TIMER_NOTPENDING 0x04 static inline int __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) { unsigned long clk = 0, flags, bucket_expiry; struct timer_base *base, *new_base; unsigned int idx = UINT_MAX; int ret = 0; debug_assert_init(timer); /* * This is a common optimization triggered by the networking code - if * the timer is re-modified to have the same timeout or ends up in the * same array bucket then just return: */ if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) { /* * The downside of this optimization is that it can result in * larger granularity than you would get from adding a new * timer with this expiry. */ long diff = timer->expires - expires; if (!diff) return 1; if (options & MOD_TIMER_REDUCE && diff <= 0) return 1; /* * We lock timer base and calculate the bucket index right * here. If the timer ends up in the same bucket, then we * just update the expiry time and avoid the whole * dequeue/enqueue dance. */ base = lock_timer_base(timer, &flags); /* * Has @timer been shutdown? This needs to be evaluated * while holding base lock to prevent a race against the * shutdown code. */ if (!timer->function) goto out_unlock; forward_timer_base(base); if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) && time_before_eq(timer->expires, expires)) { ret = 1; goto out_unlock; } clk = base->clk; idx = calc_wheel_index(expires, clk, &bucket_expiry); /* * Retrieve and compare the array index of the pending * timer. If it matches set the expiry to the new value so a * subsequent call will exit in the expires check above. */ if (idx == timer_get_idx(timer)) { if (!(options & MOD_TIMER_REDUCE)) timer->expires = expires; else if (time_after(timer->expires, expires)) timer->expires = expires; ret = 1; goto out_unlock; } } else { base = lock_timer_base(timer, &flags); /* * Has @timer been shutdown? This needs to be evaluated * while holding base lock to prevent a race against the * shutdown code. */ if (!timer->function) goto out_unlock; forward_timer_base(base); } ret = detach_if_pending(timer, base, false); if (!ret && (options & MOD_TIMER_PENDING_ONLY)) goto out_unlock; new_base = get_timer_this_cpu_base(timer->flags); if (base != new_base) { /* * We are trying to schedule the timer on the new base. * However we can't change timer's base while it is running, * otherwise timer_delete_sync() can't detect that the timer's * handler yet has not finished. This also guarantees that the * timer is serialized wrt itself. */ if (likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ timer->flags |= TIMER_MIGRATING; raw_spin_unlock(&base->lock); base = new_base; raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | base->cpu); forward_timer_base(base); } } debug_timer_activate(timer); timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance * between calculating 'idx' and possibly switching the base, only * enqueue_timer() is required. Otherwise we need to (re)calculate * the wheel index via internal_add_timer(). */ if (idx != UINT_MAX && clk == base->clk) enqueue_timer(base, timer, idx, bucket_expiry); else internal_add_timer(base, timer); out_unlock: raw_spin_unlock_irqrestore(&base->lock, flags); return ret; } /** * mod_timer_pending - Modify a pending timer's timeout * @timer: The pending timer to be modified * @expires: New absolute timeout in jiffies * * mod_timer_pending() is the same for pending timers as mod_timer(), but * will not activate inactive timers. * * If @timer->function == NULL then the start operation is silently * discarded. * * Return: * * %0 - The timer was inactive and not modified or was in * shutdown state and the operation was discarded * * %1 - The timer was active and requeued to expire at @expires */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY); } EXPORT_SYMBOL(mod_timer_pending); /** * mod_timer - Modify a timer's timeout * @timer: The timer to be modified * @expires: New absolute timeout in jiffies * * mod_timer(timer, expires) is equivalent to: * * del_timer(timer); timer->expires = expires; add_timer(timer); * * mod_timer() is more efficient than the above open coded sequence. In * case that the timer is inactive, the del_timer() part is a NOP. The * timer is in any case activated with the new expiry time @expires. * * Note that if there are multiple unserialized concurrent users of the * same timer, then mod_timer() is the only safe way to modify the timeout, * since add_timer() cannot modify an already running timer. * * If @timer->function == NULL then the start operation is silently * discarded. In this case the return value is 0 and meaningless. * * Return: * * %0 - The timer was inactive and started or was in shutdown * state and the operation was discarded * * %1 - The timer was active and requeued to expire at @expires or * the timer was active and not modified because @expires did * not change the effective expiry time */ int mod_timer(struct timer_list *timer, unsigned long expires) { return __mod_timer(timer, expires, 0); } EXPORT_SYMBOL(mod_timer); /** * timer_reduce - Modify a timer's timeout if it would reduce the timeout * @timer: The timer to be modified * @expires: New absolute timeout in jiffies * * timer_reduce() is very similar to mod_timer(), except that it will only * modify an enqueued timer if that would reduce the expiration time. If * @timer is not enqueued it starts the timer. * * If @timer->function == NULL then the start operation is silently * discarded. * * Return: * * %0 - The timer was inactive and started or was in shutdown * state and the operation was discarded * * %1 - The timer was active and requeued to expire at @expires or * the timer was active and not modified because @expires * did not change the effective expiry time such that the * timer would expire earlier than already scheduled */ int timer_reduce(struct timer_list *timer, unsigned long expires) { return __mod_timer(timer, expires, MOD_TIMER_REDUCE); } EXPORT_SYMBOL(timer_reduce); /** * add_timer - Start a timer * @timer: The timer to be started * * Start @timer to expire at @timer->expires in the future. @timer->expires * is the absolute expiry time measured in 'jiffies'. When the timer expires * timer->function(timer) will be invoked from soft interrupt context. * * The @timer->expires and @timer->function fields must be set prior * to calling this function. * * If @timer->function == NULL then the start operation is silently * discarded. * * If @timer->expires is already in the past @timer will be queued to * expire at the next timer tick. * * This can only operate on an inactive timer. Attempts to invoke this on * an active timer are rejected with a warning. */ void add_timer(struct timer_list *timer) { if (WARN_ON_ONCE(timer_pending(timer))) return; __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer); /** * add_timer_local() - Start a timer on the local CPU * @timer: The timer to be started * * Same as add_timer() except that the timer flag TIMER_PINNED is set. * * See add_timer() for further details. */ void add_timer_local(struct timer_list *timer) { if (WARN_ON_ONCE(timer_pending(timer))) return; timer->flags |= TIMER_PINNED; __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer_local); /** * add_timer_global() - Start a timer without TIMER_PINNED flag set * @timer: The timer to be started * * Same as add_timer() except that the timer flag TIMER_PINNED is unset. * * See add_timer() for further details. */ void add_timer_global(struct timer_list *timer) { if (WARN_ON_ONCE(timer_pending(timer))) return; timer->flags &= ~TIMER_PINNED; __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer_global); /** * add_timer_on - Start a timer on a particular CPU * @timer: The timer to be started * @cpu: The CPU to start it on * * Same as add_timer() except that it starts the timer on the given CPU and * the TIMER_PINNED flag is set. When timer shouldn't be a pinned timer in * the next round, add_timer_global() should be used instead as it unsets * the TIMER_PINNED flag. * * See add_timer() for further details. */ void add_timer_on(struct timer_list *timer, int cpu) { struct timer_base *new_base, *base; unsigned long flags; debug_assert_init(timer); if (WARN_ON_ONCE(timer_pending(timer))) return; /* Make sure timer flags have TIMER_PINNED flag set */ timer->flags |= TIMER_PINNED; new_base = get_timer_cpu_base(timer->flags, cpu); /* * If @timer was on a different CPU, it should be migrated with the * old base locked to prevent other operations proceeding with the * wrong base locked. See lock_timer_base(). */ base = lock_timer_base(timer, &flags); /* * Has @timer been shutdown? This needs to be evaluated while * holding base lock to prevent a race against the shutdown code. */ if (!timer->function) goto out_unlock; if (base != new_base) { timer->flags |= TIMER_MIGRATING; raw_spin_unlock(&base->lock); base = new_base; raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | cpu); } forward_timer_base(base); debug_timer_activate(timer); internal_add_timer(base, timer); out_unlock: raw_spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); /** * __timer_delete - Internal function: Deactivate a timer * @timer: The timer to be deactivated * @shutdown: If true, this indicates that the timer is about to be * shutdown permanently. * * If @shutdown is true then @timer->function is set to NULL under the * timer base lock which prevents further rearming of the time. In that * case any attempt to rearm @timer after this function returns will be * silently ignored. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated */ static int __timer_delete(struct timer_list *timer, bool shutdown) { struct timer_base *base; unsigned long flags; int ret = 0; debug_assert_init(timer); /* * If @shutdown is set then the lock has to be taken whether the * timer is pending or not to protect against a concurrent rearm * which might hit between the lockless pending check and the lock * acquisition. By taking the lock it is ensured that such a newly * enqueued timer is dequeued and cannot end up with * timer->function == NULL in the expiry code. * * If timer->function is currently executed, then this makes sure * that the callback cannot requeue the timer. */ if (timer_pending(timer) || shutdown) { base = lock_timer_base(timer, &flags); ret = detach_if_pending(timer, base, true); if (shutdown) timer->function = NULL; raw_spin_unlock_irqrestore(&base->lock, flags); } return ret; } /** * timer_delete - Deactivate a timer * @timer: The timer to be deactivated * * The function only deactivates a pending timer, but contrary to * timer_delete_sync() it does not take into account whether the timer's * callback function is concurrently executed on a different CPU or not. * It neither prevents rearming of the timer. If @timer can be rearmed * concurrently then the return value of this function is meaningless. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated */ int timer_delete(struct timer_list *timer) { return __timer_delete(timer, false); } EXPORT_SYMBOL(timer_delete); /** * timer_shutdown - Deactivate a timer and prevent rearming * @timer: The timer to be deactivated * * The function does not wait for an eventually running timer callback on a * different CPU but it prevents rearming of the timer. Any attempt to arm * @timer after this function returns will be silently ignored. * * This function is useful for teardown code and should only be used when * timer_shutdown_sync() cannot be invoked due to locking or context constraints. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending */ int timer_shutdown(struct timer_list *timer) { return __timer_delete(timer, true); } EXPORT_SYMBOL_GPL(timer_shutdown); /** * __try_to_del_timer_sync - Internal function: Try to deactivate a timer * @timer: Timer to deactivate * @shutdown: If true, this indicates that the timer is about to be * shutdown permanently. * * If @shutdown is true then @timer->function is set to NULL under the * timer base lock which prevents further rearming of the timer. Any * attempt to rearm @timer after this function returns will be silently * ignored. * * This function cannot guarantee that the timer cannot be rearmed * right after dropping the base lock if @shutdown is false. That * needs to be prevented by the calling code if necessary. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated * * %-1 - The timer callback function is running on a different CPU */ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) { struct timer_base *base; unsigned long flags; int ret = -1; debug_assert_init(timer); base = lock_timer_base(timer, &flags); if (base->running_timer != timer) ret = detach_if_pending(timer, base, true); if (shutdown) timer->function = NULL; raw_spin_unlock_irqrestore(&base->lock, flags); return ret; } /** * try_to_del_timer_sync - Try to deactivate a timer * @timer: Timer to deactivate * * This function tries to deactivate a timer. On success the timer is not * queued and the timer callback function is not running on any CPU. * * This function does not guarantee that the timer cannot be rearmed right * after dropping the base lock. That needs to be prevented by the calling * code if necessary. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated * * %-1 - The timer callback function is running on a different CPU */ int try_to_del_timer_sync(struct timer_list *timer) { return __try_to_del_timer_sync(timer, false); } EXPORT_SYMBOL(try_to_del_timer_sync); #ifdef CONFIG_PREEMPT_RT static __init void timer_base_init_expiry_lock(struct timer_base *base) { spin_lock_init(&base->expiry_lock); } static inline void timer_base_lock_expiry(struct timer_base *base) { spin_lock(&base->expiry_lock); } static inline void timer_base_unlock_expiry(struct timer_base *base) { spin_unlock(&base->expiry_lock); } /* * The counterpart to del_timer_wait_running(). * * If there is a waiter for base->expiry_lock, then it was waiting for the * timer callback to finish. Drop expiry_lock and reacquire it. That allows * the waiter to acquire the lock and make progress. */ static void timer_sync_wait_running(struct timer_base *base) __releases(&base->lock) __releases(&base->expiry_lock) __acquires(&base->expiry_lock) __acquires(&base->lock) { if (atomic_read(&base->timer_waiters)) { raw_spin_unlock_irq(&base->lock); spin_unlock(&base->expiry_lock); spin_lock(&base->expiry_lock); raw_spin_lock_irq(&base->lock); } } /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was * running. * * This prevents priority inversion, if the softirq thread on a remote CPU * got preempted, and it prevents a life lock when the task which tries to * delete a timer preempted the softirq thread running the timer callback * function. */ static void del_timer_wait_running(struct timer_list *timer) { u32 tf; tf = READ_ONCE(timer->flags); if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) { struct timer_base *base = get_timer_base(tf); /* * Mark the base as contended and grab the expiry lock, * which is held by the softirq across the timer * callback. Drop the lock immediately so the softirq can * expire the next timer. In theory the timer could already * be running again, but that's more than unlikely and just * causes another wait loop. */ atomic_inc(&base->timer_waiters); spin_lock_bh(&base->expiry_lock); atomic_dec(&base->timer_waiters); spin_unlock_bh(&base->expiry_lock); } } #else static inline void timer_base_init_expiry_lock(struct timer_base *base) { } static inline void timer_base_lock_expiry(struct timer_base *base) { } static inline void timer_base_unlock_expiry(struct timer_base *base) { } static inline void timer_sync_wait_running(struct timer_base *base) { } static inline void del_timer_wait_running(struct timer_list *timer) { } #endif /** * __timer_delete_sync - Internal function: Deactivate a timer and wait * for the handler to finish. * @timer: The timer to be deactivated * @shutdown: If true, @timer->function will be set to NULL under the * timer base lock which prevents rearming of @timer * * If @shutdown is not set the timer can be rearmed later. If the timer can * be rearmed concurrently, i.e. after dropping the base lock then the * return value is meaningless. * * If @shutdown is set then @timer->function is set to NULL under timer * base lock which prevents rearming of the timer. Any attempt to rearm * a shutdown timer is silently ignored. * * If the timer should be reused after shutdown it has to be initialized * again. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated */ static int __timer_delete_sync(struct timer_list *timer, bool shutdown) { int ret; #ifdef CONFIG_LOCKDEP unsigned long flags; /* * If lockdep gives a backtrace here, please reference * the synchronization rules above. */ local_irq_save(flags); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); local_irq_restore(flags); #endif /* * don't use it in hardirq context, because it * could lead to deadlock. */ WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE)); /* * Must be able to sleep on PREEMPT_RT because of the slowpath in * del_timer_wait_running(). */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE)) lockdep_assert_preemption_enabled(); do { ret = __try_to_del_timer_sync(timer, shutdown); if (unlikely(ret < 0)) { del_timer_wait_running(timer); cpu_relax(); } } while (ret < 0); return ret; } /** * timer_delete_sync - Deactivate a timer and wait for the handler to finish. * @timer: The timer to be deactivated * * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from * interrupt contexts unless the timer is an irqsafe one. The caller must * not hold locks which would prevent completion of the timer's callback * function. The timer's handler must not call add_timer_on(). Upon exit * the timer is not queued and the handler is not running on any CPU. * * For !irqsafe timers, the caller must not hold locks that are held in * interrupt context. Even if the lock has nothing to do with the timer in * question. Here's why:: * * CPU0 CPU1 * ---- ---- * <SOFTIRQ> * call_timer_fn(); * base->running_timer = mytimer; * spin_lock_irq(somelock); * <IRQ> * spin_lock(somelock); * timer_delete_sync(mytimer); * while (base->running_timer == mytimer); * * Now timer_delete_sync() will never return and never release somelock. * The interrupt on the other CPU is waiting to grab somelock but it has * interrupted the softirq that CPU0 is waiting to finish. * * This function cannot guarantee that the timer is not rearmed again by * some concurrent or preempting code, right after it dropped the base * lock. If there is the possibility of a concurrent rearm then the return * value of the function is meaningless. * * If such a guarantee is needed, e.g. for teardown situations then use * timer_shutdown_sync() instead. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated */ int timer_delete_sync(struct timer_list *timer) { return __timer_delete_sync(timer, false); } EXPORT_SYMBOL(timer_delete_sync); /** * timer_shutdown_sync - Shutdown a timer and prevent rearming * @timer: The timer to be shutdown * * When the function returns it is guaranteed that: * - @timer is not queued * - The callback function of @timer is not running * - @timer cannot be enqueued again. Any attempt to rearm * @timer is silently ignored. * * See timer_delete_sync() for synchronization rules. * * This function is useful for final teardown of an infrastructure where * the timer is subject to a circular dependency problem. * * A common pattern for this is a timer and a workqueue where the timer can * schedule work and work can arm the timer. On shutdown the workqueue must * be destroyed and the timer must be prevented from rearming. Unless the * code has conditionals like 'if (mything->in_shutdown)' to prevent that * there is no way to get this correct with timer_delete_sync(). * * timer_shutdown_sync() is solving the problem. The correct ordering of * calls in this case is: * * timer_shutdown_sync(&mything->timer); * workqueue_destroy(&mything->workqueue); * * After this 'mything' can be safely freed. * * This obviously implies that the timer is not required to be functional * for the rest of the shutdown operation. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending */ int timer_shutdown_sync(struct timer_list *timer) { return __timer_delete_sync(timer, true); } EXPORT_SYMBOL_GPL(timer_shutdown_sync); static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *), unsigned long baseclk) { int count = preempt_count(); #ifdef CONFIG_LOCKDEP /* * It is permissible to free the timer from inside the * function that is called from it, this we need to take into * account for lockdep too. To avoid bogus "held lock freed" * warnings as well as problems when looking into * timer->lockdep_map, make a copy and use that here. */ struct lockdep_map lockdep_map; lockdep_copy_map(&lockdep_map, &timer->lockdep_map); #endif /* * Couple the lock chain with the lock chain at * timer_delete_sync() by acquiring the lock_map around the fn() * call here and in timer_delete_sync(). */ lock_map_acquire(&lockdep_map); trace_timer_expire_entry(timer, baseclk); fn(timer); trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); if (count != preempt_count()) { WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n", fn, count, preempt_count()); /* * Restore the preempt count. That gives us a decent * chance to survive and extract information. If the * callback kept a lock held, bad luck, but not worse * than the BUG() we had. */ preempt_count_set(count); } } static void expire_timers(struct timer_base *base, struct hlist_head *head) { /* * This value is required only for tracing. base->clk was * incremented directly before expire_timers was called. But expiry * is related to the old base->clk value. */ unsigned long baseclk = base->clk - 1; while (!hlist_empty(head)) { struct timer_list *timer; void (*fn)(struct timer_list *); timer = hlist_entry(head->first, struct timer_list, entry); base->running_timer = timer; detach_timer(timer, true); fn = timer->function; if (WARN_ON_ONCE(!fn)) { /* Should never happen. Emphasis on should! */ base->running_timer = NULL; continue; } if (timer->flags & TIMER_IRQSAFE) { raw_spin_unlock(&base->lock); call_timer_fn(timer, fn, baseclk); raw_spin_lock(&base->lock); base->running_timer = NULL; } else { raw_spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, baseclk); raw_spin_lock_irq(&base->lock); base->running_timer = NULL; timer_sync_wait_running(base); } } } static int collect_expired_timers(struct timer_base *base, struct hlist_head *heads) { unsigned long clk = base->clk = base->next_expiry; struct hlist_head *vec; int i, levels = 0; unsigned int idx; for (i = 0; i < LVL_DEPTH; i++) { idx = (clk & LVL_MASK) + i * LVL_SIZE; if (__test_and_clear_bit(idx, base->pending_map)) { vec = base->vectors + idx; hlist_move_list(vec, heads++); levels++; } /* Is it time to look at the next level? */ if (clk & LVL_CLK_MASK) break; /* Shift clock for the next level granularity */ clk >>= LVL_CLK_SHIFT; } return levels; } /* * Find the next pending bucket of a level. Search from level start (@offset) * + @clk upwards and if nothing there, search from start of the level * (@offset) up to @offset + clk. */ static int next_pending_bucket(struct timer_base *base, unsigned offset, unsigned clk) { unsigned pos, start = offset + clk; unsigned end = offset + LVL_SIZE; pos = find_next_bit(base->pending_map, end, start); if (pos < end) return pos - start; pos = find_next_bit(base->pending_map, start, offset); return pos < start ? pos + LVL_SIZE - start : -1; } /* * Search the first expiring timer in the various clock levels. Caller must * hold base->lock. * * Store next expiry time in base->next_expiry. */ static void timer_recalc_next_expiry(struct timer_base *base) { unsigned long clk, next, adj; unsigned lvl, offset = 0; next = base->clk + NEXT_TIMER_MAX_DELTA; clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { int pos = next_pending_bucket(base, offset, clk & LVL_MASK); unsigned long lvl_clk = clk & LVL_CLK_MASK; if (pos >= 0) { unsigned long tmp = clk + (unsigned long) pos; tmp <<= LVL_SHIFT(lvl); if (time_before(tmp, next)) next = tmp; /* * If the next expiration happens before we reach * the next level, no need to check further. */ if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK)) break; } /* * Clock for the next level. If the current level clock lower * bits are zero, we look at the next level as is. If not we * need to advance it by one because that's going to be the * next expiring bucket in that level. base->clk is the next * expiring jiffy. So in case of: * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 0 0 * * we have to look at all levels @index 0. With * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 0 2 * * LVL0 has the next expiring bucket @index 2. The upper * levels have the next expiring bucket @index 1. * * In case that the propagation wraps the next level the same * rules apply: * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 F 2 * * So after looking at LVL0 we get: * * LVL5 LVL4 LVL3 LVL2 LVL1 * 0 0 0 1 0 * * So no propagation from LVL1 to LVL2 because that happened * with the add already, but then we need to propagate further * from LVL2 to LVL3. * * So the simple check whether the lower bits of the current * level are 0 or not is sufficient for all cases. */ adj = lvl_clk ? 1 : 0; clk >>= LVL_CLK_SHIFT; clk += adj; } WRITE_ONCE(base->next_expiry, next); base->next_expiry_recalc = false; base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); } #ifdef CONFIG_NO_HZ_COMMON /* * Check, if the next hrtimer event is before the next timer wheel * event: */ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) { u64 nextevt = hrtimer_get_next_event(); /* * If high resolution timers are enabled * hrtimer_get_next_event() returns KTIME_MAX. */ if (expires <= nextevt) return expires; /* * If the next timer is already expired, return the tick base * time so the tick is fired immediately. */ if (nextevt <= basem) return basem; /* * Round up to the next jiffy. High resolution timers are * off, so the hrtimers are expired in the tick and we need to * make sure that this tick really expires the timer to avoid * a ping pong of the nohz stop code. * * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3 */ return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC; } static unsigned long next_timer_interrupt(struct timer_base *base, unsigned long basej) { if (base->next_expiry_recalc) timer_recalc_next_expiry(base); /* * Move next_expiry for the empty base into the future to prevent an * unnecessary raise of the timer softirq when the next_expiry value * will be reached even if there is no timer pending. * * This update is also required to make timer_base::next_expiry values * easy comparable to find out which base holds the first pending timer. */ if (!base->timers_pending) WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA); return base->next_expiry; } static unsigned long fetch_next_timer_interrupt(unsigned long basej, u64 basem, struct timer_base *base_local, struct timer_base *base_global, struct timer_events *tevt) { unsigned long nextevt, nextevt_local, nextevt_global; bool local_first; nextevt_local = next_timer_interrupt(base_local, basej); nextevt_global = next_timer_interrupt(base_global, basej); local_first = time_before_eq(nextevt_local, nextevt_global); nextevt = local_first ? nextevt_local : nextevt_global; /* * If the @nextevt is at max. one tick away, use @nextevt and store * it in the local expiry value. The next global event is irrelevant in * this case and can be left as KTIME_MAX. */ if (time_before_eq(nextevt, basej + 1)) { /* If we missed a tick already, force 0 delta */ if (time_before(nextevt, basej)) nextevt = basej; tevt->local = basem + (u64)(nextevt - basej) * TICK_NSEC; /* * This is required for the remote check only but it doesn't * hurt, when it is done for both call sites: * * * The remote callers will only take care of the global timers * as local timers will be handled by CPU itself. When not * updating tevt->global with the already missed first global * timer, it is possible that it will be missed completely. * * * The local callers will ignore the tevt->global anyway, when * nextevt is max. one tick away. */ if (!local_first) tevt->global = tevt->local; return nextevt; } /* * Update tevt.* values: * * If the local queue expires first, then the global event can be * ignored. If the global queue is empty, nothing to do either. */ if (!local_first && base_global->timers_pending) tevt->global = basem + (u64)(nextevt_global - basej) * TICK_NSEC; if (base_local->timers_pending) tevt->local = basem + (u64)(nextevt_local - basej) * TICK_NSEC; return nextevt; } # ifdef CONFIG_SMP /** * fetch_next_timer_interrupt_remote() - Store next timers into @tevt * @basej: base time jiffies * @basem: base time clock monotonic * @tevt: Pointer to the storage for the expiry values * @cpu: Remote CPU * * Stores the next pending local and global timer expiry values in the * struct pointed to by @tevt. If a queue is empty the corresponding * field is set to KTIME_MAX. If local event expires before global * event, global event is set to KTIME_MAX as well. * * Caller needs to make sure timer base locks are held (use * timer_lock_remote_bases() for this purpose). */ void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem, struct timer_events *tevt, unsigned int cpu) { struct timer_base *base_local, *base_global; /* Preset local / global events */ tevt->local = tevt->global = KTIME_MAX; base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); lockdep_assert_held(&base_local->lock); lockdep_assert_held(&base_global->lock); fetch_next_timer_interrupt(basej, basem, base_local, base_global, tevt); } /** * timer_unlock_remote_bases - unlock timer bases of cpu * @cpu: Remote CPU * * Unlocks the remote timer bases. */ void timer_unlock_remote_bases(unsigned int cpu) __releases(timer_bases[BASE_LOCAL]->lock) __releases(timer_bases[BASE_GLOBAL]->lock) { struct timer_base *base_local, *base_global; base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); raw_spin_unlock(&base_global->lock); raw_spin_unlock(&base_local->lock); } /** * timer_lock_remote_bases - lock timer bases of cpu * @cpu: Remote CPU * * Locks the remote timer bases. */ void timer_lock_remote_bases(unsigned int cpu) __acquires(timer_bases[BASE_LOCAL]->lock) __acquires(timer_bases[BASE_GLOBAL]->lock) { struct timer_base *base_local, *base_global; base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); lockdep_assert_irqs_disabled(); raw_spin_lock(&base_local->lock); raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING); } /** * timer_base_is_idle() - Return whether timer base is set idle * * Returns value of local timer base is_idle value. */ bool timer_base_is_idle(void) { return __this_cpu_read(timer_bases[BASE_LOCAL].is_idle); } static void __run_timer_base(struct timer_base *base); /** * timer_expire_remote() - expire global timers of cpu * @cpu: Remote CPU * * Expire timers of global base of remote CPU. */ void timer_expire_remote(unsigned int cpu) { struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); __run_timer_base(base); } static void timer_use_tmigr(unsigned long basej, u64 basem, unsigned long *nextevt, bool *tick_stop_path, bool timer_base_idle, struct timer_events *tevt) { u64 next_tmigr; if (timer_base_idle) next_tmigr = tmigr_cpu_new_timer(tevt->global); else if (tick_stop_path) next_tmigr = tmigr_cpu_deactivate(tevt->global); else next_tmigr = tmigr_quick_check(tevt->global); /* * If the CPU is the last going idle in timer migration hierarchy, make * sure the CPU will wake up in time to handle remote timers. * next_tmigr == KTIME_MAX if other CPUs are still active. */ if (next_tmigr < tevt->local) { u64 tmp; /* If we missed a tick already, force 0 delta */ if (next_tmigr < basem) next_tmigr = basem; tmp = div_u64(next_tmigr - basem, TICK_NSEC); *nextevt = basej + (unsigned long)tmp; tevt->local = next_tmigr; } } # else static void timer_use_tmigr(unsigned long basej, u64 basem, unsigned long *nextevt, bool *tick_stop_path, bool timer_base_idle, struct timer_events *tevt) { /* * Make sure first event is written into tevt->local to not miss a * timer on !SMP systems. */ tevt->local = min_t(u64, tevt->local, tevt->global); } # endif /* CONFIG_SMP */ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem, bool *idle) { struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX }; struct timer_base *base_local, *base_global; unsigned long nextevt; bool idle_is_possible; /* * When the CPU is offline, the tick is cancelled and nothing is supposed * to try to stop it. */ if (WARN_ON_ONCE(cpu_is_offline(smp_processor_id()))) { if (idle) *idle = true; return tevt.local; } base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]); base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]); raw_spin_lock(&base_local->lock); raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING); nextevt = fetch_next_timer_interrupt(basej, basem, base_local, base_global, &tevt); /* * If the next event is only one jiffy ahead there is no need to call * timer migration hierarchy related functions. The value for the next * global timer in @tevt struct equals then KTIME_MAX. This is also * true, when the timer base is idle. * * The proper timer migration hierarchy function depends on the callsite * and whether timer base is idle or not. @nextevt will be updated when * this CPU needs to handle the first timer migration hierarchy * event. See timer_use_tmigr() for detailed information. */ idle_is_possible = time_after(nextevt, basej + 1); if (idle_is_possible) timer_use_tmigr(basej, basem, &nextevt, idle, base_local->is_idle, &tevt); /* * We have a fresh next event. Check whether we can forward the * base. */ __forward_timer_base(base_local, basej); __forward_timer_base(base_global, basej); /* * Set base->is_idle only when caller is timer_base_try_to_set_idle() */ if (idle) { /* * Bases are idle if the next event is more than a tick * away. Caution: @nextevt could have changed by enqueueing a * global timer into timer migration hierarchy. Therefore a new * check is required here. * * If the base is marked idle then any timer add operation must * forward the base clk itself to keep granularity small. This * idle logic is only maintained for the BASE_LOCAL and * BASE_GLOBAL base, deferrable timers may still see large * granularity skew (by design). */ if (!base_local->is_idle && time_after(nextevt, basej + 1)) { base_local->is_idle = true; /* * Global timers queued locally while running in a task * in nohz_full mode need a self-IPI to kick reprogramming * in IRQ tail. */ if (tick_nohz_full_cpu(base_local->cpu)) base_global->is_idle = true; trace_timer_base_idle(true, base_local->cpu); } *idle = base_local->is_idle; /* * When timer base is not set idle, undo the effect of * tmigr_cpu_deactivate() to prevent inconsistent states - active * timer base but inactive timer migration hierarchy. * * When timer base was already marked idle, nothing will be * changed here. */ if (!base_local->is_idle && idle_is_possible) tmigr_cpu_activate(); } raw_spin_unlock(&base_global->lock); raw_spin_unlock(&base_local->lock); return cmp_next_hrtimer_event(basem, tevt.local); } /** * get_next_timer_interrupt() - return the time (clock mono) of the next timer * @basej: base time jiffies * @basem: base time clock monotonic * * Returns the tick aligned clock monotonic time of the next pending timer or * KTIME_MAX if no timer is pending. If timer of global base was queued into * timer migration hierarchy, first global timer is not taken into account. If * it was the last CPU of timer migration hierarchy going idle, first global * event is taken into account. */ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { return __get_next_timer_interrupt(basej, basem, NULL); } /** * timer_base_try_to_set_idle() - Try to set the idle state of the timer bases * @basej: base time jiffies * @basem: base time clock monotonic * @idle: pointer to store the value of timer_base->is_idle on return; * *idle contains the information whether tick was already stopped * * Returns the tick aligned clock monotonic time of the next pending timer or * KTIME_MAX if no timer is pending. When tick was already stopped KTIME_MAX is * returned as well. */ u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle) { if (*idle) return KTIME_MAX; return __get_next_timer_interrupt(basej, basem, idle); } /** * timer_clear_idle - Clear the idle state of the timer base * * Called with interrupts disabled */ void timer_clear_idle(void) { /* * We do this unlocked. The worst outcome is a remote pinned timer * enqueue sending a pointless IPI, but taking the lock would just * make the window for sending the IPI a few instructions smaller * for the cost of taking the lock in the exit from idle * path. Required for BASE_LOCAL only. */ __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false); if (tick_nohz_full_cpu(smp_processor_id())) __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false); trace_timer_base_idle(false, smp_processor_id()); /* Activate without holding the timer_base->lock */ tmigr_cpu_activate(); } #endif /** * __run_timers - run all expired timers (if any) on this CPU. * @base: the timer vector to be processed. */ static inline void __run_timers(struct timer_base *base) { struct hlist_head heads[LVL_DEPTH]; int levels; lockdep_assert_held(&base->lock); if (base->running_timer) return; while (time_after_eq(jiffies, base->clk) && time_after_eq(jiffies, base->next_expiry)) { levels = collect_expired_timers(base, heads); /* * The two possible reasons for not finding any expired * timer at this clk are that all matching timers have been * dequeued or no timer has been queued since * base::next_expiry was set to base::clk + * NEXT_TIMER_MAX_DELTA. */ WARN_ON_ONCE(!levels && !base->next_expiry_recalc && base->timers_pending); /* * While executing timers, base->clk is set 1 offset ahead of * jiffies to avoid endless requeuing to current jiffies. */ base->clk++; timer_recalc_next_expiry(base); while (levels--) expire_timers(base, heads + levels); } } static void __run_timer_base(struct timer_base *base) { /* Can race against a remote CPU updating next_expiry under the lock */ if (time_before(jiffies, READ_ONCE(base->next_expiry))) return; timer_base_lock_expiry(base); raw_spin_lock_irq(&base->lock); __run_timers(base); raw_spin_unlock_irq(&base->lock); timer_base_unlock_expiry(base); } static void run_timer_base(int index) { struct timer_base *base = this_cpu_ptr(&timer_bases[index]); __run_timer_base(base); } /* * This function runs timers and the timer-tq in bottom half context. */ static __latent_entropy void run_timer_softirq(void) { run_timer_base(BASE_LOCAL); if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) { run_timer_base(BASE_GLOBAL); run_timer_base(BASE_DEF); if (is_timers_nohz_active()) tmigr_handle_remote(); } } /* * Called by the local, per-CPU timer interrupt on SMP. */ static void run_local_timers(void) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]); hrtimer_run_queues(); for (int i = 0; i < NR_BASES; i++, base++) { /* * Raise the softirq only if required. * * timer_base::next_expiry can be written by a remote CPU while * holding the lock. If this write happens at the same time than * the lockless local read, sanity checker could complain about * data corruption. * * There are two possible situations where * timer_base::next_expiry is written by a remote CPU: * * 1. Remote CPU expires global timers of this CPU and updates * timer_base::next_expiry of BASE_GLOBAL afterwards in * next_timer_interrupt() or timer_recalc_next_expiry(). The * worst outcome is a superfluous raise of the timer softirq * when the not yet updated value is read. * * 2. A new first pinned timer is enqueued by a remote CPU * and therefore timer_base::next_expiry of BASE_LOCAL is * updated. When this update is missed, this isn't a * problem, as an IPI is executed nevertheless when the CPU * was idle before. When the CPU wasn't idle but the update * is missed, then the timer would expire one jiffy late - * bad luck. * * Those unlikely corner cases where the worst outcome is only a * one jiffy delay or a superfluous raise of the softirq are * not that expensive as doing the check always while holding * the lock. * * Possible remote writers are using WRITE_ONCE(). Local reader * uses therefore READ_ONCE(). */ if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) || (i == BASE_DEF && tmigr_requires_handle_remote())) { raise_timer_softirq(TIMER_SOFTIRQ); return; } } } /* * Called from the timer interrupt handler to charge one tick to the current * process. user_tick is 1 if the tick is user time, 0 for system. */ void update_process_times(int user_tick) { struct task_struct *p = current; /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); rcu_sched_clock_irq(user_tick); #ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_tick(); #endif sched_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) run_posix_cpu_timers(); } #ifdef CONFIG_HOTPLUG_CPU static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) { struct timer_list *timer; int cpu = new_base->cpu; while (!hlist_empty(head)) { timer = hlist_entry(head->first, struct timer_list, entry); detach_timer(timer, false); timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); } } int timers_prepare_cpu(unsigned int cpu) { struct timer_base *base; int b; for (b = 0; b < NR_BASES; b++) { base = per_cpu_ptr(&timer_bases[b], cpu); base->clk = jiffies; base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; base->next_expiry_recalc = false; base->timers_pending = false; base->is_idle = false; } return 0; } int timers_dead_cpu(unsigned int cpu) { struct timer_base *old_base; struct timer_base *new_base; int b, i; for (b = 0; b < NR_BASES; b++) { old_base = per_cpu_ptr(&timer_bases[b], cpu); new_base = get_cpu_ptr(&timer_bases[b]); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ raw_spin_lock_irq(&new_base->lock); raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); /* * The current CPUs base clock might be stale. Update it * before moving the timers over. */ forward_timer_base(new_base); WARN_ON_ONCE(old_base->running_timer); old_base->running_timer = NULL; for (i = 0; i < WHEEL_SIZE; i++) migrate_timer_list(new_base, old_base->vectors + i); raw_spin_unlock(&old_base->lock); raw_spin_unlock_irq(&new_base->lock); put_cpu_ptr(&timer_bases); } return 0; } #endif /* CONFIG_HOTPLUG_CPU */ static void __init init_timer_cpu(int cpu) { struct timer_base *base; int i; for (i = 0; i < NR_BASES; i++) { base = per_cpu_ptr(&timer_bases[i], cpu); base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; timer_base_init_expiry_lock(base); } } static void __init init_timer_cpus(void) { int cpu; for_each_possible_cpu(cpu) init_timer_cpu(cpu); } void __init init_timers(void) { init_timer_cpus(); posix_cputimers_init_work(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_TC_MIR_H #define __NET_TC_MIR_H #include <net/act_api.h> #include <linux/tc_act/tc_mirred.h> struct tcf_mirred { struct tc_action common; int tcfm_eaction; u32 tcfm_blockid; bool tcfm_mac_header_xmit; struct net_device __rcu *tcfm_dev; netdevice_tracker tcfm_dev_tracker; struct list_head tcfm_list; }; #define to_mirred(a) ((struct tcf_mirred *)a) static inline bool is_tcf_mirred_egress_redirect(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT if (a->ops && a->ops->id == TCA_ID_MIRRED) return to_mirred(a)->tcfm_eaction == TCA_EGRESS_REDIR; #endif return false; } static inline bool is_tcf_mirred_egress_mirror(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT if (a->ops && a->ops->id == TCA_ID_MIRRED) return to_mirred(a)->tcfm_eaction == TCA_EGRESS_MIRROR; #endif return false; } static inline bool is_tcf_mirred_ingress_redirect(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT if (a->ops && a->ops->id == TCA_ID_MIRRED) return to_mirred(a)->tcfm_eaction == TCA_INGRESS_REDIR; #endif return false; } static inline bool is_tcf_mirred_ingress_mirror(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT if (a->ops && a->ops->id == TCA_ID_MIRRED) return to_mirred(a)->tcfm_eaction == TCA_INGRESS_MIRROR; #endif return false; } static inline struct net_device *tcf_mirred_dev(const struct tc_action *a) { return rtnl_dereference(to_mirred(a)->tcfm_dev); } #endif /* __NET_TC_MIR_H */
2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 // SPDX-License-Identifier: GPL-2.0-only /* Kernel module to match ESP parameters. */ /* (C) 1999-2000 Yon Uriarte <yon@astaro.de> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/netfilter/xt_esp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>"); MODULE_DESCRIPTION("Xtables: IPsec-ESP packet match"); MODULE_ALIAS("ipt_esp"); MODULE_ALIAS("ip6t_esp"); /* Returns 1 if the spi is matched by the range, 0 otherwise */ static inline bool spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) { bool r; pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ', min, spi, max); r = (spi >= min && spi <= max) ^ invert; pr_debug(" result %s\n", r ? "PASS" : "FAILED"); return r; } static bool esp_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct ip_esp_hdr *eh; struct ip_esp_hdr _esp; const struct xt_esp *espinfo = par->matchinfo; /* Must not be a fragment. */ if (par->fragoff != 0) return false; eh = skb_header_pointer(skb, par->thoff, sizeof(_esp), &_esp); if (eh == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ pr_debug("Dropping evil ESP tinygram.\n"); par->hotdrop = true; return false; } return spi_match(espinfo->spis[0], espinfo->spis[1], ntohl(eh->spi), !!(espinfo->invflags & XT_ESP_INV_SPI)); } static int esp_mt_check(const struct xt_mtchk_param *par) { const struct xt_esp *espinfo = par->matchinfo; if (espinfo->invflags & ~XT_ESP_INV_MASK) { pr_debug("unknown flags %X\n", espinfo->invflags); return -EINVAL; } return 0; } static struct xt_match esp_mt_reg[] __read_mostly = { { .name = "esp", .family = NFPROTO_IPV4, .checkentry = esp_mt_check, .match = esp_mt, .matchsize = sizeof(struct xt_esp), .proto = IPPROTO_ESP, .me = THIS_MODULE, }, { .name = "esp", .family = NFPROTO_IPV6, .checkentry = esp_mt_check, .match = esp_mt, .matchsize = sizeof(struct xt_esp), .proto = IPPROTO_ESP, .me = THIS_MODULE, }, }; static int __init esp_mt_init(void) { return xt_register_matches(esp_mt_reg, ARRAY_SIZE(esp_mt_reg)); } static void __exit esp_mt_exit(void) { xt_unregister_matches(esp_mt_reg, ARRAY_SIZE(esp_mt_reg)); } module_init(esp_mt_init); module_exit(esp_mt_exit);
2 2 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 // SPDX-License-Identifier: GPL-2.0-only /* * IPV4 GSO/GRO offload support * Linux INET implementation * * Copyright (C) 2016 secunet Security Networks AG * Author: Steffen Klassert <steffen.klassert@secunet.com> * * ESP GRO support */ #include <linux/skbuff.h> #include <linux/init.h> #include <net/protocol.h> #include <crypto/aead.h> #include <crypto/authenc.h> #include <linux/err.h> #include <linux/module.h> #include <net/gro.h> #include <net/gso.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/esp.h> #include <linux/scatterlist.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/udp.h> static struct sk_buff *esp4_gro_receive(struct list_head *head, struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; struct xfrm_state *x; int encap_type = 0; __be32 seq; __be32 spi; if (!pskb_pull(skb, offset)) return NULL; if (xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq) != 0) goto out; xo = xfrm_offload(skb); if (!xo || !(xo->flags & CRYPTO_DONE)) { struct sec_path *sp = secpath_set(skb); if (!sp) goto out; if (sp->len == XFRM_MAX_DEPTH) goto out_reset; x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark, (xfrm_address_t *)&ip_hdr(skb)->daddr, spi, IPPROTO_ESP, AF_INET); if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) { /* non-offload path will record the error and audit log */ xfrm_state_put(x); x = NULL; } if (!x) goto out_reset; skb->mark = xfrm_smark_get(skb->mark, x); sp->xvec[sp->len++] = x; sp->olen++; xo = xfrm_offload(skb); if (!xo) goto out_reset; } xo->flags |= XFRM_GRO; if (NAPI_GRO_CB(skb)->proto == IPPROTO_UDP) encap_type = UDP_ENCAP_ESPINUDP; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); XFRM_SPI_SKB_CB(skb)->seq = seq; /* We don't need to handle errors from xfrm_input, it does all * the error handling and frees the resources on error. */ xfrm_input(skb, IPPROTO_ESP, spi, encap_type); return ERR_PTR(-EINPROGRESS); out_reset: secpath_reset(skb); out: skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 1; return NULL; } static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb) { struct ip_esp_hdr *esph; struct iphdr *iph = ip_hdr(skb); struct xfrm_offload *xo = xfrm_offload(skb); int proto = iph->protocol; skb_push(skb, -skb_network_offset(skb)); esph = ip_esp_hdr(skb); *skb_mac_header(skb) = IPPROTO_ESP; esph->spi = x->id.spi; esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); xo->proto = proto; } static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { __be16 type = x->inner_mode.family == AF_INET6 ? htons(ETH_P_IPV6) : htons(ETH_P_IP); return skb_eth_gso_segment(skb, features, type); } static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { const struct net_offload *ops; struct sk_buff *segs = ERR_PTR(-EINVAL); struct xfrm_offload *xo = xfrm_offload(skb); skb->transport_header += x->props.header_len; ops = rcu_dereference(inet_offloads[xo->proto]); if (likely(ops && ops->callbacks.gso_segment)) segs = ops->callbacks.gso_segment(skb, features); return segs; } static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { struct xfrm_offload *xo = xfrm_offload(skb); struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; u8 proto = xo->proto; skb->transport_header += x->props.header_len; if (x->sel.family != AF_INET6) { if (proto == IPPROTO_BEETPH) { struct ip_beet_phdr *ph = (struct ip_beet_phdr *)skb->data; skb->transport_header += ph->hdrlen * 8; proto = ph->nexthdr; } else { skb->transport_header -= IPV4_BEET_PHMAXLEN; } } else { __be16 frag; skb->transport_header += ipv6_skip_exthdr(skb, 0, &proto, &frag); if (proto == IPPROTO_TCP) skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; } if (proto == IPPROTO_IPV6) skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; __skb_pull(skb, skb_transport_offset(skb)); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) segs = ops->callbacks.gso_segment(skb, features); return segs; } static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { switch (x->outer_mode.encap) { case XFRM_MODE_TUNNEL: return xfrm4_tunnel_gso_segment(x, skb, features); case XFRM_MODE_TRANSPORT: return xfrm4_transport_gso_segment(x, skb, features); case XFRM_MODE_BEET: return xfrm4_beet_gso_segment(x, skb, features); } return ERR_PTR(-EOPNOTSUPP); } static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct xfrm_state *x; struct ip_esp_hdr *esph; struct crypto_aead *aead; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); struct sec_path *sp; if (!xo) return ERR_PTR(-EINVAL); if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) return ERR_PTR(-EINVAL); sp = skb_sec_path(skb); x = sp->xvec[sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); if (esph->spi != x->id.spi) return ERR_PTR(-EINVAL); if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) return ERR_PTR(-EINVAL); __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)); skb->encap_hdr_csum = 1; if ((!(skb->dev->gso_partial_features & NETIF_F_HW_ESP) && !(features & NETIF_F_HW_ESP)) || x->xso.dev != skb->dev) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC); else if (!(features & NETIF_F_HW_ESP_TX_CSUM) && !(skb->dev->gso_partial_features & NETIF_F_HW_ESP_TX_CSUM)) esp_features = features & ~(NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC); xo->flags |= XFRM_GSO_SEGMENT; return xfrm4_outer_mode_gso_segment(x, skb, esp_features); } static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb) { struct crypto_aead *aead = x->data; struct xfrm_offload *xo = xfrm_offload(skb); if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead))) return -EINVAL; if (!(xo->flags & CRYPTO_DONE)) skb->ip_summed = CHECKSUM_NONE; return esp_input_done2(skb, 0); } static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { int err; int alen; int blksize; struct xfrm_offload *xo; struct ip_esp_hdr *esph; struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; __u32 seq; int encap_type = 0; esp.inplace = true; xo = xfrm_offload(skb); if (!xo) return -EINVAL; if ((!(features & NETIF_F_HW_ESP) && !(skb->dev->gso_partial_features & NETIF_F_HW_ESP)) || x->xso.dev != skb->dev) { xo->flags |= CRYPTO_FALLBACK; hw_offload = false; } esp.proto = xo->proto; /* skb is pure payload to encrypt */ aead = x->data; alen = crypto_aead_authsize(aead); esp.tfclen = 0; /* XXX: Add support for tfc padding here. */ blksize = ALIGN(crypto_aead_blocksize(aead), 4); esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize); esp.plen = esp.clen - skb->len - esp.tfclen; esp.tailen = esp.tfclen + esp.plen + alen; esp.esph = ip_esp_hdr(skb); if (x->encap) encap_type = x->encap->encap_type; if (!hw_offload || !skb_is_gso(skb) || (hw_offload && encap_type == UDP_ENCAP_ESPINUDP)) { esp.nfrags = esp_output_head(x, skb, &esp); if (esp.nfrags < 0) return esp.nfrags; } seq = xo->seq.low; esph = esp.esph; esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { esph->seq_no = htonl(seq); if (!skb_is_gso(skb)) xo->seq.low++; else xo->seq.low += skb_shinfo(skb)->gso_segs; } if (xo->seq.low < seq) xo->seq.hi++; esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32)); if (hw_offload && encap_type == UDP_ENCAP_ESPINUDP) { /* In the XFRM stack, the encapsulation protocol is set to iphdr->protocol by * setting *skb_mac_header(skb) (see esp_output_udp_encap()) where skb->mac_header * points to iphdr->protocol (see xfrm4_tunnel_encap_add()). * However, in esp_xmit(), skb->mac_header doesn't point to iphdr->protocol. * Therefore, the protocol field needs to be corrected. */ ip_hdr(skb)->protocol = IPPROTO_UDP; esph->seq_no = htonl(seq); } ip_hdr(skb)->tot_len = htons(skb->len); ip_send_check(ip_hdr(skb)); if (hw_offload) { if (!skb_ext_add(skb, SKB_EXT_SEC_PATH)) return -ENOMEM; xo = xfrm_offload(skb); if (!xo) return -EINVAL; xo->flags |= XFRM_XMIT; return 0; } err = esp_output_tail(x, skb, &esp); if (err) return err; secpath_reset(skb); if (skb_needs_linearize(skb, skb->dev->features) && __skb_linearize(skb)) return -ENOMEM; return 0; } static const struct net_offload esp4_offload = { .callbacks = { .gro_receive = esp4_gro_receive, .gso_segment = esp4_gso_segment, }, }; static const struct xfrm_type_offload esp_type_offload = { .owner = THIS_MODULE, .proto = IPPROTO_ESP, .input_tail = esp_input_tail, .xmit = esp_xmit, .encap = esp4_gso_encap, }; static int __init esp4_offload_init(void) { if (xfrm_register_type_offload(&esp_type_offload, AF_INET) < 0) { pr_info("%s: can't add xfrm type offload\n", __func__); return -EAGAIN; } return inet_add_offload(&esp4_offload, IPPROTO_ESP); } static void __exit esp4_offload_exit(void) { xfrm_unregister_type_offload(&esp_type_offload, AF_INET); inet_del_offload(&esp4_offload, IPPROTO_ESP); } module_init(esp4_offload_init); module_exit(esp4_offload_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET, XFRM_PROTO_ESP); MODULE_DESCRIPTION("IPV4 GSO/GRO offload support");
965 966 966 137 136 137 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> * Copyright (C) 2002 Andi Kleen * * This handles calls from both 32bit and 64bit mode. * * Lock order: * context.ldt_usr_sem * mmap_lock * context.lock */ #include <linux/errno.h> #include <linux/gfp.h> #include <linux/sched.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> #include <asm/ldt.h> #include <asm/tlb.h> #include <asm/desc.h> #include <asm/mmu_context.h> #include <asm/pgtable_areas.h> #include <xen/xen.h> /* This is a multiple of PAGE_SIZE. */ #define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE) static inline void *ldt_slot_va(int slot) { return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); } void load_mm_ldt(struct mm_struct *mm) { struct ldt_struct *ldt; /* READ_ONCE synchronizes with smp_store_release */ ldt = READ_ONCE(mm->context.ldt); /* * Any change to mm->context.ldt is followed by an IPI to all * CPUs with the mm active. The LDT will not be freed until * after the IPI is handled by all such CPUs. This means that * if the ldt_struct changes before we return, the values we see * will be safe, and the new values will be loaded before we run * any user code. * * NB: don't try to convert this to use RCU without extreme care. * We would still need IRQs off, because we don't want to change * the local LDT after an IPI loaded a newer value than the one * that we can see. */ if (unlikely(ldt)) { if (static_cpu_has(X86_FEATURE_PTI)) { if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) { /* * Whoops -- either the new LDT isn't mapped * (if slot == -1) or is mapped into a bogus * slot (if slot > 1). */ clear_LDT(); return; } /* * If page table isolation is enabled, ldt->entries * will not be mapped in the userspace pagetables. * Tell the CPU to access the LDT through the alias * at ldt_slot_va(ldt->slot). */ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries); } else { set_ldt(ldt->entries, ldt->nr_entries); } } else { clear_LDT(); } } void switch_ldt(struct mm_struct *prev, struct mm_struct *next) { /* * Load the LDT if either the old or new mm had an LDT. * * An mm will never go from having an LDT to not having an LDT. Two * mms never share an LDT, so we don't gain anything by checking to * see whether the LDT changed. There's also no guarantee that * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL, * then prev->context.ldt will also be non-NULL. * * If we really cared, we could optimize the case where prev == next * and we're exiting lazy mode. Most of the time, if this happens, * we don't actually need to reload LDTR, but modify_ldt() is mostly * used by legacy code and emulators where we don't need this level of * performance. * * This uses | instead of || because it generates better code. */ if (unlikely((unsigned long)prev->context.ldt | (unsigned long)next->context.ldt)) load_mm_ldt(next); DEBUG_LOCKS_WARN_ON(preemptible()); } static void refresh_ldt_segments(void) { #ifdef CONFIG_X86_64 unsigned short sel; /* * Make sure that the cached DS and ES descriptors match the updated * LDT. */ savesegment(ds, sel); if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) loadsegment(ds, sel); savesegment(es, sel); if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) loadsegment(es, sel); #endif } /* context.lock is held by the task which issued the smp function call */ static void flush_ldt(void *__mm) { struct mm_struct *mm = __mm; if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) return; load_mm_ldt(mm); refresh_ldt_segments(); } /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) { struct ldt_struct *new_ldt; unsigned int alloc_size; if (num_entries > LDT_ENTRIES) return NULL; new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT); if (!new_ldt) return NULL; BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct)); alloc_size = num_entries * LDT_ENTRY_SIZE; /* * Xen is very picky: it requires a page-aligned LDT that has no * trailing nonzero bytes in any page that contains LDT descriptors. * Keep it simple: zero the whole allocation and never allocate less * than PAGE_SIZE. */ if (alloc_size > PAGE_SIZE) new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); else new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!new_ldt->entries) { kfree(new_ldt); return NULL; } /* The new LDT isn't aliased for PTI yet. */ new_ldt->slot = -1; new_ldt->nr_entries = num_entries; return new_ldt; } #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION static void do_sanity_check(struct mm_struct *mm, bool had_kernel_mapping, bool had_user_mapping) { if (mm->context.ldt) { /* * We already had an LDT. The top-level entry should already * have been allocated and synchronized with the usermode * tables. */ WARN_ON(!had_kernel_mapping); if (boot_cpu_has(X86_FEATURE_PTI)) WARN_ON(!had_user_mapping); } else { /* * This is the first time we're mapping an LDT for this process. * Sync the pgd to the usermode tables. */ WARN_ON(had_kernel_mapping); if (boot_cpu_has(X86_FEATURE_PTI)) WARN_ON(had_user_mapping); } } #ifdef CONFIG_X86_PAE static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va) { p4d_t *p4d; pud_t *pud; if (pgd->pgd == 0) return NULL; p4d = p4d_offset(pgd, va); if (p4d_none(*p4d)) return NULL; pud = pud_offset(p4d, va); if (pud_none(*pud)) return NULL; return pmd_offset(pud, va); } static void map_ldt_struct_to_user(struct mm_struct *mm) { pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR); pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); pmd_t *k_pmd, *u_pmd; k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) set_pmd(u_pmd, *k_pmd); } static void sanity_check_ldt_mapping(struct mm_struct *mm) { pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR); pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); bool had_kernel, had_user; pmd_t *k_pmd, *u_pmd; k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); had_kernel = (k_pmd->pmd != 0); had_user = (u_pmd->pmd != 0); do_sanity_check(mm, had_kernel, had_user); } #else /* !CONFIG_X86_PAE */ static void map_ldt_struct_to_user(struct mm_struct *mm) { pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) set_pgd(kernel_to_user_pgdp(pgd), *pgd); } static void sanity_check_ldt_mapping(struct mm_struct *mm) { pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); bool had_kernel = (pgd->pgd != 0); bool had_user = (kernel_to_user_pgdp(pgd)->pgd != 0); do_sanity_check(mm, had_kernel, had_user); } #endif /* CONFIG_X86_PAE */ /* * If PTI is enabled, this maps the LDT into the kernelmode and * usermode tables for the given mm. */ static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) { unsigned long va; bool is_vmalloc; spinlock_t *ptl; int i, nr_pages; if (!boot_cpu_has(X86_FEATURE_PTI)) return 0; /* * Any given ldt_struct should have map_ldt_struct() called at most * once. */ WARN_ON(ldt->slot != -1); /* Check if the current mappings are sane */ sanity_check_ldt_mapping(mm); is_vmalloc = is_vmalloc_addr(ldt->entries); nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE); for (i = 0; i < nr_pages; i++) { unsigned long offset = i << PAGE_SHIFT; const void *src = (char *)ldt->entries + offset; unsigned long pfn; pgprot_t pte_prot; pte_t pte, *ptep; va = (unsigned long)ldt_slot_va(slot) + offset; pfn = is_vmalloc ? vmalloc_to_pfn(src) : page_to_pfn(virt_to_page(src)); /* * Treat the PTI LDT range as a *userspace* range. * get_locked_pte() will allocate all needed pagetables * and account for them in this mm. */ ptep = get_locked_pte(mm, va, &ptl); if (!ptep) return -ENOMEM; /* * Map it RO so the easy to find address is not a primary * target via some kernel interface which misses a * permission check. */ pte_prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL); /* Filter out unsuppored __PAGE_KERNEL* bits: */ pgprot_val(pte_prot) &= __supported_pte_mask; pte = pfn_pte(pfn, pte_prot); set_pte_at(mm, va, ptep, pte); pte_unmap_unlock(ptep, ptl); } /* Propagate LDT mapping to the user page-table */ map_ldt_struct_to_user(mm); ldt->slot = slot; return 0; } static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) { unsigned long va; int i, nr_pages; if (!ldt) return; /* LDT map/unmap is only required for PTI */ if (!boot_cpu_has(X86_FEATURE_PTI)) return; nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE); for (i = 0; i < nr_pages; i++) { unsigned long offset = i << PAGE_SHIFT; spinlock_t *ptl; pte_t *ptep; va = (unsigned long)ldt_slot_va(ldt->slot) + offset; ptep = get_locked_pte(mm, va, &ptl); if (!WARN_ON_ONCE(!ptep)) { pte_clear(mm, va, ptep); pte_unmap_unlock(ptep, ptl); } } va = (unsigned long)ldt_slot_va(ldt->slot); flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false); } #else /* !CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) { return 0; } static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) { } #endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ static void free_ldt_pgtables(struct mm_struct *mm) { #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION struct mmu_gather tlb; unsigned long start = LDT_BASE_ADDR; unsigned long end = LDT_END_ADDR; if (!boot_cpu_has(X86_FEATURE_PTI)) return; /* * Although free_pgd_range() is intended for freeing user * page-tables, it also works out for kernel mappings on x86. * We use tlb_gather_mmu_fullmm() to avoid confusing the * range-tracking logic in __tlb_adjust_range(). */ tlb_gather_mmu_fullmm(&tlb, mm); free_pgd_range(&tlb, start, end, start, end); tlb_finish_mmu(&tlb); #endif } /* After calling this, the LDT is immutable. */ static void finalize_ldt_struct(struct ldt_struct *ldt) { paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); } static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt) { mutex_lock(&mm->context.lock); /* Synchronizes with READ_ONCE in load_mm_ldt. */ smp_store_release(&mm->context.ldt, ldt); /* Activate the LDT for all CPUs using currents mm. */ on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true); mutex_unlock(&mm->context.lock); } static void free_ldt_struct(struct ldt_struct *ldt) { if (likely(!ldt)) return; paravirt_free_ldt(ldt->entries, ldt->nr_entries); if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE) vfree_atomic(ldt->entries); else free_page((unsigned long)ldt->entries); kfree(ldt); } /* * Called on fork from arch_dup_mmap(). Just copy the current LDT state, * the new task is not running, so nothing can be installed. */ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) { struct ldt_struct *new_ldt; int retval = 0; if (!old_mm) return 0; mutex_lock(&old_mm->context.lock); if (!old_mm->context.ldt) goto out_unlock; new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); if (!new_ldt) { retval = -ENOMEM; goto out_unlock; } memcpy(new_ldt->entries, old_mm->context.ldt->entries, new_ldt->nr_entries * LDT_ENTRY_SIZE); finalize_ldt_struct(new_ldt); retval = map_ldt_struct(mm, new_ldt, 0); if (retval) { free_ldt_pgtables(mm); free_ldt_struct(new_ldt); goto out_unlock; } mm->context.ldt = new_ldt; out_unlock: mutex_unlock(&old_mm->context.lock); return retval; } /* * No need to lock the MM as we are the last user * * 64bit: Don't touch the LDT register - we're already in the next thread. */ void destroy_context_ldt(struct mm_struct *mm) { free_ldt_struct(mm->context.ldt); mm->context.ldt = NULL; } void ldt_arch_exit_mmap(struct mm_struct *mm) { free_ldt_pgtables(mm); } static int read_ldt(void __user *ptr, unsigned long bytecount) { struct mm_struct *mm = current->mm; unsigned long entries_size; int retval; down_read(&mm->context.ldt_usr_sem); if (!mm->context.ldt) { retval = 0; goto out_unlock; } if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE; if (entries_size > bytecount) entries_size = bytecount; if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) { retval = -EFAULT; goto out_unlock; } if (entries_size != bytecount) { /* Zero-fill the rest and pretend we read bytecount bytes. */ if (clear_user(ptr + entries_size, bytecount - entries_size)) { retval = -EFAULT; goto out_unlock; } } retval = bytecount; out_unlock: up_read(&mm->context.ldt_usr_sem); return retval; } static int read_default_ldt(void __user *ptr, unsigned long bytecount) { /* CHECKME: Can we use _one_ random number ? */ #ifdef CONFIG_X86_32 unsigned long size = 5 * sizeof(struct desc_struct); #else unsigned long size = 128; #endif if (bytecount > size) bytecount = size; if (clear_user(ptr, bytecount)) return -EFAULT; return bytecount; } static bool allow_16bit_segments(void) { if (!IS_ENABLED(CONFIG_X86_16BIT)) return false; #ifdef CONFIG_XEN_PV /* * Xen PV does not implement ESPFIX64, which means that 16-bit * segments will not work correctly. Until either Xen PV implements * ESPFIX64 and can signal this fact to the guest or unless someone * provides compelling evidence that allowing broken 16-bit segments * is worthwhile, disallow 16-bit segments under Xen PV. */ if (xen_pv_domain()) { pr_info_once("Warning: 16-bit segments do not work correctly in a Xen PV guest\n"); return false; } #endif return true; } static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { struct mm_struct *mm = current->mm; struct ldt_struct *new_ldt, *old_ldt; unsigned int old_nr_entries, new_nr_entries; struct user_desc ldt_info; struct desc_struct ldt; int error; error = -EINVAL; if (bytecount != sizeof(ldt_info)) goto out; error = -EFAULT; if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) goto out; error = -EINVAL; if (ldt_info.entry_number >= LDT_ENTRIES) goto out; if (ldt_info.contents == 3) { if (oldmode) goto out; if (ldt_info.seg_not_present == 0) goto out; } if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) || LDT_empty(&ldt_info)) { /* The user wants to clear the entry. */ memset(&ldt, 0, sizeof(ldt)); } else { if (!ldt_info.seg_32bit && !allow_16bit_segments()) { error = -EINVAL; goto out; } fill_ldt(&ldt, &ldt_info); if (oldmode) ldt.avl = 0; } if (down_write_killable(&mm->context.ldt_usr_sem)) return -EINTR; old_ldt = mm->context.ldt; old_nr_entries = old_ldt ? old_ldt->nr_entries : 0; new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries); error = -ENOMEM; new_ldt = alloc_ldt_struct(new_nr_entries); if (!new_ldt) goto out_unlock; if (old_ldt) memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE); new_ldt->entries[ldt_info.entry_number] = ldt; finalize_ldt_struct(new_ldt); /* * If we are using PTI, map the new LDT into the userspace pagetables. * If there is already an LDT, use the other slot so that other CPUs * will continue to use the old LDT until install_ldt() switches * them over to the new LDT. */ error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0); if (error) { /* * This only can fail for the first LDT setup. If an LDT is * already installed then the PTE page is already * populated. Mop up a half populated page table. */ if (!WARN_ON_ONCE(old_ldt)) free_ldt_pgtables(mm); free_ldt_struct(new_ldt); goto out_unlock; } install_ldt(mm, new_ldt); unmap_ldt_struct(mm, old_ldt); free_ldt_struct(old_ldt); error = 0; out_unlock: up_write(&mm->context.ldt_usr_sem); out: return error; } SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , unsigned long , bytecount) { int ret = -ENOSYS; switch (func) { case 0: ret = read_ldt(ptr, bytecount); break; case 1: ret = write_ldt(ptr, bytecount, 1); break; case 2: ret = read_default_ldt(ptr, bytecount); break; case 0x11: ret = write_ldt(ptr, bytecount, 0); break; } /* * The SYSCALL_DEFINE() macros give us an 'unsigned long' * return type, but the ABI for sys_modify_ldt() expects * 'int'. This cast gives us an int-sized value in %rax * for the return code. The 'unsigned' is necessary so * the compiler does not try to sign-extend the negative * return codes into the high half of the register when * taking the value from int->long. */ return (unsigned int)ret; }
27 5 21 32 25 25 6 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * Single-block cipher operations. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/algapi.h> #include <crypto/internal/cipher.h> #include <linux/kernel.h> #include <linux/crypto.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/string.h> #include "internal.h" static int setkey_unaligned(struct crypto_cipher *tfm, const u8 *key, unsigned int keylen) { struct cipher_alg *cia = crypto_cipher_alg(tfm); unsigned long alignmask = crypto_cipher_alignmask(tfm); int ret; u8 *buffer, *alignbuffer; unsigned long absize; absize = keylen + alignmask; buffer = kmalloc(absize, GFP_ATOMIC); if (!buffer) return -ENOMEM; alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = cia->cia_setkey(crypto_cipher_tfm(tfm), alignbuffer, keylen); kfree_sensitive(buffer); return ret; } int crypto_cipher_setkey(struct crypto_cipher *tfm, const u8 *key, unsigned int keylen) { struct cipher_alg *cia = crypto_cipher_alg(tfm); unsigned long alignmask = crypto_cipher_alignmask(tfm); if (keylen < cia->cia_min_keysize || keylen > cia->cia_max_keysize) return -EINVAL; if ((unsigned long)key & alignmask) return setkey_unaligned(tfm, key, keylen); return cia->cia_setkey(crypto_cipher_tfm(tfm), key, keylen); } EXPORT_SYMBOL_NS_GPL(crypto_cipher_setkey, "CRYPTO_INTERNAL"); static inline void cipher_crypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src, bool enc) { unsigned long alignmask = crypto_cipher_alignmask(tfm); struct cipher_alg *cia = crypto_cipher_alg(tfm); void (*fn)(struct crypto_tfm *, u8 *, const u8 *) = enc ? cia->cia_encrypt : cia->cia_decrypt; if (unlikely(((unsigned long)dst | (unsigned long)src) & alignmask)) { unsigned int bs = crypto_cipher_blocksize(tfm); u8 buffer[MAX_CIPHER_BLOCKSIZE + MAX_CIPHER_ALIGNMASK]; u8 *tmp = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(tmp, src, bs); fn(crypto_cipher_tfm(tfm), tmp, tmp); memcpy(dst, tmp, bs); } else { fn(crypto_cipher_tfm(tfm), dst, src); } } void crypto_cipher_encrypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src) { cipher_crypt_one(tfm, dst, src, true); } EXPORT_SYMBOL_NS_GPL(crypto_cipher_encrypt_one, "CRYPTO_INTERNAL"); void crypto_cipher_decrypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src) { cipher_crypt_one(tfm, dst, src, false); } EXPORT_SYMBOL_NS_GPL(crypto_cipher_decrypt_one, "CRYPTO_INTERNAL"); struct crypto_cipher *crypto_clone_cipher(struct crypto_cipher *cipher) { struct crypto_tfm *tfm = crypto_cipher_tfm(cipher); struct crypto_alg *alg = tfm->__crt_alg; struct crypto_cipher *ncipher; struct crypto_tfm *ntfm; if (alg->cra_init) return ERR_PTR(-ENOSYS); if (unlikely(!crypto_mod_get(alg))) return ERR_PTR(-ESTALE); ntfm = __crypto_alloc_tfmgfp(alg, CRYPTO_ALG_TYPE_CIPHER, CRYPTO_ALG_TYPE_MASK, GFP_ATOMIC); if (IS_ERR(ntfm)) { crypto_mod_put(alg); return ERR_CAST(ntfm); } ntfm->crt_flags = tfm->crt_flags; ncipher = __crypto_cipher_cast(ntfm); return ncipher; } EXPORT_SYMBOL_GPL(crypto_clone_cipher);
1 1 4 2 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 // SPDX-License-Identifier: GPL-2.0-only /* * debugfs code for HSR & PRP * Copyright (C) 2019 Texas Instruments Incorporated * * Author(s): * Murali Karicheri <m-karicheri2@ti.com> */ #include <linux/module.h> #include <linux/errno.h> #include <linux/debugfs.h> #include "hsr_main.h" #include "hsr_framereg.h" static struct dentry *hsr_debugfs_root_dir; /* hsr_node_table_show - Formats and prints node_table entries */ static int hsr_node_table_show(struct seq_file *sfp, void *data) { struct hsr_priv *priv = (struct hsr_priv *)sfp->private; struct hsr_node *node; seq_printf(sfp, "Node Table entries for (%s) device\n", (priv->prot_version == PRP_V1 ? "PRP" : "HSR")); seq_puts(sfp, "MAC-Address-A, MAC-Address-B, time_in[A], "); seq_puts(sfp, "time_in[B], Address-B port, "); if (priv->prot_version == PRP_V1) seq_puts(sfp, "SAN-A, SAN-B, DAN-P\n"); else seq_puts(sfp, "DAN-H\n"); rcu_read_lock(); list_for_each_entry_rcu(node, &priv->node_db, mac_list) { /* skip self node */ if (hsr_addr_is_self(priv, node->macaddress_A)) continue; seq_printf(sfp, "%pM ", &node->macaddress_A[0]); seq_printf(sfp, "%pM ", &node->macaddress_B[0]); seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_A]); seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_B]); seq_printf(sfp, "%14x, ", node->addr_B_port); if (priv->prot_version == PRP_V1) seq_printf(sfp, "%5x, %5x, %5x\n", node->san_a, node->san_b, (node->san_a == 0 && node->san_b == 0)); else seq_printf(sfp, "%5x\n", 1); } rcu_read_unlock(); return 0; } DEFINE_SHOW_ATTRIBUTE(hsr_node_table); void hsr_debugfs_rename(struct net_device *dev) { struct hsr_priv *priv = netdev_priv(dev); struct dentry *d; d = debugfs_rename(hsr_debugfs_root_dir, priv->node_tbl_root, hsr_debugfs_root_dir, dev->name); if (IS_ERR(d)) netdev_warn(dev, "failed to rename\n"); else priv->node_tbl_root = d; } /* hsr_debugfs_init - create hsr node_table file for dumping * the node table * * Description: * When debugfs is configured this routine sets up the node_table file per * hsr device for dumping the node_table entries */ void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) { struct dentry *de = NULL; de = debugfs_create_dir(hsr_dev->name, hsr_debugfs_root_dir); if (IS_ERR(de)) { pr_err("Cannot create hsr debugfs directory\n"); return; } priv->node_tbl_root = de; de = debugfs_create_file("node_table", S_IFREG | 0444, priv->node_tbl_root, priv, &hsr_node_table_fops); if (IS_ERR(de)) { pr_err("Cannot create hsr node_table file\n"); debugfs_remove(priv->node_tbl_root); priv->node_tbl_root = NULL; return; } } /* hsr_debugfs_term - Tear down debugfs intrastructure * * Description: * When Debugfs is configured this routine removes debugfs file system * elements that are specific to hsr */ void hsr_debugfs_term(struct hsr_priv *priv) { debugfs_remove_recursive(priv->node_tbl_root); priv->node_tbl_root = NULL; } void hsr_debugfs_create_root(void) { hsr_debugfs_root_dir = debugfs_create_dir("hsr", NULL); if (IS_ERR(hsr_debugfs_root_dir)) { pr_err("Cannot create hsr debugfs root directory\n"); hsr_debugfs_root_dir = NULL; } } void hsr_debugfs_remove_root(void) { /* debugfs_remove() internally checks NULL and ERROR */ debugfs_remove(hsr_debugfs_root_dir); }
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 // SPDX-License-Identifier: GPL-2.0-or-later /* * Public Key Encryption * * Copyright (c) 2015, Intel Corporation * Authors: Tadeusz Struk <tadeusz.struk@intel.com> */ #include <crypto/internal/akcipher.h> #include <linux/cryptouser.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/scatterlist.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/string.h> #include <net/netlink.h> #include "internal.h" #define CRYPTO_ALG_TYPE_AHASH_MASK 0x0000000e struct crypto_akcipher_sync_data { struct crypto_akcipher *tfm; const void *src; void *dst; unsigned int slen; unsigned int dlen; struct akcipher_request *req; struct crypto_wait cwait; struct scatterlist sg; u8 *buf; }; static int __maybe_unused crypto_akcipher_report( struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_akcipher rakcipher; memset(&rakcipher, 0, sizeof(rakcipher)); strscpy(rakcipher.type, "akcipher", sizeof(rakcipher.type)); return nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER, sizeof(rakcipher), &rakcipher); } static void crypto_akcipher_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_akcipher_show(struct seq_file *m, struct crypto_alg *alg) { seq_puts(m, "type : akcipher\n"); } static void crypto_akcipher_exit_tfm(struct crypto_tfm *tfm) { struct crypto_akcipher *akcipher = __crypto_akcipher_tfm(tfm); struct akcipher_alg *alg = crypto_akcipher_alg(akcipher); alg->exit(akcipher); } static int crypto_akcipher_init_tfm(struct crypto_tfm *tfm) { struct crypto_akcipher *akcipher = __crypto_akcipher_tfm(tfm); struct akcipher_alg *alg = crypto_akcipher_alg(akcipher); if (alg->exit) akcipher->base.exit = crypto_akcipher_exit_tfm; if (alg->init) return alg->init(akcipher); return 0; } static void crypto_akcipher_free_instance(struct crypto_instance *inst) { struct akcipher_instance *akcipher = akcipher_instance(inst); akcipher->free(akcipher); } static const struct crypto_type crypto_akcipher_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_akcipher_init_tfm, .free = crypto_akcipher_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_akcipher_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_akcipher_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_AHASH_MASK, .type = CRYPTO_ALG_TYPE_AKCIPHER, .tfmsize = offsetof(struct crypto_akcipher, base), }; int crypto_grab_akcipher(struct crypto_akcipher_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_akcipher_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_akcipher); struct crypto_akcipher *crypto_alloc_akcipher(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_akcipher_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_akcipher); static void akcipher_prepare_alg(struct akcipher_alg *alg) { struct crypto_alg *base = &alg->base; base->cra_type = &crypto_akcipher_type; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; base->cra_flags |= CRYPTO_ALG_TYPE_AKCIPHER; } static int akcipher_default_op(struct akcipher_request *req) { return -ENOSYS; } static int akcipher_default_set_key(struct crypto_akcipher *tfm, const void *key, unsigned int keylen) { return -ENOSYS; } int crypto_register_akcipher(struct akcipher_alg *alg) { struct crypto_alg *base = &alg->base; if (!alg->encrypt) alg->encrypt = akcipher_default_op; if (!alg->decrypt) alg->decrypt = akcipher_default_op; if (!alg->set_priv_key) alg->set_priv_key = akcipher_default_set_key; akcipher_prepare_alg(alg); return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_akcipher); void crypto_unregister_akcipher(struct akcipher_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_akcipher); int akcipher_register_instance(struct crypto_template *tmpl, struct akcipher_instance *inst) { if (WARN_ON(!inst->free)) return -EINVAL; akcipher_prepare_alg(&inst->alg); return crypto_register_instance(tmpl, akcipher_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(akcipher_register_instance); static int crypto_akcipher_sync_prep(struct crypto_akcipher_sync_data *data) { unsigned int reqsize = crypto_akcipher_reqsize(data->tfm); struct akcipher_request *req; struct scatterlist *sg; unsigned int mlen; unsigned int len; u8 *buf; mlen = max(data->slen, data->dlen); len = sizeof(*req) + reqsize + mlen; if (len < mlen) return -EOVERFLOW; req = kzalloc(len, GFP_KERNEL); if (!req) return -ENOMEM; data->req = req; akcipher_request_set_tfm(req, data->tfm); buf = (u8 *)(req + 1) + reqsize; data->buf = buf; memcpy(buf, data->src, data->slen); sg = &data->sg; sg_init_one(sg, buf, mlen); akcipher_request_set_crypt(req, sg, sg, data->slen, data->dlen); crypto_init_wait(&data->cwait); akcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &data->cwait); return 0; } static int crypto_akcipher_sync_post(struct crypto_akcipher_sync_data *data, int err) { err = crypto_wait_req(err, &data->cwait); memcpy(data->dst, data->buf, data->dlen); data->dlen = data->req->dst_len; kfree_sensitive(data->req); return err; } int crypto_akcipher_sync_encrypt(struct crypto_akcipher *tfm, const void *src, unsigned int slen, void *dst, unsigned int dlen) { struct crypto_akcipher_sync_data data = { .tfm = tfm, .src = src, .dst = dst, .slen = slen, .dlen = dlen, }; return crypto_akcipher_sync_prep(&data) ?: crypto_akcipher_sync_post(&data, crypto_akcipher_encrypt(data.req)); } EXPORT_SYMBOL_GPL(crypto_akcipher_sync_encrypt); int crypto_akcipher_sync_decrypt(struct crypto_akcipher *tfm, const void *src, unsigned int slen, void *dst, unsigned int dlen) { struct crypto_akcipher_sync_data data = { .tfm = tfm, .src = src, .dst = dst, .slen = slen, .dlen = dlen, }; return crypto_akcipher_sync_prep(&data) ?: crypto_akcipher_sync_post(&data, crypto_akcipher_decrypt(data.req)) ?: data.dlen; } EXPORT_SYMBOL_GPL(crypto_akcipher_sync_decrypt); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Generic public key cipher type");
15 14 15 15 15 6 3 2 1 1 1 4 11 11 9 1 1 7 6 5 6 6 2 2 3 8 9 9 2 2 2 2 9 9 8 1 2 5 5 2 2 4 8 881 887 12 1 20 12 20 3 1 1 1 1 1 10 2 1 3 5 5 3 7 3 2 14 2 9 4 4 9 12 2 7 2 2 4 3 3 5 9 5 4 3 6 8 3 3 3 1 3 1 1 1 1 1 1 1 3 3 3 3 3 1 3 3 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 // SPDX-License-Identifier: GPL-2.0-or-later /* * NetLabel Unlabeled Support * * This file defines functions for dealing with unlabeled packets for the * NetLabel system. The NetLabel system manages static and dynamic label * mappings for network protocols such as CIPSO and RIPSO. * * Author: Paul Moore <paul@paul-moore.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2008 */ #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/socket.h> #include <linux/string.h> #include <linux/skbuff.h> #include <linux/audit.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/security.h> #include <linux/slab.h> #include <net/sock.h> #include <net/netlink.h> #include <net/genetlink.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/net_namespace.h> #include <net/netlabel.h> #include <asm/bug.h> #include <linux/atomic.h> #include "netlabel_user.h" #include "netlabel_addrlist.h" #include "netlabel_domainhash.h" #include "netlabel_unlabeled.h" #include "netlabel_mgmt.h" /* NOTE: at present we always use init's network namespace since we don't * presently support different namespaces even though the majority of * the functions in this file are "namespace safe" */ /* The unlabeled connection hash table which we use to map network interfaces * and addresses of unlabeled packets to a user specified secid value for the * LSM. The hash table is used to lookup the network interface entry * (struct netlbl_unlhsh_iface) and then the interface entry is used to * lookup an IP address match from an ordered list. If a network interface * match can not be found in the hash table then the default entry * (netlbl_unlhsh_def) is used. The IP address entry list * (struct netlbl_unlhsh_addr) is ordered such that the entries with a * larger netmask come first. */ struct netlbl_unlhsh_tbl { struct list_head *tbl; u32 size; }; #define netlbl_unlhsh_addr4_entry(iter) \ container_of(iter, struct netlbl_unlhsh_addr4, list) struct netlbl_unlhsh_addr4 { u32 secid; struct netlbl_af4list list; struct rcu_head rcu; }; #define netlbl_unlhsh_addr6_entry(iter) \ container_of(iter, struct netlbl_unlhsh_addr6, list) struct netlbl_unlhsh_addr6 { u32 secid; struct netlbl_af6list list; struct rcu_head rcu; }; struct netlbl_unlhsh_iface { int ifindex; struct list_head addr4_list; struct list_head addr6_list; u32 valid; struct list_head list; struct rcu_head rcu; }; /* Argument struct for netlbl_unlhsh_walk() */ struct netlbl_unlhsh_walk_arg { struct netlink_callback *nl_cb; struct sk_buff *skb; u32 seq; }; /* Unlabeled connection hash table */ /* updates should be so rare that having one spinlock for the entire * hash table should be okay */ static DEFINE_SPINLOCK(netlbl_unlhsh_lock); #define netlbl_unlhsh_rcu_deref(p) \ rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock)) static struct netlbl_unlhsh_tbl __rcu *netlbl_unlhsh; static struct netlbl_unlhsh_iface __rcu *netlbl_unlhsh_def; /* Accept unlabeled packets flag */ static u8 netlabel_unlabel_acceptflg; /* NetLabel Generic NETLINK unlabeled family */ static struct genl_family netlbl_unlabel_gnl_family; /* NetLabel Netlink attribute policy */ static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 }, [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr) }, [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr) }, [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY, .len = sizeof(struct in_addr) }, [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY, .len = sizeof(struct in_addr) }, [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY } }; /* * Unlabeled Connection Hash Table Functions */ /** * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table * @entry: the entry's RCU field * * Description: * This function is designed to be used as a callback to the call_rcu() * function so that memory allocated to a hash table interface entry can be * released safely. It is important to note that this function does not free * the IPv4 and IPv6 address lists contained as part of an interface entry. It * is up to the rest of the code to make sure an interface entry is only freed * once it's address lists are empty. * */ static void netlbl_unlhsh_free_iface(struct rcu_head *entry) { struct netlbl_unlhsh_iface *iface; struct netlbl_af4list *iter4; struct netlbl_af4list *tmp4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *iter6; struct netlbl_af6list *tmp6; #endif /* IPv6 */ iface = container_of(entry, struct netlbl_unlhsh_iface, rcu); /* no need for locks here since we are the only one with access to this * structure */ netlbl_af4list_foreach_safe(iter4, tmp4, &iface->addr4_list) { netlbl_af4list_remove_entry(iter4); kfree(netlbl_unlhsh_addr4_entry(iter4)); } #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_safe(iter6, tmp6, &iface->addr6_list) { netlbl_af6list_remove_entry(iter6); kfree(netlbl_unlhsh_addr6_entry(iter6)); } #endif /* IPv6 */ kfree(iface); } /** * netlbl_unlhsh_hash - Hashing function for the hash table * @ifindex: the network interface/device to hash * * Description: * This is the hashing function for the unlabeled hash table, it returns the * bucket number for the given device/interface. The caller is responsible for * ensuring that the hash table is protected with either a RCU read lock or * the hash table lock. * */ static u32 netlbl_unlhsh_hash(int ifindex) { return ifindex & (netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->size - 1); } /** * netlbl_unlhsh_search_iface - Search for a matching interface entry * @ifindex: the network interface * * Description: * Searches the unlabeled connection hash table and returns a pointer to the * interface entry which matches @ifindex, otherwise NULL is returned. The * caller is responsible for ensuring that the hash table is protected with * either a RCU read lock or the hash table lock. * */ static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex) { u32 bkt; struct list_head *bkt_list; struct netlbl_unlhsh_iface *iter; bkt = netlbl_unlhsh_hash(ifindex); bkt_list = &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]; list_for_each_entry_rcu(iter, bkt_list, list, lockdep_is_held(&netlbl_unlhsh_lock)) if (iter->valid && iter->ifindex == ifindex) return iter; return NULL; } /** * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table * @iface: the associated interface entry * @addr: IPv4 address in network byte order * @mask: IPv4 address mask in network byte order * @secid: LSM secid value for entry * * Description: * Add a new address entry into the unlabeled connection hash table using the * interface entry specified by @iface. On success zero is returned, otherwise * a negative value is returned. * */ static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface, const struct in_addr *addr, const struct in_addr *mask, u32 secid) { int ret_val; struct netlbl_unlhsh_addr4 *entry; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) return -ENOMEM; entry->list.addr = addr->s_addr & mask->s_addr; entry->list.mask = mask->s_addr; entry->list.valid = 1; entry->secid = secid; spin_lock(&netlbl_unlhsh_lock); ret_val = netlbl_af4list_add(&entry->list, &iface->addr4_list); spin_unlock(&netlbl_unlhsh_lock); if (ret_val != 0) kfree(entry); return ret_val; } #if IS_ENABLED(CONFIG_IPV6) /** * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table * @iface: the associated interface entry * @addr: IPv6 address in network byte order * @mask: IPv6 address mask in network byte order * @secid: LSM secid value for entry * * Description: * Add a new address entry into the unlabeled connection hash table using the * interface entry specified by @iface. On success zero is returned, otherwise * a negative value is returned. * */ static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface, const struct in6_addr *addr, const struct in6_addr *mask, u32 secid) { int ret_val; struct netlbl_unlhsh_addr6 *entry; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) return -ENOMEM; entry->list.addr = *addr; entry->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; entry->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; entry->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; entry->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; entry->list.mask = *mask; entry->list.valid = 1; entry->secid = secid; spin_lock(&netlbl_unlhsh_lock); ret_val = netlbl_af6list_add(&entry->list, &iface->addr6_list); spin_unlock(&netlbl_unlhsh_lock); if (ret_val != 0) kfree(entry); return 0; } #endif /* IPv6 */ /** * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table * @ifindex: network interface * * Description: * Add a new, empty, interface entry into the unlabeled connection hash table. * On success a pointer to the new interface entry is returned, on failure NULL * is returned. * */ static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex) { u32 bkt; struct netlbl_unlhsh_iface *iface; iface = kzalloc(sizeof(*iface), GFP_ATOMIC); if (iface == NULL) return NULL; iface->ifindex = ifindex; INIT_LIST_HEAD(&iface->addr4_list); INIT_LIST_HEAD(&iface->addr6_list); iface->valid = 1; spin_lock(&netlbl_unlhsh_lock); if (ifindex > 0) { bkt = netlbl_unlhsh_hash(ifindex); if (netlbl_unlhsh_search_iface(ifindex) != NULL) goto add_iface_failure; list_add_tail_rcu(&iface->list, &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]); } else { INIT_LIST_HEAD(&iface->list); if (netlbl_unlhsh_rcu_deref(netlbl_unlhsh_def) != NULL) goto add_iface_failure; rcu_assign_pointer(netlbl_unlhsh_def, iface); } spin_unlock(&netlbl_unlhsh_lock); return iface; add_iface_failure: spin_unlock(&netlbl_unlhsh_lock); kfree(iface); return NULL; } /** * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table * @net: network namespace * @dev_name: interface name * @addr: IP address in network byte order * @mask: address mask in network byte order * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) * @secid: LSM secid value for the entry * @audit_info: NetLabel audit information * * Description: * Adds a new entry to the unlabeled connection hash table. Returns zero on * success, negative values on failure. * */ int netlbl_unlhsh_add(struct net *net, const char *dev_name, const void *addr, const void *mask, u32 addr_len, u32 secid, struct netlbl_audit *audit_info) { int ret_val; int ifindex; struct net_device *dev; struct netlbl_unlhsh_iface *iface; struct audit_buffer *audit_buf = NULL; char *secctx = NULL; u32 secctx_len; if (addr_len != sizeof(struct in_addr) && addr_len != sizeof(struct in6_addr)) return -EINVAL; rcu_read_lock(); if (dev_name != NULL) { dev = dev_get_by_name_rcu(net, dev_name); if (dev == NULL) { ret_val = -ENODEV; goto unlhsh_add_return; } ifindex = dev->ifindex; iface = netlbl_unlhsh_search_iface(ifindex); } else { ifindex = 0; iface = rcu_dereference(netlbl_unlhsh_def); } if (iface == NULL) { iface = netlbl_unlhsh_add_iface(ifindex); if (iface == NULL) { ret_val = -ENOMEM; goto unlhsh_add_return; } } audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD, audit_info); switch (addr_len) { case sizeof(struct in_addr): { const struct in_addr *addr4 = addr; const struct in_addr *mask4 = mask; ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid); if (audit_buf != NULL) netlbl_af4list_audit_addr(audit_buf, 1, dev_name, addr4->s_addr, mask4->s_addr); break; } #if IS_ENABLED(CONFIG_IPV6) case sizeof(struct in6_addr): { const struct in6_addr *addr6 = addr; const struct in6_addr *mask6 = mask; ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid); if (audit_buf != NULL) netlbl_af6list_audit_addr(audit_buf, 1, dev_name, addr6, mask6); break; } #endif /* IPv6 */ default: ret_val = -EINVAL; } if (ret_val == 0) atomic_inc(&netlabel_mgmt_protocount); unlhsh_add_return: rcu_read_unlock(); if (audit_buf != NULL) { if (security_secid_to_secctx(secid, &secctx, &secctx_len) == 0) { audit_log_format(audit_buf, " sec_obj=%s", secctx); security_release_secctx(secctx, secctx_len); } audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry * @net: network namespace * @iface: interface entry * @addr: IP address * @mask: IP address mask * @audit_info: NetLabel audit information * * Description: * Remove an IP address entry from the unlabeled connection hash table. * Returns zero on success, negative values on failure. * */ static int netlbl_unlhsh_remove_addr4(struct net *net, struct netlbl_unlhsh_iface *iface, const struct in_addr *addr, const struct in_addr *mask, struct netlbl_audit *audit_info) { struct netlbl_af4list *list_entry; struct netlbl_unlhsh_addr4 *entry; struct audit_buffer *audit_buf; struct net_device *dev; char *secctx; u32 secctx_len; spin_lock(&netlbl_unlhsh_lock); list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr, &iface->addr4_list); spin_unlock(&netlbl_unlhsh_lock); if (list_entry != NULL) entry = netlbl_unlhsh_addr4_entry(list_entry); else entry = NULL; audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, audit_info); if (audit_buf != NULL) { dev = dev_get_by_index(net, iface->ifindex); netlbl_af4list_audit_addr(audit_buf, 1, (dev != NULL ? dev->name : NULL), addr->s_addr, mask->s_addr); dev_put(dev); if (entry != NULL && security_secid_to_secctx(entry->secid, &secctx, &secctx_len) == 0) { audit_log_format(audit_buf, " sec_obj=%s", secctx); security_release_secctx(secctx, secctx_len); } audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); audit_log_end(audit_buf); } if (entry == NULL) return -ENOENT; kfree_rcu(entry, rcu); return 0; } #if IS_ENABLED(CONFIG_IPV6) /** * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry * @net: network namespace * @iface: interface entry * @addr: IP address * @mask: IP address mask * @audit_info: NetLabel audit information * * Description: * Remove an IP address entry from the unlabeled connection hash table. * Returns zero on success, negative values on failure. * */ static int netlbl_unlhsh_remove_addr6(struct net *net, struct netlbl_unlhsh_iface *iface, const struct in6_addr *addr, const struct in6_addr *mask, struct netlbl_audit *audit_info) { struct netlbl_af6list *list_entry; struct netlbl_unlhsh_addr6 *entry; struct audit_buffer *audit_buf; struct net_device *dev; char *secctx; u32 secctx_len; spin_lock(&netlbl_unlhsh_lock); list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list); spin_unlock(&netlbl_unlhsh_lock); if (list_entry != NULL) entry = netlbl_unlhsh_addr6_entry(list_entry); else entry = NULL; audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, audit_info); if (audit_buf != NULL) { dev = dev_get_by_index(net, iface->ifindex); netlbl_af6list_audit_addr(audit_buf, 1, (dev != NULL ? dev->name : NULL), addr, mask); dev_put(dev); if (entry != NULL && security_secid_to_secctx(entry->secid, &secctx, &secctx_len) == 0) { audit_log_format(audit_buf, " sec_obj=%s", secctx); security_release_secctx(secctx, secctx_len); } audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); audit_log_end(audit_buf); } if (entry == NULL) return -ENOENT; kfree_rcu(entry, rcu); return 0; } #endif /* IPv6 */ /** * netlbl_unlhsh_condremove_iface - Remove an interface entry * @iface: the interface entry * * Description: * Remove an interface entry from the unlabeled connection hash table if it is * empty. An interface entry is considered to be empty if there are no * address entries assigned to it. * */ static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface) { struct netlbl_af4list *iter4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *iter6; #endif /* IPv6 */ spin_lock(&netlbl_unlhsh_lock); netlbl_af4list_foreach_rcu(iter4, &iface->addr4_list) goto unlhsh_condremove_failure; #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(iter6, &iface->addr6_list) goto unlhsh_condremove_failure; #endif /* IPv6 */ iface->valid = 0; if (iface->ifindex > 0) list_del_rcu(&iface->list); else RCU_INIT_POINTER(netlbl_unlhsh_def, NULL); spin_unlock(&netlbl_unlhsh_lock); call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); return; unlhsh_condremove_failure: spin_unlock(&netlbl_unlhsh_lock); } /** * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table * @net: network namespace * @dev_name: interface name * @addr: IP address in network byte order * @mask: address mask in network byte order * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) * @audit_info: NetLabel audit information * * Description: * Removes and existing entry from the unlabeled connection hash table. * Returns zero on success, negative values on failure. * */ int netlbl_unlhsh_remove(struct net *net, const char *dev_name, const void *addr, const void *mask, u32 addr_len, struct netlbl_audit *audit_info) { int ret_val; struct net_device *dev; struct netlbl_unlhsh_iface *iface; if (addr_len != sizeof(struct in_addr) && addr_len != sizeof(struct in6_addr)) return -EINVAL; rcu_read_lock(); if (dev_name != NULL) { dev = dev_get_by_name_rcu(net, dev_name); if (dev == NULL) { ret_val = -ENODEV; goto unlhsh_remove_return; } iface = netlbl_unlhsh_search_iface(dev->ifindex); } else iface = rcu_dereference(netlbl_unlhsh_def); if (iface == NULL) { ret_val = -ENOENT; goto unlhsh_remove_return; } switch (addr_len) { case sizeof(struct in_addr): ret_val = netlbl_unlhsh_remove_addr4(net, iface, addr, mask, audit_info); break; #if IS_ENABLED(CONFIG_IPV6) case sizeof(struct in6_addr): ret_val = netlbl_unlhsh_remove_addr6(net, iface, addr, mask, audit_info); break; #endif /* IPv6 */ default: ret_val = -EINVAL; } if (ret_val == 0) { netlbl_unlhsh_condremove_iface(iface); atomic_dec(&netlabel_mgmt_protocount); } unlhsh_remove_return: rcu_read_unlock(); return ret_val; } /* * General Helper Functions */ /** * netlbl_unlhsh_netdev_handler - Network device notification handler * @this: notifier block * @event: the event * @ptr: the netdevice notifier info (cast to void) * * Description: * Handle network device events, although at present all we care about is a * network device going away. In the case of a device going away we clear any * related entries from the unlabeled connection hash table. * */ static int netlbl_unlhsh_netdev_handler(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netlbl_unlhsh_iface *iface = NULL; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */ if (event == NETDEV_DOWN) { spin_lock(&netlbl_unlhsh_lock); iface = netlbl_unlhsh_search_iface(dev->ifindex); if (iface != NULL && iface->valid) { iface->valid = 0; list_del_rcu(&iface->list); } else iface = NULL; spin_unlock(&netlbl_unlhsh_lock); } if (iface != NULL) call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); return NOTIFY_DONE; } /** * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag * @value: desired value * @audit_info: NetLabel audit information * * Description: * Set the value of the unlabeled accept flag to @value. * */ static void netlbl_unlabel_acceptflg_set(u8 value, struct netlbl_audit *audit_info) { struct audit_buffer *audit_buf; u8 old_val; old_val = netlabel_unlabel_acceptflg; netlabel_unlabel_acceptflg = value; audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW, audit_info); if (audit_buf != NULL) { audit_log_format(audit_buf, " unlbl_accept=%u old=%u", value, old_val); audit_log_end(audit_buf); } } /** * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information * @info: the Generic NETLINK info block * @addr: the IP address * @mask: the IP address mask * @len: the address length * * Description: * Examine the Generic NETLINK message and extract the IP address information. * Returns zero on success, negative values on failure. * */ static int netlbl_unlabel_addrinfo_get(struct genl_info *info, void **addr, void **mask, u32 *len) { u32 addr_len; if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR] && info->attrs[NLBL_UNLABEL_A_IPV4MASK]) { addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); if (addr_len != sizeof(struct in_addr) && addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK])) return -EINVAL; *len = addr_len; *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]); return 0; } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) { addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); if (addr_len != sizeof(struct in6_addr) && addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK])) return -EINVAL; *len = addr_len; *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]); return 0; } return -EINVAL; } /* * NetLabel Command Handlers */ /** * netlbl_unlabel_accept - Handle an ACCEPT message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated ACCEPT message and set the accept flag accordingly. * Returns zero on success, negative values on failure. * */ static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info) { u8 value; struct netlbl_audit audit_info; if (info->attrs[NLBL_UNLABEL_A_ACPTFLG]) { value = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]); if (value == 1 || value == 0) { netlbl_netlink_auditinfo(&audit_info); netlbl_unlabel_acceptflg_set(value, &audit_info); return 0; } } return -EINVAL; } /** * netlbl_unlabel_list - Handle a LIST message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated LIST message and respond with the current status. * Returns zero on success, negative values on failure. * */ static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info) { int ret_val = -EINVAL; struct sk_buff *ans_skb; void *data; ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (ans_skb == NULL) goto list_failure; data = genlmsg_put_reply(ans_skb, info, &netlbl_unlabel_gnl_family, 0, NLBL_UNLABEL_C_LIST); if (data == NULL) { ret_val = -ENOMEM; goto list_failure; } ret_val = nla_put_u8(ans_skb, NLBL_UNLABEL_A_ACPTFLG, netlabel_unlabel_acceptflg); if (ret_val != 0) goto list_failure; genlmsg_end(ans_skb, data); return genlmsg_reply(ans_skb, info); list_failure: kfree_skb(ans_skb); return ret_val; } /** * netlbl_unlabel_staticadd - Handle a STATICADD message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated STATICADD message and add a new unlabeled * connection entry to the hash table. Returns zero on success, negative * values on failure. * */ static int netlbl_unlabel_staticadd(struct sk_buff *skb, struct genl_info *info) { int ret_val; char *dev_name; void *addr; void *mask; u32 addr_len; u32 secid; struct netlbl_audit audit_info; /* Don't allow users to add both IPv4 and IPv6 addresses for a * single entry. However, allow users to create two entries, one each * for IPv4 and IPv6, with the same LSM security context which should * achieve the same result. */ if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || !info->attrs[NLBL_UNLABEL_A_IFACE] || !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) return ret_val; dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); ret_val = security_secctx_to_secid( nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), &secid); if (ret_val != 0) return ret_val; return netlbl_unlhsh_add(&init_net, dev_name, addr, mask, addr_len, secid, &audit_info); } /** * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated STATICADDDEF message and add a new default * unlabeled connection entry. Returns zero on success, negative values on * failure. * */ static int netlbl_unlabel_staticadddef(struct sk_buff *skb, struct genl_info *info) { int ret_val; void *addr; void *mask; u32 addr_len; u32 secid; struct netlbl_audit audit_info; /* Don't allow users to add both IPv4 and IPv6 addresses for a * single entry. However, allow users to create two entries, one each * for IPv4 and IPv6, with the same LSM security context which should * achieve the same result. */ if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) return ret_val; ret_val = security_secctx_to_secid( nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), &secid); if (ret_val != 0) return ret_val; return netlbl_unlhsh_add(&init_net, NULL, addr, mask, addr_len, secid, &audit_info); } /** * netlbl_unlabel_staticremove - Handle a STATICREMOVE message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated STATICREMOVE message and remove the specified * unlabeled connection entry. Returns zero on success, negative values on * failure. * */ static int netlbl_unlabel_staticremove(struct sk_buff *skb, struct genl_info *info) { int ret_val; char *dev_name; void *addr; void *mask; u32 addr_len; struct netlbl_audit audit_info; /* See the note in netlbl_unlabel_staticadd() about not allowing both * IPv4 and IPv6 in the same entry. */ if (!info->attrs[NLBL_UNLABEL_A_IFACE] || !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) return ret_val; dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); return netlbl_unlhsh_remove(&init_net, dev_name, addr, mask, addr_len, &audit_info); } /** * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated STATICREMOVEDEF message and remove the default * unlabeled connection entry. Returns zero on success, negative values on * failure. * */ static int netlbl_unlabel_staticremovedef(struct sk_buff *skb, struct genl_info *info) { int ret_val; void *addr; void *mask; u32 addr_len; struct netlbl_audit audit_info; /* See the note in netlbl_unlabel_staticadd() about not allowing both * IPv4 and IPv6 in the same entry. */ if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) return ret_val; return netlbl_unlhsh_remove(&init_net, NULL, addr, mask, addr_len, &audit_info); } /** * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF] * @cmd: command/message * @iface: the interface entry * @addr4: the IPv4 address entry * @addr6: the IPv6 address entry * @arg: the netlbl_unlhsh_walk_arg structure * * Description: * This function is designed to be used to generate a response for a * STATICLIST or STATICLISTDEF message. When called either @addr4 or @addr6 * can be specified, not both, the other unspecified entry should be set to * NULL by the caller. Returns the size of the message on success, negative * values on failure. * */ static int netlbl_unlabel_staticlist_gen(u32 cmd, const struct netlbl_unlhsh_iface *iface, const struct netlbl_unlhsh_addr4 *addr4, const struct netlbl_unlhsh_addr6 *addr6, void *arg) { int ret_val = -ENOMEM; struct netlbl_unlhsh_walk_arg *cb_arg = arg; struct net_device *dev; void *data; u32 secid; char *secctx; u32 secctx_len; data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid, cb_arg->seq, &netlbl_unlabel_gnl_family, NLM_F_MULTI, cmd); if (data == NULL) goto list_cb_failure; if (iface->ifindex > 0) { dev = dev_get_by_index(&init_net, iface->ifindex); if (!dev) { ret_val = -ENODEV; goto list_cb_failure; } ret_val = nla_put_string(cb_arg->skb, NLBL_UNLABEL_A_IFACE, dev->name); dev_put(dev); if (ret_val != 0) goto list_cb_failure; } if (addr4) { struct in_addr addr_struct; addr_struct.s_addr = addr4->list.addr; ret_val = nla_put_in_addr(cb_arg->skb, NLBL_UNLABEL_A_IPV4ADDR, addr_struct.s_addr); if (ret_val != 0) goto list_cb_failure; addr_struct.s_addr = addr4->list.mask; ret_val = nla_put_in_addr(cb_arg->skb, NLBL_UNLABEL_A_IPV4MASK, addr_struct.s_addr); if (ret_val != 0) goto list_cb_failure; secid = addr4->secid; } else { ret_val = nla_put_in6_addr(cb_arg->skb, NLBL_UNLABEL_A_IPV6ADDR, &addr6->list.addr); if (ret_val != 0) goto list_cb_failure; ret_val = nla_put_in6_addr(cb_arg->skb, NLBL_UNLABEL_A_IPV6MASK, &addr6->list.mask); if (ret_val != 0) goto list_cb_failure; secid = addr6->secid; } ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len); if (ret_val != 0) goto list_cb_failure; ret_val = nla_put(cb_arg->skb, NLBL_UNLABEL_A_SECCTX, secctx_len, secctx); security_release_secctx(secctx, secctx_len); if (ret_val != 0) goto list_cb_failure; cb_arg->seq++; genlmsg_end(cb_arg->skb, data); return 0; list_cb_failure: genlmsg_cancel(cb_arg->skb, data); return ret_val; } /** * netlbl_unlabel_staticlist - Handle a STATICLIST message * @skb: the NETLINK buffer * @cb: the NETLINK callback * * Description: * Process a user generated STATICLIST message and dump the unlabeled * connection hash table in a form suitable for use in a kernel generated * STATICLIST message. Returns the length of @skb. * */ static int netlbl_unlabel_staticlist(struct sk_buff *skb, struct netlink_callback *cb) { struct netlbl_unlhsh_walk_arg cb_arg; u32 skip_bkt = cb->args[0]; u32 skip_chain = cb->args[1]; u32 skip_addr4 = cb->args[2]; u32 iter_bkt, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; struct netlbl_unlhsh_iface *iface; struct list_head *iter_list; struct netlbl_af4list *addr4; #if IS_ENABLED(CONFIG_IPV6) u32 skip_addr6 = cb->args[3]; struct netlbl_af6list *addr6; #endif cb_arg.nl_cb = cb; cb_arg.skb = skb; cb_arg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); for (iter_bkt = skip_bkt; iter_bkt < rcu_dereference(netlbl_unlhsh)->size; iter_bkt++) { iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt]; list_for_each_entry_rcu(iface, iter_list, list) { if (!iface->valid || iter_chain++ < skip_chain) continue; netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { if (iter_addr4++ < skip_addr4) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, iface, netlbl_unlhsh_addr4_entry(addr4), NULL, &cb_arg) < 0) { iter_addr4--; iter_chain--; goto unlabel_staticlist_return; } } iter_addr4 = 0; skip_addr4 = 0; #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { if (iter_addr6++ < skip_addr6) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, iface, NULL, netlbl_unlhsh_addr6_entry(addr6), &cb_arg) < 0) { iter_addr6--; iter_chain--; goto unlabel_staticlist_return; } } iter_addr6 = 0; skip_addr6 = 0; #endif /* IPv6 */ } iter_chain = 0; skip_chain = 0; } unlabel_staticlist_return: rcu_read_unlock(); cb->args[0] = iter_bkt; cb->args[1] = iter_chain; cb->args[2] = iter_addr4; cb->args[3] = iter_addr6; return skb->len; } /** * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message * @skb: the NETLINK buffer * @cb: the NETLINK callback * * Description: * Process a user generated STATICLISTDEF message and dump the default * unlabeled connection entry in a form suitable for use in a kernel generated * STATICLISTDEF message. Returns the length of @skb. * */ static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, struct netlink_callback *cb) { struct netlbl_unlhsh_walk_arg cb_arg; struct netlbl_unlhsh_iface *iface; u32 iter_addr4 = 0, iter_addr6 = 0; struct netlbl_af4list *addr4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *addr6; #endif cb_arg.nl_cb = cb; cb_arg.skb = skb; cb_arg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); iface = rcu_dereference(netlbl_unlhsh_def); if (iface == NULL || !iface->valid) goto unlabel_staticlistdef_return; netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { if (iter_addr4++ < cb->args[0]) continue; if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, iface, netlbl_unlhsh_addr4_entry(addr4), NULL, &cb_arg) < 0) { iter_addr4--; goto unlabel_staticlistdef_return; } } #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { if (iter_addr6++ < cb->args[1]) continue; if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, iface, NULL, netlbl_unlhsh_addr6_entry(addr6), &cb_arg) < 0) { iter_addr6--; goto unlabel_staticlistdef_return; } } #endif /* IPv6 */ unlabel_staticlistdef_return: rcu_read_unlock(); cb->args[0] = iter_addr4; cb->args[1] = iter_addr6; return skb->len; } /* * NetLabel Generic NETLINK Command Definitions */ static const struct genl_small_ops netlbl_unlabel_genl_ops[] = { { .cmd = NLBL_UNLABEL_C_STATICADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_staticadd, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICREMOVE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_staticremove, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICLIST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = NULL, .dumpit = netlbl_unlabel_staticlist, }, { .cmd = NLBL_UNLABEL_C_STATICADDDEF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_staticadddef, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_staticremovedef, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICLISTDEF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = NULL, .dumpit = netlbl_unlabel_staticlistdef, }, { .cmd = NLBL_UNLABEL_C_ACCEPT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_accept, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_LIST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = netlbl_unlabel_list, .dumpit = NULL, }, }; static struct genl_family netlbl_unlabel_gnl_family __ro_after_init = { .hdrsize = 0, .name = NETLBL_NLTYPE_UNLABELED_NAME, .version = NETLBL_PROTO_VERSION, .maxattr = NLBL_UNLABEL_A_MAX, .policy = netlbl_unlabel_genl_policy, .module = THIS_MODULE, .small_ops = netlbl_unlabel_genl_ops, .n_small_ops = ARRAY_SIZE(netlbl_unlabel_genl_ops), .resv_start_op = NLBL_UNLABEL_C_STATICLISTDEF + 1, }; /* * NetLabel Generic NETLINK Protocol Functions */ /** * netlbl_unlabel_genl_init - Register the Unlabeled NetLabel component * * Description: * Register the unlabeled packet NetLabel component with the Generic NETLINK * mechanism. Returns zero on success, negative values on failure. * */ int __init netlbl_unlabel_genl_init(void) { return genl_register_family(&netlbl_unlabel_gnl_family); } /* * NetLabel KAPI Hooks */ static struct notifier_block netlbl_unlhsh_netdev_notifier = { .notifier_call = netlbl_unlhsh_netdev_handler, }; /** * netlbl_unlabel_init - Initialize the unlabeled connection hash table * @size: the number of bits to use for the hash buckets * * Description: * Initializes the unlabeled connection hash table and registers a network * device notification handler. This function should only be called by the * NetLabel subsystem itself during initialization. Returns zero on success, * non-zero values on error. * */ int __init netlbl_unlabel_init(u32 size) { u32 iter; struct netlbl_unlhsh_tbl *hsh_tbl; if (size == 0) return -EINVAL; hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); if (hsh_tbl == NULL) return -ENOMEM; hsh_tbl->size = 1 << size; hsh_tbl->tbl = kcalloc(hsh_tbl->size, sizeof(struct list_head), GFP_KERNEL); if (hsh_tbl->tbl == NULL) { kfree(hsh_tbl); return -ENOMEM; } for (iter = 0; iter < hsh_tbl->size; iter++) INIT_LIST_HEAD(&hsh_tbl->tbl[iter]); spin_lock(&netlbl_unlhsh_lock); rcu_assign_pointer(netlbl_unlhsh, hsh_tbl); spin_unlock(&netlbl_unlhsh_lock); register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier); return 0; } /** * netlbl_unlabel_getattr - Get the security attributes for an unlabled packet * @skb: the packet * @family: protocol family * @secattr: the security attributes * * Description: * Determine the security attributes, if any, for an unlabled packet and return * them in @secattr. Returns zero on success and negative values on failure. * */ int netlbl_unlabel_getattr(const struct sk_buff *skb, u16 family, struct netlbl_lsm_secattr *secattr) { struct netlbl_unlhsh_iface *iface; rcu_read_lock(); iface = netlbl_unlhsh_search_iface(skb->skb_iif); if (iface == NULL) iface = rcu_dereference(netlbl_unlhsh_def); if (iface == NULL || !iface->valid) goto unlabel_getattr_nolabel; #if IS_ENABLED(CONFIG_IPV6) /* When resolving a fallback label, check the sk_buff version as * it is possible (e.g. SCTP) to have family = PF_INET6 while * receiving ip_hdr(skb)->version = 4. */ if (family == PF_INET6 && ip_hdr(skb)->version == 4) family = PF_INET; #endif /* IPv6 */ switch (family) { case PF_INET: { struct iphdr *hdr4; struct netlbl_af4list *addr4; hdr4 = ip_hdr(skb); addr4 = netlbl_af4list_search(hdr4->saddr, &iface->addr4_list); if (addr4 == NULL) goto unlabel_getattr_nolabel; secattr->attr.secid = netlbl_unlhsh_addr4_entry(addr4)->secid; break; } #if IS_ENABLED(CONFIG_IPV6) case PF_INET6: { struct ipv6hdr *hdr6; struct netlbl_af6list *addr6; hdr6 = ipv6_hdr(skb); addr6 = netlbl_af6list_search(&hdr6->saddr, &iface->addr6_list); if (addr6 == NULL) goto unlabel_getattr_nolabel; secattr->attr.secid = netlbl_unlhsh_addr6_entry(addr6)->secid; break; } #endif /* IPv6 */ default: goto unlabel_getattr_nolabel; } rcu_read_unlock(); secattr->flags |= NETLBL_SECATTR_SECID; secattr->type = NETLBL_NLTYPE_UNLABELED; return 0; unlabel_getattr_nolabel: rcu_read_unlock(); if (netlabel_unlabel_acceptflg == 0) return -ENOMSG; secattr->type = NETLBL_NLTYPE_UNLABELED; return 0; } /** * netlbl_unlabel_defconf - Set the default config to allow unlabeled packets * * Description: * Set the default NetLabel configuration to allow incoming unlabeled packets * and to send unlabeled network traffic by default. * */ int __init netlbl_unlabel_defconf(void) { int ret_val; struct netlbl_dom_map *entry; struct netlbl_audit audit_info; /* Only the kernel is allowed to call this function and the only time * it is called is at bootup before the audit subsystem is reporting * messages so don't worry to much about these values. */ security_current_getlsmprop_subj(&audit_info.prop); audit_info.loginuid = GLOBAL_ROOT_UID; audit_info.sessionid = 0; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (entry == NULL) return -ENOMEM; entry->family = AF_UNSPEC; entry->def.type = NETLBL_NLTYPE_UNLABELED; ret_val = netlbl_domhsh_add_default(entry, &audit_info); if (ret_val != 0) return ret_val; netlbl_unlabel_acceptflg_set(1, &audit_info); return 0; }
21679 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 // SPDX-License-Identifier: GPL-2.0-only #include <linux/fault-inject.h> #include <linux/fault-inject-usercopy.h> static struct { struct fault_attr attr; } fail_usercopy = { .attr = FAULT_ATTR_INITIALIZER, }; static int __init setup_fail_usercopy(char *str) { return setup_fault_attr(&fail_usercopy.attr, str); } __setup("fail_usercopy=", setup_fail_usercopy); #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init fail_usercopy_debugfs(void) { struct dentry *dir; dir = fault_create_debugfs_attr("fail_usercopy", NULL, &fail_usercopy.attr); if (IS_ERR(dir)) return PTR_ERR(dir); return 0; } late_initcall(fail_usercopy_debugfs); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ bool should_fail_usercopy(void) { return should_fail(&fail_usercopy.attr, 1); } EXPORT_SYMBOL_GPL(should_fail_usercopy);
15 14 44 43 43 40 2 39 2 1 43 1 44 26 41 3 42 1 1 44 44 44 42 1 2 41 44 45 44 46 46 45 43 2 43 1 45 45 1 46 4 4 4 42 2 39 4 4 4 4 4 4 4 4 4 4 1 4 24 26 26 27 27 7 7 7 3 3 3 3 3 3 3 3 3 26 27 27 27 26 1 26 24 24 27 26 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 // SPDX-License-Identifier: GPL-2.0-only /* * Scanning implementation * * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi> * Copyright 2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright 2016-2017 Intel Deutschland GmbH * Copyright (C) 2018-2024 Intel Corporation */ #include <linux/if_arp.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <net/sch_generic.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/random.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "mesh.h" #define IEEE80211_PROBE_DELAY (HZ / 33) #define IEEE80211_CHANNEL_TIME (HZ / 33) #define IEEE80211_PASSIVE_CHANNEL_TIME (HZ / 9) void ieee80211_rx_bss_put(struct ieee80211_local *local, struct ieee80211_bss *bss) { if (!bss) return; cfg80211_put_bss(local->hw.wiphy, container_of((void *)bss, struct cfg80211_bss, priv)); } static bool is_uapsd_supported(struct ieee802_11_elems *elems) { u8 qos_info; if (elems->wmm_info && elems->wmm_info_len == 7 && elems->wmm_info[5] == 1) qos_info = elems->wmm_info[6]; else if (elems->wmm_param && elems->wmm_param_len == 24 && elems->wmm_param[5] == 1) qos_info = elems->wmm_param[6]; else /* no valid wmm information or parameter element found */ return false; return qos_info & IEEE80211_WMM_IE_AP_QOSINFO_UAPSD; } struct inform_bss_update_data { struct ieee80211_rx_status *rx_status; bool beacon; }; void ieee80211_inform_bss(struct wiphy *wiphy, struct cfg80211_bss *cbss, const struct cfg80211_bss_ies *ies, void *data) { struct ieee80211_local *local = wiphy_priv(wiphy); struct inform_bss_update_data *update_data = data; struct ieee80211_bss *bss = (void *)cbss->priv; struct ieee80211_rx_status *rx_status; struct ieee802_11_elems *elems; int clen, srlen; /* This happens while joining an IBSS */ if (!update_data) return; elems = ieee802_11_parse_elems(ies->data, ies->len, false, NULL); if (!elems) return; rx_status = update_data->rx_status; if (update_data->beacon) bss->device_ts_beacon = rx_status->device_timestamp; else bss->device_ts_presp = rx_status->device_timestamp; if (elems->parse_error) { if (update_data->beacon) bss->corrupt_data |= IEEE80211_BSS_CORRUPT_BEACON; else bss->corrupt_data |= IEEE80211_BSS_CORRUPT_PROBE_RESP; } else { if (update_data->beacon) bss->corrupt_data &= ~IEEE80211_BSS_CORRUPT_BEACON; else bss->corrupt_data &= ~IEEE80211_BSS_CORRUPT_PROBE_RESP; } /* save the ERP value so that it is available at association time */ if (elems->erp_info && (!elems->parse_error || !(bss->valid_data & IEEE80211_BSS_VALID_ERP))) { bss->erp_value = elems->erp_info[0]; bss->has_erp_value = true; if (!elems->parse_error) bss->valid_data |= IEEE80211_BSS_VALID_ERP; } /* replace old supported rates if we get new values */ if (!elems->parse_error || !(bss->valid_data & IEEE80211_BSS_VALID_RATES)) { srlen = 0; if (elems->supp_rates) { clen = IEEE80211_MAX_SUPP_RATES; if (clen > elems->supp_rates_len) clen = elems->supp_rates_len; memcpy(bss->supp_rates, elems->supp_rates, clen); srlen += clen; } if (elems->ext_supp_rates) { clen = IEEE80211_MAX_SUPP_RATES - srlen; if (clen > elems->ext_supp_rates_len) clen = elems->ext_supp_rates_len; memcpy(bss->supp_rates + srlen, elems->ext_supp_rates, clen); srlen += clen; } if (srlen) { bss->supp_rates_len = srlen; if (!elems->parse_error) bss->valid_data |= IEEE80211_BSS_VALID_RATES; } } if (!elems->parse_error || !(bss->valid_data & IEEE80211_BSS_VALID_WMM)) { bss->wmm_used = elems->wmm_param || elems->wmm_info; bss->uapsd_supported = is_uapsd_supported(elems); if (!elems->parse_error) bss->valid_data |= IEEE80211_BSS_VALID_WMM; } if (update_data->beacon) { struct ieee80211_supported_band *sband = local->hw.wiphy->bands[rx_status->band]; if (!(rx_status->encoding == RX_ENC_HT) && !(rx_status->encoding == RX_ENC_VHT)) bss->beacon_rate = &sband->bitrates[rx_status->rate_idx]; } if (elems->vht_cap_elem) bss->vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); else bss->vht_cap_info = 0; kfree(elems); } struct ieee80211_bss * ieee80211_bss_info_update(struct ieee80211_local *local, struct ieee80211_rx_status *rx_status, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_channel *channel) { bool beacon = ieee80211_is_beacon(mgmt->frame_control) || ieee80211_is_s1g_beacon(mgmt->frame_control); struct cfg80211_bss *cbss; struct inform_bss_update_data update_data = { .rx_status = rx_status, .beacon = beacon, }; struct cfg80211_inform_bss bss_meta = { .boottime_ns = rx_status->boottime_ns, .drv_data = (void *)&update_data, }; bool signal_valid; struct ieee80211_sub_if_data *scan_sdata; if (rx_status->flag & RX_FLAG_NO_SIGNAL_VAL) bss_meta.signal = 0; /* invalid signal indication */ else if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) bss_meta.signal = rx_status->signal * 100; else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC)) bss_meta.signal = (rx_status->signal * 100) / local->hw.max_signal; bss_meta.chan = channel; rcu_read_lock(); scan_sdata = rcu_dereference(local->scan_sdata); if (scan_sdata && scan_sdata->vif.type == NL80211_IFTYPE_STATION && scan_sdata->vif.cfg.assoc && ieee80211_have_rx_timestamp(rx_status)) { struct ieee80211_bss_conf *link_conf = NULL; /* for an MLO connection, set the TSF data only in case we have * an indication on which of the links the frame was received */ if (ieee80211_vif_is_mld(&scan_sdata->vif)) { if (rx_status->link_valid) { s8 link_id = rx_status->link_id; link_conf = rcu_dereference(scan_sdata->vif.link_conf[link_id]); } } else { link_conf = &scan_sdata->vif.bss_conf; } if (link_conf) { bss_meta.parent_tsf = ieee80211_calculate_rx_timestamp(local, rx_status, len + FCS_LEN, 24); ether_addr_copy(bss_meta.parent_bssid, link_conf->bssid); } } rcu_read_unlock(); cbss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta, mgmt, len, GFP_ATOMIC); if (!cbss) return NULL; /* In case the signal is invalid update the status */ signal_valid = channel == cbss->channel; if (!signal_valid) rx_status->flag |= RX_FLAG_NO_SIGNAL_VAL; return (void *)cbss->priv; } static bool ieee80211_scan_accept_presp(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, u32 scan_flags, const u8 *da) { if (!sdata) return false; /* accept broadcast on 6 GHz and for OCE */ if (is_broadcast_ether_addr(da) && (channel->band == NL80211_BAND_6GHZ || scan_flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP)) return true; if (scan_flags & NL80211_SCAN_FLAG_RANDOM_ADDR) return true; return ether_addr_equal(da, sdata->vif.addr); } void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb); struct ieee80211_mgmt *mgmt = (void *)skb->data; struct ieee80211_bss *bss; struct ieee80211_channel *channel; size_t min_hdr_len = offsetof(struct ieee80211_mgmt, u.probe_resp.variable); if (!ieee80211_is_probe_resp(mgmt->frame_control) && !ieee80211_is_beacon(mgmt->frame_control) && !ieee80211_is_s1g_beacon(mgmt->frame_control)) return; if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_short_beacon.variable); else min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_beacon); } if (skb->len < min_hdr_len) return; if (test_and_clear_bit(SCAN_BEACON_WAIT, &local->scanning)) { /* * we were passive scanning because of radar/no-IR, but * the beacon/proberesp rx gives us an opportunity to upgrade * to active scan */ set_bit(SCAN_BEACON_DONE, &local->scanning); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); } channel = ieee80211_get_channel_khz(local->hw.wiphy, ieee80211_rx_status_to_khz(rx_status)); if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) return; if (ieee80211_is_probe_resp(mgmt->frame_control)) { struct ieee80211_sub_if_data *sdata1, *sdata2; struct cfg80211_scan_request *scan_req; struct cfg80211_sched_scan_request *sched_scan_req; u32 scan_req_flags = 0, sched_scan_req_flags = 0; sdata1 = rcu_dereference(local->scan_sdata); sdata2 = rcu_dereference(local->sched_scan_sdata); if (likely(!sdata1 && !sdata2)) return; scan_req = rcu_dereference(local->scan_req); sched_scan_req = rcu_dereference(local->sched_scan_req); if (scan_req) scan_req_flags = scan_req->flags; if (sched_scan_req) sched_scan_req_flags = sched_scan_req->flags; /* ignore ProbeResp to foreign address or non-bcast (OCE) * unless scanning with randomised address */ if (!ieee80211_scan_accept_presp(sdata1, channel, scan_req_flags, mgmt->da) && !ieee80211_scan_accept_presp(sdata2, channel, sched_scan_req_flags, mgmt->da)) return; } else { /* Beacons are expected only with broadcast address */ if (!is_broadcast_ether_addr(mgmt->da)) return; } /* Do not update the BSS table in case of only monitor interfaces */ if (local->open_count == local->monitors) return; bss = ieee80211_bss_info_update(local, rx_status, mgmt, skb->len, channel); if (bss) ieee80211_rx_bss_put(local, bss); } static void ieee80211_prepare_scan_chandef(struct cfg80211_chan_def *chandef) { memset(chandef, 0, sizeof(*chandef)); chandef->width = NL80211_CHAN_WIDTH_20_NOHT; } /* return false if no more work */ static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct cfg80211_scan_request *req; struct cfg80211_chan_def chandef; u8 bands_used = 0; int i, ielen; u32 *n_chans; u32 flags = 0; req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); if (test_bit(SCAN_HW_CANCELLED, &local->scanning)) return false; if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { local->hw_scan_req->req.n_channels = req->n_channels; for (i = 0; i < req->n_channels; i++) { local->hw_scan_req->req.channels[i] = req->channels[i]; bands_used |= BIT(req->channels[i]->band); } } else { do { if (local->hw_scan_band == NUM_NL80211_BANDS) return false; n_chans = &local->hw_scan_req->req.n_channels; *n_chans = 0; for (i = 0; i < req->n_channels; i++) { if (req->channels[i]->band != local->hw_scan_band) continue; local->hw_scan_req->req.channels[(*n_chans)++] = req->channels[i]; bands_used |= BIT(req->channels[i]->band); } local->hw_scan_band++; } while (!*n_chans); } ieee80211_prepare_scan_chandef(&chandef); if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; ielen = ieee80211_build_preq_ies(sdata, (u8 *)local->hw_scan_req->req.ie, local->hw_scan_ies_bufsize, &local->hw_scan_req->ies, req->ie, req->ie_len, bands_used, req->rates, &chandef, flags); if (ielen < 0) return false; local->hw_scan_req->req.ie_len = ielen; local->hw_scan_req->req.no_cck = req->no_cck; ether_addr_copy(local->hw_scan_req->req.mac_addr, req->mac_addr); ether_addr_copy(local->hw_scan_req->req.mac_addr_mask, req->mac_addr_mask); ether_addr_copy(local->hw_scan_req->req.bssid, req->bssid); return true; } static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) { struct ieee80211_local *local = hw_to_local(hw); bool hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning); bool was_scanning = local->scanning; struct cfg80211_scan_request *scan_req; struct ieee80211_sub_if_data *scan_sdata; struct ieee80211_sub_if_data *sdata; lockdep_assert_wiphy(local->hw.wiphy); /* * It's ok to abort a not-yet-running scan (that * we have one at all will be verified by checking * local->scan_req next), but not to complete it * successfully. */ if (WARN_ON(!local->scanning && !aborted)) aborted = true; if (WARN_ON(!local->scan_req)) return; scan_sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); if (hw_scan && !aborted && !ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS) && ieee80211_prep_hw_scan(scan_sdata)) { int rc; rc = drv_hw_scan(local, rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)), local->hw_scan_req); if (rc == 0) return; /* HW scan failed and is going to be reported as aborted, * so clear old scan info. */ memset(&local->scan_info, 0, sizeof(local->scan_info)); aborted = true; } kfree(local->hw_scan_req); local->hw_scan_req = NULL; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); RCU_INIT_POINTER(local->scan_req, NULL); RCU_INIT_POINTER(local->scan_sdata, NULL); local->scanning = 0; local->scan_chandef.chan = NULL; synchronize_rcu(); if (scan_req != local->int_scan_req) { local->scan_info.aborted = aborted; cfg80211_scan_done(scan_req, &local->scan_info); } /* Set power back to normal operating levels. */ ieee80211_hw_conf_chan(local); if (!hw_scan && was_scanning) { ieee80211_configure_filter(local); drv_sw_scan_complete(local, scan_sdata); ieee80211_offchannel_return(local); } ieee80211_recalc_idle(local); ieee80211_mlme_notify_scan_completed(local); ieee80211_ibss_notify_scan_completed(local); /* Requeue all the work that might have been ignored while * the scan was in progress; if there was none this will * just be a no-op for the particular interface. */ list_for_each_entry(sdata, &local->interfaces, list) { if (ieee80211_sdata_running(sdata)) wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } if (was_scanning) ieee80211_start_next_roc(local); } void ieee80211_scan_completed(struct ieee80211_hw *hw, struct cfg80211_scan_info *info) { struct ieee80211_local *local = hw_to_local(hw); trace_api_scan_completed(local, info->aborted); set_bit(SCAN_COMPLETED, &local->scanning); if (info->aborted) set_bit(SCAN_ABORTED, &local->scanning); memcpy(&local->scan_info, info, sizeof(*info)); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); } EXPORT_SYMBOL(ieee80211_scan_completed); static int ieee80211_start_sw_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { /* Software scan is not supported in multi-channel cases */ if (!local->emulate_chanctx) return -EOPNOTSUPP; /* * Hardware/driver doesn't support hw_scan, so use software * scanning instead. First send a nullfunc frame with power save * bit on so that AP will buffer the frames for us while we are not * listening, then send probe requests to each channel and wait for * the responses. After all channels are scanned, tune back to the * original channel and send a nullfunc frame with power save bit * off to trigger the AP to send us all the buffered frames. * * Note that while local->sw_scanning is true everything else but * nullfunc frames and probe requests will be dropped in * ieee80211_tx_h_check_assoc(). */ drv_sw_scan_start(local, sdata, local->scan_addr); local->leave_oper_channel_time = jiffies; local->next_scan_state = SCAN_DECISION; local->scan_channel_idx = 0; ieee80211_offchannel_stop_vifs(local); /* ensure nullfunc is transmitted before leaving operating channel */ ieee80211_flush_queues(local, NULL, false); ieee80211_configure_filter(local); /* We need to set power level at maximum rate for scanning. */ ieee80211_hw_conf_chan(local); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); return 0; } static bool __ieee80211_can_leave_ch(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *sdata_iter; unsigned int link_id; lockdep_assert_wiphy(local->hw.wiphy); if (!ieee80211_is_radar_required(local)) return true; if (!regulatory_pre_cac_allowed(local->hw.wiphy)) return false; list_for_each_entry(sdata_iter, &local->interfaces, list) { for_each_valid_link(&sdata_iter->wdev, link_id) if (sdata_iter->wdev.links[link_id].cac_started) return false; } return true; } static bool ieee80211_can_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { if (!__ieee80211_can_leave_ch(sdata)) return false; if (!list_empty(&local->roc_list)) return false; if (sdata->vif.type == NL80211_IFTYPE_STATION && sdata->u.mgd.flags & IEEE80211_STA_CONNECTION_POLL) return false; return true; } void ieee80211_run_deferred_scan(struct ieee80211_local *local) { lockdep_assert_wiphy(local->hw.wiphy); if (!local->scan_req || local->scanning) return; if (!ieee80211_can_scan(local, rcu_dereference_protected( local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)))) return; wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, round_jiffies_relative(0)); } static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, u32 ratemask, u32 flags, u32 tx_flags, struct ieee80211_channel *channel) { struct sk_buff *skb; skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel, ssid, ssid_len, ie, ie_len, flags); if (skb) { if (flags & IEEE80211_PROBE_FLAG_RANDOM_SN) { struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u16 sn = get_random_u16(); info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO; hdr->seq_ctrl = cpu_to_le16(IEEE80211_SN_TO_SEQ(sn)); } IEEE80211_SKB_CB(skb)->flags |= tx_flags; IEEE80211_SKB_CB(skb)->control.flags |= IEEE80211_TX_CTRL_DONT_USE_RATE_MASK; ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band); } } static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, unsigned long *next_delay) { int i; struct ieee80211_sub_if_data *sdata; struct cfg80211_scan_request *scan_req; enum nl80211_band band = local->hw.conf.chandef.chan->band; u32 flags = 0, tx_flags; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); tx_flags = IEEE80211_TX_INTFL_OFFCHAN_TX_OK; if (scan_req->no_cck) tx_flags |= IEEE80211_TX_CTL_NO_CCK_RATE; if (scan_req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; if (scan_req->flags & NL80211_SCAN_FLAG_RANDOM_SN) flags |= IEEE80211_PROBE_FLAG_RANDOM_SN; sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); for (i = 0; i < scan_req->n_ssids; i++) ieee80211_send_scan_probe_req( sdata, local->scan_addr, scan_req->bssid, scan_req->ssids[i].ssid, scan_req->ssids[i].ssid_len, scan_req->ie, scan_req->ie_len, scan_req->rates[band], flags, tx_flags, local->hw.conf.chandef.chan); /* * After sending probe requests, wait for probe responses * on the channel. */ *next_delay = msecs_to_jiffies(scan_req->duration) > IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME ? msecs_to_jiffies(scan_req->duration) - IEEE80211_PROBE_DELAY : IEEE80211_CHANNEL_TIME; local->next_scan_state = SCAN_DECISION; } static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req) { struct ieee80211_local *local = sdata->local; bool hw_scan = local->ops->hw_scan; int rc; lockdep_assert_wiphy(local->hw.wiphy); if (local->scan_req) return -EBUSY; /* For an MLO connection, if a link ID was specified, validate that it * is indeed active. */ if (ieee80211_vif_is_mld(&sdata->vif) && req->tsf_report_link_id >= 0 && !(sdata->vif.active_links & BIT(req->tsf_report_link_id))) return -EINVAL; if (!__ieee80211_can_leave_ch(sdata)) return -EBUSY; if (!ieee80211_can_scan(local, sdata)) { /* wait for the work to finish/time out */ rcu_assign_pointer(local->scan_req, req); rcu_assign_pointer(local->scan_sdata, sdata); return 0; } again: if (hw_scan) { u8 *ies; local->hw_scan_ies_bufsize = local->scan_ies_len + req->ie_len; if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { int i, n_bands = 0; u8 bands_counted = 0; for (i = 0; i < req->n_channels; i++) { if (bands_counted & BIT(req->channels[i]->band)) continue; bands_counted |= BIT(req->channels[i]->band); n_bands++; } local->hw_scan_ies_bufsize *= n_bands; } local->hw_scan_req = kmalloc(struct_size(local->hw_scan_req, req.channels, req->n_channels) + local->hw_scan_ies_bufsize, GFP_KERNEL); if (!local->hw_scan_req) return -ENOMEM; local->hw_scan_req->req.ssids = req->ssids; local->hw_scan_req->req.n_ssids = req->n_ssids; /* None of the channels are actually set * up but let UBSAN know the boundaries. */ local->hw_scan_req->req.n_channels = req->n_channels; ies = (u8 *)local->hw_scan_req + sizeof(*local->hw_scan_req) + req->n_channels * sizeof(req->channels[0]); local->hw_scan_req->req.ie = ies; local->hw_scan_req->req.flags = req->flags; eth_broadcast_addr(local->hw_scan_req->req.bssid); local->hw_scan_req->req.duration = req->duration; local->hw_scan_req->req.duration_mandatory = req->duration_mandatory; local->hw_scan_req->req.tsf_report_link_id = req->tsf_report_link_id; local->hw_scan_band = 0; local->hw_scan_req->req.n_6ghz_params = req->n_6ghz_params; local->hw_scan_req->req.scan_6ghz_params = req->scan_6ghz_params; local->hw_scan_req->req.scan_6ghz = req->scan_6ghz; /* * After allocating local->hw_scan_req, we must * go through until ieee80211_prep_hw_scan(), so * anything that might be changed here and leave * this function early must not go after this * allocation. */ } rcu_assign_pointer(local->scan_req, req); rcu_assign_pointer(local->scan_sdata, sdata); if (req->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) get_random_mask_addr(local->scan_addr, req->mac_addr, req->mac_addr_mask); else memcpy(local->scan_addr, sdata->vif.addr, ETH_ALEN); if (hw_scan) { __set_bit(SCAN_HW_SCANNING, &local->scanning); } else if ((req->n_channels == 1) && (req->channels[0] == local->hw.conf.chandef.chan)) { /* * If we are scanning only on the operating channel * then we do not need to stop normal activities */ unsigned long next_delay; __set_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning); ieee80211_recalc_idle(local); /* Notify driver scan is starting, keep order of operations * same as normal software scan, in case that matters. */ drv_sw_scan_start(local, sdata, local->scan_addr); ieee80211_configure_filter(local); /* accept probe-responses */ /* We need to ensure power level is at max for scanning. */ ieee80211_hw_conf_chan(local); if ((req->channels[0]->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) || !req->n_ssids) { next_delay = IEEE80211_PASSIVE_CHANNEL_TIME; if (req->n_ssids) set_bit(SCAN_BEACON_WAIT, &local->scanning); } else { ieee80211_scan_state_send_probe(local, &next_delay); next_delay = IEEE80211_CHANNEL_TIME; } /* Now, just wait a bit and we are all done! */ wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, next_delay); return 0; } else { /* Do normal software scan */ __set_bit(SCAN_SW_SCANNING, &local->scanning); } ieee80211_recalc_idle(local); if (hw_scan) { WARN_ON(!ieee80211_prep_hw_scan(sdata)); rc = drv_hw_scan(local, sdata, local->hw_scan_req); } else { rc = ieee80211_start_sw_scan(local, sdata); } if (rc) { kfree(local->hw_scan_req); local->hw_scan_req = NULL; local->scanning = 0; ieee80211_recalc_idle(local); local->scan_req = NULL; RCU_INIT_POINTER(local->scan_sdata, NULL); } if (hw_scan && rc == 1) { /* * we can't fall back to software for P2P-GO * as it must update NoA etc. */ if (ieee80211_vif_type_p2p(&sdata->vif) == NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; hw_scan = false; goto again; } return rc; } static unsigned long ieee80211_scan_get_channel_time(struct ieee80211_channel *chan) { /* * TODO: channel switching also consumes quite some time, * add that delay as well to get a better estimation */ if (chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) return IEEE80211_PASSIVE_CHANNEL_TIME; return IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME; } static void ieee80211_scan_state_decision(struct ieee80211_local *local, unsigned long *next_delay) { bool associated = false; bool tx_empty = true; bool bad_latency; struct ieee80211_sub_if_data *sdata; struct ieee80211_channel *next_chan; enum mac80211_scan_state next_scan_state; struct cfg80211_scan_request *scan_req; lockdep_assert_wiphy(local->hw.wiphy); /* * check if at least one STA interface is associated, * check if at least one STA interface has pending tx frames * and grab the lowest used beacon interval */ list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_STATION) { if (sdata->u.mgd.associated) { associated = true; if (!qdisc_all_tx_empty(sdata->dev)) { tx_empty = false; break; } } } } scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); next_chan = scan_req->channels[local->scan_channel_idx]; /* * we're currently scanning a different channel, let's * see if we can scan another channel without interfering * with the current traffic situation. * * Keep good latency, do not stay off-channel more than 125 ms. */ bad_latency = time_after(jiffies + ieee80211_scan_get_channel_time(next_chan), local->leave_oper_channel_time + HZ / 8); if (associated && !tx_empty) { if (scan_req->flags & NL80211_SCAN_FLAG_LOW_PRIORITY) next_scan_state = SCAN_ABORT; else next_scan_state = SCAN_SUSPEND; } else if (associated && bad_latency) { next_scan_state = SCAN_SUSPEND; } else { next_scan_state = SCAN_SET_CHANNEL; } local->next_scan_state = next_scan_state; *next_delay = 0; } static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, unsigned long *next_delay) { int skip; struct ieee80211_channel *chan; struct cfg80211_scan_request *scan_req; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); skip = 0; chan = scan_req->channels[local->scan_channel_idx]; local->scan_chandef.chan = chan; local->scan_chandef.center_freq1 = chan->center_freq; local->scan_chandef.freq1_offset = chan->freq_offset; local->scan_chandef.center_freq2 = 0; /* For scanning on the S1G band, detect the channel width according to * the channel being scanned. */ if (chan->band == NL80211_BAND_S1GHZ) { local->scan_chandef.width = ieee80211_s1g_channel_width(chan); goto set_channel; } /* If scanning on oper channel, use whatever channel-type * is currently in use. */ if (chan == local->hw.conf.chandef.chan) local->scan_chandef = local->hw.conf.chandef; else local->scan_chandef.width = NL80211_CHAN_WIDTH_20_NOHT; set_channel: if (ieee80211_hw_conf_chan(local)) skip = 1; /* advance state machine to next channel/band */ local->scan_channel_idx++; if (skip) { /* if we skip this channel return to the decision state */ local->next_scan_state = SCAN_DECISION; return; } /* * Probe delay is used to update the NAV, cf. 11.1.3.2.2 * (which unfortunately doesn't say _why_ step a) is done, * but it waits for the probe delay or until a frame is * received - and the received frame would update the NAV). * For now, we do not support waiting until a frame is * received. * * In any case, it is not necessary for a passive scan. */ if ((chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) || !scan_req->n_ssids) { *next_delay = max(msecs_to_jiffies(scan_req->duration), IEEE80211_PASSIVE_CHANNEL_TIME); local->next_scan_state = SCAN_DECISION; if (scan_req->n_ssids) set_bit(SCAN_BEACON_WAIT, &local->scanning); return; } /* active scan, send probes */ *next_delay = IEEE80211_PROBE_DELAY; local->next_scan_state = SCAN_SEND_PROBE; } static void ieee80211_scan_state_suspend(struct ieee80211_local *local, unsigned long *next_delay) { /* switch back to the operating channel */ local->scan_chandef.chan = NULL; ieee80211_hw_conf_chan(local); /* disable PS */ ieee80211_offchannel_return(local); *next_delay = HZ / 5; /* afterwards, resume scan & go to next channel */ local->next_scan_state = SCAN_RESUME; } static void ieee80211_scan_state_resume(struct ieee80211_local *local, unsigned long *next_delay) { ieee80211_offchannel_stop_vifs(local); if (local->ops->flush) { ieee80211_flush_queues(local, NULL, false); *next_delay = 0; } else *next_delay = HZ / 10; /* remember when we left the operating channel */ local->leave_oper_channel_time = jiffies; /* advance to the next channel to be scanned */ local->next_scan_state = SCAN_SET_CHANNEL; } void ieee80211_scan_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, scan_work.work); struct ieee80211_sub_if_data *sdata; struct cfg80211_scan_request *scan_req; unsigned long next_delay = 0; bool aborted; lockdep_assert_wiphy(local->hw.wiphy); if (!ieee80211_can_run_worker(local)) { aborted = true; goto out_complete; } sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); /* When scanning on-channel, the first-callback means completed. */ if (test_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning)) { aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning); goto out_complete; } if (test_and_clear_bit(SCAN_COMPLETED, &local->scanning)) { aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning); goto out_complete; } if (!sdata || !scan_req) return; if (!local->scanning) { int rc; RCU_INIT_POINTER(local->scan_req, NULL); RCU_INIT_POINTER(local->scan_sdata, NULL); rc = __ieee80211_start_scan(sdata, scan_req); if (!rc) return; /* need to complete scan in cfg80211 */ rcu_assign_pointer(local->scan_req, scan_req); aborted = true; goto out_complete; } clear_bit(SCAN_BEACON_WAIT, &local->scanning); /* * as long as no delay is required advance immediately * without scheduling a new work */ do { if (!ieee80211_sdata_running(sdata)) { aborted = true; goto out_complete; } if (test_and_clear_bit(SCAN_BEACON_DONE, &local->scanning) && local->next_scan_state == SCAN_DECISION) local->next_scan_state = SCAN_SEND_PROBE; switch (local->next_scan_state) { case SCAN_DECISION: /* if no more bands/channels left, complete scan */ if (local->scan_channel_idx >= scan_req->n_channels) { aborted = false; goto out_complete; } ieee80211_scan_state_decision(local, &next_delay); break; case SCAN_SET_CHANNEL: ieee80211_scan_state_set_channel(local, &next_delay); break; case SCAN_SEND_PROBE: ieee80211_scan_state_send_probe(local, &next_delay); break; case SCAN_SUSPEND: ieee80211_scan_state_suspend(local, &next_delay); break; case SCAN_RESUME: ieee80211_scan_state_resume(local, &next_delay); break; case SCAN_ABORT: aborted = true; goto out_complete; } } while (next_delay == 0); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, next_delay); return; out_complete: __ieee80211_scan_completed(&local->hw, aborted); } int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req) { lockdep_assert_wiphy(sdata->local->hw.wiphy); return __ieee80211_start_scan(sdata, req); } int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, const u8 *ssid, u8 ssid_len, struct ieee80211_channel **channels, unsigned int n_channels) { struct ieee80211_local *local = sdata->local; int i, n_ch = 0; enum nl80211_band band; lockdep_assert_wiphy(local->hw.wiphy); /* busy scanning */ if (local->scan_req) return -EBUSY; /* fill internal scan request */ if (!channels) { int max_n; for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!local->hw.wiphy->bands[band] || band == NL80211_BAND_6GHZ) continue; max_n = local->hw.wiphy->bands[band]->n_channels; for (i = 0; i < max_n; i++) { struct ieee80211_channel *tmp_ch = &local->hw.wiphy->bands[band]->channels[i]; if (tmp_ch->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_DISABLED) || !cfg80211_wdev_channel_allowed(&sdata->wdev, tmp_ch)) continue; local->int_scan_req->channels[n_ch] = tmp_ch; n_ch++; } } if (WARN_ON_ONCE(n_ch == 0)) return -EINVAL; local->int_scan_req->n_channels = n_ch; } else { for (i = 0; i < n_channels; i++) { if (channels[i]->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_DISABLED) || !cfg80211_wdev_channel_allowed(&sdata->wdev, channels[i])) continue; local->int_scan_req->channels[n_ch] = channels[i]; n_ch++; } if (n_ch == 0) return -EINVAL; local->int_scan_req->n_channels = n_ch; } local->int_scan_req->ssids = &local->scan_ssid; local->int_scan_req->n_ssids = 1; memcpy(local->int_scan_req->ssids[0].ssid, ssid, IEEE80211_MAX_SSID_LEN); local->int_scan_req->ssids[0].ssid_len = ssid_len; return __ieee80211_start_scan(sdata, sdata->local->int_scan_req); } void ieee80211_scan_cancel(struct ieee80211_local *local) { /* ensure a new scan cannot be queued */ lockdep_assert_wiphy(local->hw.wiphy); /* * We are canceling software scan, or deferred scan that was not * yet really started (see __ieee80211_start_scan ). * * Regarding hardware scan: * - we can not call __ieee80211_scan_completed() as when * SCAN_HW_SCANNING bit is set this function change * local->hw_scan_req to operate on 5G band, what race with * driver which can use local->hw_scan_req * * - we can not cancel scan_work since driver can schedule it * by ieee80211_scan_completed(..., true) to finish scan * * Hence we only call the cancel_hw_scan() callback, but the low-level * driver is still responsible for calling ieee80211_scan_completed() * after the scan was completed/aborted. */ if (!local->scan_req) return; /* * We have a scan running and the driver already reported completion, * but the worker hasn't run yet or is stuck on the mutex - mark it as * cancelled. */ if (test_bit(SCAN_HW_SCANNING, &local->scanning) && test_bit(SCAN_COMPLETED, &local->scanning)) { set_bit(SCAN_HW_CANCELLED, &local->scanning); return; } if (test_bit(SCAN_HW_SCANNING, &local->scanning)) { /* * Make sure that __ieee80211_scan_completed doesn't trigger a * scan on another band. */ set_bit(SCAN_HW_CANCELLED, &local->scanning); if (local->ops->cancel_hw_scan) drv_cancel_hw_scan(local, rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx))); return; } wiphy_delayed_work_cancel(local->hw.wiphy, &local->scan_work); /* and clean up */ memset(&local->scan_info, 0, sizeof(local->scan_info)); __ieee80211_scan_completed(&local->hw, true); } int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req) { struct ieee80211_local *local = sdata->local; struct ieee80211_scan_ies sched_scan_ies = {}; struct cfg80211_chan_def chandef; int ret, i, iebufsz, num_bands = 0; u32 rate_masks[NUM_NL80211_BANDS] = {}; u8 bands_used = 0; u8 *ie; u32 flags = 0; lockdep_assert_wiphy(local->hw.wiphy); iebufsz = local->scan_ies_len + req->ie_len; if (!local->ops->sched_scan_start) return -EOPNOTSUPP; for (i = 0; i < NUM_NL80211_BANDS; i++) { if (local->hw.wiphy->bands[i]) { bands_used |= BIT(i); rate_masks[i] = (u32) -1; num_bands++; } } if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; ie = kcalloc(iebufsz, num_bands, GFP_KERNEL); if (!ie) { ret = -ENOMEM; goto out; } ieee80211_prepare_scan_chandef(&chandef); ret = ieee80211_build_preq_ies(sdata, ie, num_bands * iebufsz, &sched_scan_ies, req->ie, req->ie_len, bands_used, rate_masks, &chandef, flags); if (ret < 0) goto error; ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies); if (ret == 0) { rcu_assign_pointer(local->sched_scan_sdata, sdata); rcu_assign_pointer(local->sched_scan_req, req); } error: kfree(ie); out: if (ret) { /* Clean in case of failure after HW restart or upon resume. */ RCU_INIT_POINTER(local->sched_scan_sdata, NULL); RCU_INIT_POINTER(local->sched_scan_req, NULL); } return ret; } int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req) { struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); if (rcu_access_pointer(local->sched_scan_sdata)) return -EBUSY; return __ieee80211_request_sched_scan_start(sdata, req); } int ieee80211_request_sched_scan_stop(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sched_scan_sdata; int ret = -ENOENT; lockdep_assert_wiphy(local->hw.wiphy); if (!local->ops->sched_scan_stop) return -EOPNOTSUPP; /* We don't want to restart sched scan anymore. */ RCU_INIT_POINTER(local->sched_scan_req, NULL); sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); if (sched_scan_sdata) { ret = drv_sched_scan_stop(local, sched_scan_sdata); if (!ret) RCU_INIT_POINTER(local->sched_scan_sdata, NULL); } return ret; } void ieee80211_sched_scan_results(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); trace_api_sched_scan_results(local); cfg80211_sched_scan_results(hw->wiphy, 0); } EXPORT_SYMBOL(ieee80211_sched_scan_results); void ieee80211_sched_scan_end(struct ieee80211_local *local) { lockdep_assert_wiphy(local->hw.wiphy); if (!rcu_access_pointer(local->sched_scan_sdata)) return; RCU_INIT_POINTER(local->sched_scan_sdata, NULL); /* If sched scan was aborted by the driver. */ RCU_INIT_POINTER(local->sched_scan_req, NULL); cfg80211_sched_scan_stopped_locked(local->hw.wiphy, 0); } void ieee80211_sched_scan_stopped_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, sched_scan_stopped_work); ieee80211_sched_scan_end(local); } void ieee80211_sched_scan_stopped(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); trace_api_sched_scan_stopped(local); /* * this shouldn't really happen, so for simplicity * simply ignore it, and let mac80211 reconfigure * the sched scan later on. */ if (local->in_reconfig) return; wiphy_work_queue(hw->wiphy, &local->sched_scan_stopped_work); } EXPORT_SYMBOL(ieee80211_sched_scan_stopped);
100 4 8 138 239 31 7 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCATTERLIST_H #define _LINUX_SCATTERLIST_H #include <linux/string.h> #include <linux/types.h> #include <linux/bug.h> #include <linux/mm.h> #include <asm/io.h> struct scatterlist { unsigned long page_link; unsigned int offset; unsigned int length; dma_addr_t dma_address; #ifdef CONFIG_NEED_SG_DMA_LENGTH unsigned int dma_length; #endif #ifdef CONFIG_NEED_SG_DMA_FLAGS unsigned int dma_flags; #endif }; /* * These macros should be used after a dma_map_sg call has been done * to get bus addresses of each of the SG entries and their lengths. * You should only work with the number of sg entries dma_map_sg * returns, or alternatively stop on the first sg_dma_len(sg) which * is 0. */ #define sg_dma_address(sg) ((sg)->dma_address) #ifdef CONFIG_NEED_SG_DMA_LENGTH #define sg_dma_len(sg) ((sg)->dma_length) #else #define sg_dma_len(sg) ((sg)->length) #endif struct sg_table { struct scatterlist *sgl; /* the list */ unsigned int nents; /* number of mapped entries */ unsigned int orig_nents; /* original size of list */ }; struct sg_append_table { struct sg_table sgt; /* The scatter list table */ struct scatterlist *prv; /* last populated sge in the table */ unsigned int total_nents; /* Total entries in the table */ }; /* * Notes on SG table design. * * We use the unsigned long page_link field in the scatterlist struct to place * the page pointer AND encode information about the sg table as well. The two * lower bits are reserved for this information. * * If bit 0 is set, then the page_link contains a pointer to the next sg * table list. Otherwise the next entry is at sg + 1. * * If bit 1 is set, then this sg entry is the last element in a list. * * See sg_next(). * */ #define SG_CHAIN 0x01UL #define SG_END 0x02UL /* * We overload the LSB of the page pointer to indicate whether it's * a valid sg entry, or whether it points to the start of a new scatterlist. * Those low bits are there for everyone! (thanks mason :-) */ #define SG_PAGE_LINK_MASK (SG_CHAIN | SG_END) static inline unsigned int __sg_flags(struct scatterlist *sg) { return sg->page_link & SG_PAGE_LINK_MASK; } static inline struct scatterlist *sg_chain_ptr(struct scatterlist *sg) { return (struct scatterlist *)(sg->page_link & ~SG_PAGE_LINK_MASK); } static inline bool sg_is_chain(struct scatterlist *sg) { return __sg_flags(sg) & SG_CHAIN; } static inline bool sg_is_last(struct scatterlist *sg) { return __sg_flags(sg) & SG_END; } /** * sg_assign_page - Assign a given page to an SG entry * @sg: SG entry * @page: The page * * Description: * Assign page to sg entry. Also see sg_set_page(), the most commonly used * variant. * **/ static inline void sg_assign_page(struct scatterlist *sg, struct page *page) { unsigned long page_link = sg->page_link & (SG_CHAIN | SG_END); /* * In order for the low bit stealing approach to work, pages * must be aligned at a 32-bit boundary as a minimum. */ BUG_ON((unsigned long)page & SG_PAGE_LINK_MASK); #ifdef CONFIG_DEBUG_SG BUG_ON(sg_is_chain(sg)); #endif sg->page_link = page_link | (unsigned long) page; } /** * sg_set_page - Set sg entry to point at given page * @sg: SG entry * @page: The page * @len: Length of data * @offset: Offset into page * * Description: * Use this function to set an sg entry pointing at a page, never assign * the page directly. We encode sg table information in the lower bits * of the page pointer. See sg_page() for looking up the page belonging * to an sg entry. * **/ static inline void sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, unsigned int offset) { sg_assign_page(sg, page); sg->offset = offset; sg->length = len; } /** * sg_set_folio - Set sg entry to point at given folio * @sg: SG entry * @folio: The folio * @len: Length of data * @offset: Offset into folio * * Description: * Use this function to set an sg entry pointing at a folio, never assign * the folio directly. We encode sg table information in the lower bits * of the folio pointer. See sg_page() for looking up the page belonging * to an sg entry. * **/ static inline void sg_set_folio(struct scatterlist *sg, struct folio *folio, size_t len, size_t offset) { WARN_ON_ONCE(len > UINT_MAX); WARN_ON_ONCE(offset > UINT_MAX); sg_assign_page(sg, &folio->page); sg->offset = offset; sg->length = len; } static inline struct page *sg_page(struct scatterlist *sg) { #ifdef CONFIG_DEBUG_SG BUG_ON(sg_is_chain(sg)); #endif return (struct page *)((sg)->page_link & ~SG_PAGE_LINK_MASK); } /** * sg_set_buf - Set sg entry to point at given data * @sg: SG entry * @buf: Data * @buflen: Data length * **/ static inline void sg_set_buf(struct scatterlist *sg, const void *buf, unsigned int buflen) { #ifdef CONFIG_DEBUG_SG BUG_ON(!virt_addr_valid(buf)); #endif sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); } /* * Loop over each sg element, following the pointer to a new list if necessary */ #define for_each_sg(sglist, sg, nr, __i) \ for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg)) /* * Loop over each sg element in the given sg_table object. */ #define for_each_sgtable_sg(sgt, sg, i) \ for_each_sg((sgt)->sgl, sg, (sgt)->orig_nents, i) /* * Loop over each sg element in the given *DMA mapped* sg_table object. * Please use sg_dma_address(sg) and sg_dma_len(sg) to extract DMA addresses * of the each element. */ #define for_each_sgtable_dma_sg(sgt, sg, i) \ for_each_sg((sgt)->sgl, sg, (sgt)->nents, i) static inline void __sg_chain(struct scatterlist *chain_sg, struct scatterlist *sgl) { /* * offset and length are unused for chain entry. Clear them. */ chain_sg->offset = 0; chain_sg->length = 0; /* * Set lowest bit to indicate a link pointer, and make sure to clear * the termination bit if it happens to be set. */ chain_sg->page_link = ((unsigned long) sgl | SG_CHAIN) & ~SG_END; } /** * sg_chain - Chain two sglists together * @prv: First scatterlist * @prv_nents: Number of entries in prv * @sgl: Second scatterlist * * Description: * Links @prv@ and @sgl@ together, to form a longer scatterlist. * **/ static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, struct scatterlist *sgl) { __sg_chain(&prv[prv_nents - 1], sgl); } /** * sg_mark_end - Mark the end of the scatterlist * @sg: SG entryScatterlist * * Description: * Marks the passed in sg entry as the termination point for the sg * table. A call to sg_next() on this entry will return NULL. * **/ static inline void sg_mark_end(struct scatterlist *sg) { /* * Set termination bit, clear potential chain bit */ sg->page_link |= SG_END; sg->page_link &= ~SG_CHAIN; } /** * sg_unmark_end - Undo setting the end of the scatterlist * @sg: SG entryScatterlist * * Description: * Removes the termination marker from the given entry of the scatterlist. * **/ static inline void sg_unmark_end(struct scatterlist *sg) { sg->page_link &= ~SG_END; } /* * On 64-bit architectures there is a 4-byte padding in struct scatterlist * (assuming also CONFIG_NEED_SG_DMA_LENGTH is set). Use this padding for DMA * flags bits to indicate when a specific dma address is a bus address or the * buffer may have been bounced via SWIOTLB. */ #ifdef CONFIG_NEED_SG_DMA_FLAGS #define SG_DMA_BUS_ADDRESS (1 << 0) #define SG_DMA_SWIOTLB (1 << 1) /** * sg_dma_is_bus_address - Return whether a given segment was marked * as a bus address * @sg: SG entry * * Description: * Returns true if sg_dma_mark_bus_address() has been called on * this segment. **/ static inline bool sg_dma_is_bus_address(struct scatterlist *sg) { return sg->dma_flags & SG_DMA_BUS_ADDRESS; } /** * sg_dma_mark_bus_address - Mark the scatterlist entry as a bus address * @sg: SG entry * * Description: * Marks the passed in sg entry to indicate that the dma_address is * a bus address and doesn't need to be unmapped. This should only be * used by dma_map_sg() implementations to mark bus addresses * so they can be properly cleaned up in dma_unmap_sg(). **/ static inline void sg_dma_mark_bus_address(struct scatterlist *sg) { sg->dma_flags |= SG_DMA_BUS_ADDRESS; } /** * sg_dma_unmark_bus_address - Unmark the scatterlist entry as a bus address * @sg: SG entry * * Description: * Clears the bus address mark. **/ static inline void sg_dma_unmark_bus_address(struct scatterlist *sg) { sg->dma_flags &= ~SG_DMA_BUS_ADDRESS; } /** * sg_dma_is_swiotlb - Return whether the scatterlist was marked for SWIOTLB * bouncing * @sg: SG entry * * Description: * Returns true if the scatterlist was marked for SWIOTLB bouncing. Not all * elements may have been bounced, so the caller would have to check * individual SG entries with swiotlb_find_pool(). */ static inline bool sg_dma_is_swiotlb(struct scatterlist *sg) { return sg->dma_flags & SG_DMA_SWIOTLB; } /** * sg_dma_mark_swiotlb - Mark the scatterlist for SWIOTLB bouncing * @sg: SG entry * * Description: * Marks a a scatterlist for SWIOTLB bounce. Not all SG entries may be * bounced. */ static inline void sg_dma_mark_swiotlb(struct scatterlist *sg) { sg->dma_flags |= SG_DMA_SWIOTLB; } #else static inline bool sg_dma_is_bus_address(struct scatterlist *sg) { return false; } static inline void sg_dma_mark_bus_address(struct scatterlist *sg) { } static inline void sg_dma_unmark_bus_address(struct scatterlist *sg) { } static inline bool sg_dma_is_swiotlb(struct scatterlist *sg) { return false; } static inline void sg_dma_mark_swiotlb(struct scatterlist *sg) { } #endif /* CONFIG_NEED_SG_DMA_FLAGS */ /** * sg_phys - Return physical address of an sg entry * @sg: SG entry * * Description: * This calls page_to_phys() on the page in this sg entry, and adds the * sg offset. The caller must know that it is legal to call page_to_phys() * on the sg page. * **/ static inline dma_addr_t sg_phys(struct scatterlist *sg) { return page_to_phys(sg_page(sg)) + sg->offset; } /** * sg_virt - Return virtual address of an sg entry * @sg: SG entry * * Description: * This calls page_address() on the page in this sg entry, and adds the * sg offset. The caller must know that the sg page has a valid virtual * mapping. * **/ static inline void *sg_virt(struct scatterlist *sg) { return page_address(sg_page(sg)) + sg->offset; } /** * sg_init_marker - Initialize markers in sg table * @sgl: The SG table * @nents: Number of entries in table * **/ static inline void sg_init_marker(struct scatterlist *sgl, unsigned int nents) { sg_mark_end(&sgl[nents - 1]); } int sg_nents(struct scatterlist *sg); int sg_nents_for_len(struct scatterlist *sg, u64 len); struct scatterlist *sg_next(struct scatterlist *); struct scatterlist *sg_last(struct scatterlist *s, unsigned int); void sg_init_table(struct scatterlist *, unsigned int); void sg_init_one(struct scatterlist *, const void *, unsigned int); int sg_split(struct scatterlist *in, const int in_mapped_nents, const off_t skip, const int nb_splits, const size_t *split_sizes, struct scatterlist **out, int *out_mapped_nents, gfp_t gfp_mask); typedef struct scatterlist *(sg_alloc_fn)(unsigned int, gfp_t); typedef void (sg_free_fn)(struct scatterlist *, unsigned int); void __sg_free_table(struct sg_table *, unsigned int, unsigned int, sg_free_fn *, unsigned int); void sg_free_table(struct sg_table *); void sg_free_append_table(struct sg_append_table *sgt); int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, struct scatterlist *, unsigned int, gfp_t, sg_alloc_fn *); int sg_alloc_table(struct sg_table *, unsigned int, gfp_t); int sg_alloc_append_table_from_pages(struct sg_append_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, unsigned int left_pages, gfp_t gfp_mask); int sg_alloc_table_from_pages_segment(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, gfp_t gfp_mask); /** * sg_alloc_table_from_pages - Allocate and initialize an sg table from * an array of pages * @sgt: The sg table header to use * @pages: Pointer to an array of page pointers * @n_pages: Number of pages in the pages array * @offset: Offset from start of the first page to the start of a buffer * @size: Number of valid bytes in the buffer (after offset) * @gfp_mask: GFP allocation mask * * Description: * Allocate and initialize an sg table from a list of pages. Contiguous * ranges of the pages are squashed into a single scatterlist node. A user * may provide an offset at a start and a size of valid data in a buffer * specified by the page array. The returned sg table is released by * sg_free_table. * * Returns: * 0 on success, negative error on failure */ static inline int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, gfp_t gfp_mask) { return sg_alloc_table_from_pages_segment(sgt, pages, n_pages, offset, size, UINT_MAX, gfp_mask); } #ifdef CONFIG_SGL_ALLOC struct scatterlist *sgl_alloc_order(unsigned long long length, unsigned int order, bool chainable, gfp_t gfp, unsigned int *nent_p); struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, unsigned int *nent_p); void sgl_free_n_order(struct scatterlist *sgl, int nents, int order); void sgl_free_order(struct scatterlist *sgl, int order); void sgl_free(struct scatterlist *sgl); #endif /* CONFIG_SGL_ALLOC */ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip, bool to_buffer); size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen); size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen); size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen, off_t skip); size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip); size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents, size_t buflen, off_t skip); /* * Maximum number of entries that will be allocated in one piece, if * a list larger than this is required then chaining will be utilized. */ #define SG_MAX_SINGLE_ALLOC (PAGE_SIZE / sizeof(struct scatterlist)) /* * The maximum number of SG segments that we will put inside a * scatterlist (unless chaining is used). Should ideally fit inside a * single page, to avoid a higher order allocation. We could define this * to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order. The * minimum value is 32 */ #define SG_CHUNK_SIZE 128 /* * Like SG_CHUNK_SIZE, but for archs that have sg chaining. This limit * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. */ #ifdef CONFIG_ARCH_NO_SG_CHAIN #define SG_MAX_SEGMENTS SG_CHUNK_SIZE #else #define SG_MAX_SEGMENTS 2048 #endif #ifdef CONFIG_SG_POOL void sg_free_table_chained(struct sg_table *table, unsigned nents_first_chunk); int sg_alloc_table_chained(struct sg_table *table, int nents, struct scatterlist *first_chunk, unsigned nents_first_chunk); #endif /* * sg page iterator * * Iterates over sg entries page-by-page. On each successful iteration, you * can call sg_page_iter_page(@piter) to get the current page. * @piter->sg will point to the sg holding this page and @piter->sg_pgoffset to * the page's page offset within the sg. The iteration will stop either when a * maximum number of sg entries was reached or a terminating sg * (sg_last(sg) == true) was reached. */ struct sg_page_iter { struct scatterlist *sg; /* sg holding the page */ unsigned int sg_pgoffset; /* page offset within the sg */ /* these are internal states, keep away */ unsigned int __nents; /* remaining sg entries */ int __pg_advance; /* nr pages to advance at the * next step */ }; /* * sg page iterator for DMA addresses * * This is the same as sg_page_iter however you can call * sg_page_iter_dma_address(@dma_iter) to get the page's DMA * address. sg_page_iter_page() cannot be called on this iterator. */ struct sg_dma_page_iter { struct sg_page_iter base; }; bool __sg_page_iter_next(struct sg_page_iter *piter); bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter); void __sg_page_iter_start(struct sg_page_iter *piter, struct scatterlist *sglist, unsigned int nents, unsigned long pgoffset); /** * sg_page_iter_page - get the current page held by the page iterator * @piter: page iterator holding the page */ static inline struct page *sg_page_iter_page(struct sg_page_iter *piter) { return nth_page(sg_page(piter->sg), piter->sg_pgoffset); } /** * sg_page_iter_dma_address - get the dma address of the current page held by * the page iterator. * @dma_iter: page iterator holding the page */ static inline dma_addr_t sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) { return sg_dma_address(dma_iter->base.sg) + (dma_iter->base.sg_pgoffset << PAGE_SHIFT); } /** * for_each_sg_page - iterate over the pages of the given sg list * @sglist: sglist to iterate over * @piter: page iterator to hold current page, sg, sg_pgoffset * @nents: maximum number of sg entries to iterate over * @pgoffset: starting page offset (in pages) * * Callers may use sg_page_iter_page() to get each page pointer. * In each loop it operates on PAGE_SIZE unit. */ #define for_each_sg_page(sglist, piter, nents, pgoffset) \ for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \ __sg_page_iter_next(piter);) /** * for_each_sg_dma_page - iterate over the pages of the given sg list * @sglist: sglist to iterate over * @dma_iter: DMA page iterator to hold current page * @dma_nents: maximum number of sg entries to iterate over, this is the value * returned from dma_map_sg * @pgoffset: starting page offset (in pages) * * Callers may use sg_page_iter_dma_address() to get each page's DMA address. * In each loop it operates on PAGE_SIZE unit. */ #define for_each_sg_dma_page(sglist, dma_iter, dma_nents, pgoffset) \ for (__sg_page_iter_start(&(dma_iter)->base, sglist, dma_nents, \ pgoffset); \ __sg_page_iter_dma_next(dma_iter);) /** * for_each_sgtable_page - iterate over all pages in the sg_table object * @sgt: sg_table object to iterate over * @piter: page iterator to hold current page * @pgoffset: starting page offset (in pages) * * Iterates over the all memory pages in the buffer described by * a scatterlist stored in the given sg_table object. * See also for_each_sg_page(). In each loop it operates on PAGE_SIZE unit. */ #define for_each_sgtable_page(sgt, piter, pgoffset) \ for_each_sg_page((sgt)->sgl, piter, (sgt)->orig_nents, pgoffset) /** * for_each_sgtable_dma_page - iterate over the DMA mapped sg_table object * @sgt: sg_table object to iterate over * @dma_iter: DMA page iterator to hold current page * @pgoffset: starting page offset (in pages) * * Iterates over the all DMA mapped pages in the buffer described by * a scatterlist stored in the given sg_table object. * See also for_each_sg_dma_page(). In each loop it operates on PAGE_SIZE * unit. */ #define for_each_sgtable_dma_page(sgt, dma_iter, pgoffset) \ for_each_sg_dma_page((sgt)->sgl, dma_iter, (sgt)->nents, pgoffset) /* * Mapping sg iterator * * Iterates over sg entries mapping page-by-page. On each successful * iteration, @miter->page points to the mapped page and * @miter->length bytes of data can be accessed at @miter->addr. As * long as an iteration is enclosed between start and stop, the user * is free to choose control structure and when to stop. * * @miter->consumed is set to @miter->length on each iteration. It * can be adjusted if the user can't consume all the bytes in one go. * Also, a stopped iteration can be resumed by calling next on it. * This is useful when iteration needs to release all resources and * continue later (e.g. at the next interrupt). */ #define SG_MITER_ATOMIC (1 << 0) /* use kmap_atomic */ #define SG_MITER_TO_SG (1 << 1) /* flush back to phys on unmap */ #define SG_MITER_FROM_SG (1 << 2) /* nop */ struct sg_mapping_iter { /* the following three fields can be accessed directly */ struct page *page; /* currently mapped page */ void *addr; /* pointer to the mapped area */ size_t length; /* length of the mapped area */ size_t consumed; /* number of consumed bytes */ struct sg_page_iter piter; /* page iterator */ /* these are internal states, keep away */ unsigned int __offset; /* offset within page */ unsigned int __remaining; /* remaining bytes on page */ unsigned int __flags; }; void sg_miter_start(struct sg_mapping_iter *miter, struct scatterlist *sgl, unsigned int nents, unsigned int flags); bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset); bool sg_miter_next(struct sg_mapping_iter *miter); void sg_miter_stop(struct sg_mapping_iter *miter); #endif /* _LINUX_SCATTERLIST_H */
14 190 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_NULLS_H #define _LINUX_LIST_NULLS_H #include <linux/poison.h> #include <linux/const.h> /* * Special version of lists, where end of list is not a NULL pointer, * but a 'nulls' marker, which can have many different values. * (up to 2^31 different values guaranteed on all platforms) * * In the standard hlist, termination of a list is the NULL pointer. * In this special 'nulls' variant, we use the fact that objects stored in * a list are aligned on a word (4 or 8 bytes alignment). * We therefore use the last significant bit of 'ptr' : * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1) * Set to 0 : This is a pointer to some object (ptr) */ struct hlist_nulls_head { struct hlist_nulls_node *first; }; struct hlist_nulls_node { struct hlist_nulls_node *next, **pprev; }; #define NULLS_MARKER(value) (1UL | (((long)value) << 1)) #define INIT_HLIST_NULLS_HEAD(ptr, nulls) \ ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls)) #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_nulls_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \ }) /** * ptr_is_a_nulls - Test if a ptr is a nulls * @ptr: ptr to be tested * */ static inline int is_a_nulls(const struct hlist_nulls_node *ptr) { return ((unsigned long)ptr & 1); } /** * get_nulls_value - Get the 'nulls' value of the end of chain * @ptr: end of chain * * Should be called only if is_a_nulls(ptr); */ static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr) { return ((unsigned long)ptr) >> 1; } /** * hlist_nulls_unhashed - Has node been removed and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed state. * For example, hlist_del_init_rcu() leaves the node in unhashed state, * but hlist_nulls_del() does not. */ static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h) { return !h->pprev; } /** * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed state. * For example, hlist_del_init_rcu() leaves the node in unhashed state, * but hlist_nulls_del() does not. Unlike hlist_nulls_unhashed(), this * function may be used locklessly. */ static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h) { return !READ_ONCE(h->pprev); } static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) { return is_a_nulls(READ_ONCE(h->first)); } static inline void hlist_nulls_add_head(struct hlist_nulls_node *n, struct hlist_nulls_head *h) { struct hlist_nulls_node *first = h->first; n->next = first; WRITE_ONCE(n->pprev, &h->first); h->first = n; if (!is_a_nulls(first)) WRITE_ONCE(first->pprev, &n->next); } static inline void __hlist_nulls_del(struct hlist_nulls_node *n) { struct hlist_nulls_node *next = n->next; struct hlist_nulls_node **pprev = n->pprev; WRITE_ONCE(*pprev, next); if (!is_a_nulls(next)) WRITE_ONCE(next->pprev, pprev); } static inline void hlist_nulls_del(struct hlist_nulls_node *n) { __hlist_nulls_del(n); WRITE_ONCE(n->pprev, LIST_POISON2); } /** * hlist_nulls_for_each_entry - iterate over list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * */ #define hlist_nulls_for_each_entry(tpos, pos, head, member) \ for (pos = (head)->first; \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) /** * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @member: the name of the hlist_node within the struct. * */ #define hlist_nulls_for_each_entry_from(tpos, pos, member) \ for (; (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) #endif
41 10 6 1 5 46 17 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM xdp #if !defined(_TRACE_XDP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_XDP_H #include <linux/netdevice.h> #include <linux/filter.h> #include <linux/tracepoint.h> #include <linux/bpf.h> #include <net/xdp.h> #define __XDP_ACT_MAP(FN) \ FN(ABORTED) \ FN(DROP) \ FN(PASS) \ FN(TX) \ FN(REDIRECT) #define __XDP_ACT_TP_FN(x) \ TRACE_DEFINE_ENUM(XDP_##x); #define __XDP_ACT_SYM_FN(x) \ { XDP_##x, #x }, #define __XDP_ACT_SYM_TAB \ __XDP_ACT_MAP(__XDP_ACT_SYM_FN) { -1, NULL } __XDP_ACT_MAP(__XDP_ACT_TP_FN) TRACE_EVENT(xdp_exception, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, u32 act), TP_ARGS(dev, xdp, act), TP_STRUCT__entry( __field(int, prog_id) __field(u32, act) __field(int, ifindex) ), TP_fast_assign( __entry->prog_id = xdp->aux->id; __entry->act = act; __entry->ifindex = dev->ifindex; ), TP_printk("prog_id=%d action=%s ifindex=%d", __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex) ); TRACE_EVENT(xdp_bulk_tx, TP_PROTO(const struct net_device *dev, int sent, int drops, int err), TP_ARGS(dev, sent, drops, err), TP_STRUCT__entry( __field(int, ifindex) __field(u32, act) __field(int, drops) __field(int, sent) __field(int, err) ), TP_fast_assign( __entry->ifindex = dev->ifindex; __entry->act = XDP_TX; __entry->drops = drops; __entry->sent = sent; __entry->err = err; ), TP_printk("ifindex=%d action=%s sent=%d drops=%d err=%d", __entry->ifindex, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, __entry->err) ); #ifndef __DEVMAP_OBJ_TYPE #define __DEVMAP_OBJ_TYPE struct _bpf_dtab_netdev { struct net_device *dev; }; #endif /* __DEVMAP_OBJ_TYPE */ DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index), TP_STRUCT__entry( __field(int, prog_id) __field(u32, act) __field(int, ifindex) __field(int, err) __field(int, to_ifindex) __field(u32, map_id) __field(int, map_index) ), TP_fast_assign( u32 ifindex = 0, map_index = index; if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) { /* Just leave to_ifindex to 0 if do broadcast redirect, * as tgt will be NULL. */ if (tgt) ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex; } else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { ifindex = index; map_index = 0; } __entry->prog_id = xdp->aux->id; __entry->act = XDP_REDIRECT; __entry->ifindex = dev->ifindex; __entry->err = err; __entry->to_ifindex = ifindex; __entry->map_id = map_id; __entry->map_index = map_index; ), TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" " map_id=%d map_index=%d", __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex, __entry->to_ifindex, __entry->err, __entry->map_id, __entry->map_index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); #define _trace_xdp_redirect(dev, xdp, to) \ trace_xdp_redirect(dev, xdp, NULL, 0, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) #define _trace_xdp_redirect_err(dev, xdp, to, err) \ trace_xdp_redirect_err(dev, xdp, NULL, err, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) #define _trace_xdp_redirect_map(dev, xdp, to, map_type, map_id, index) \ trace_xdp_redirect(dev, xdp, to, 0, map_type, map_id, index) #define _trace_xdp_redirect_map_err(dev, xdp, to, map_type, map_id, index, err) \ trace_xdp_redirect_err(dev, xdp, to, err, map_type, map_id, index) /* not used anymore, but kept around so as not to break old programs */ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); TRACE_EVENT(xdp_cpumap_kthread, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, int sched, struct xdp_cpumap_stats *xdp_stats), TP_ARGS(map_id, processed, drops, sched, xdp_stats), TP_STRUCT__entry( __field(int, map_id) __field(u32, act) __field(int, cpu) __field(unsigned int, drops) __field(unsigned int, processed) __field(int, sched) __field(unsigned int, xdp_pass) __field(unsigned int, xdp_drop) __field(unsigned int, xdp_redirect) ), TP_fast_assign( __entry->map_id = map_id; __entry->act = XDP_REDIRECT; __entry->cpu = smp_processor_id(); __entry->drops = drops; __entry->processed = processed; __entry->sched = sched; __entry->xdp_pass = xdp_stats->pass; __entry->xdp_drop = xdp_stats->drop; __entry->xdp_redirect = xdp_stats->redirect; ), TP_printk("kthread" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" " sched=%d" " xdp_pass=%u xdp_drop=%u xdp_redirect=%u", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, __entry->sched, __entry->xdp_pass, __entry->xdp_drop, __entry->xdp_redirect) ); TRACE_EVENT(xdp_cpumap_enqueue, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, int to_cpu), TP_ARGS(map_id, processed, drops, to_cpu), TP_STRUCT__entry( __field(int, map_id) __field(u32, act) __field(int, cpu) __field(unsigned int, drops) __field(unsigned int, processed) __field(int, to_cpu) ), TP_fast_assign( __entry->map_id = map_id; __entry->act = XDP_REDIRECT; __entry->cpu = smp_processor_id(); __entry->drops = drops; __entry->processed = processed; __entry->to_cpu = to_cpu; ), TP_printk("enqueue" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" " to_cpu=%d", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, __entry->to_cpu) ); TRACE_EVENT(xdp_devmap_xmit, TP_PROTO(const struct net_device *from_dev, const struct net_device *to_dev, int sent, int drops, int err), TP_ARGS(from_dev, to_dev, sent, drops, err), TP_STRUCT__entry( __field(int, from_ifindex) __field(u32, act) __field(int, to_ifindex) __field(int, drops) __field(int, sent) __field(int, err) ), TP_fast_assign( __entry->from_ifindex = from_dev->ifindex; __entry->act = XDP_REDIRECT; __entry->to_ifindex = to_dev->ifindex; __entry->drops = drops; __entry->sent = sent; __entry->err = err; ), TP_printk("ndo_xdp_xmit" " from_ifindex=%d to_ifindex=%d action=%s" " sent=%d drops=%d" " err=%d", __entry->from_ifindex, __entry->to_ifindex, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, __entry->err) ); /* Expect users already include <net/xdp.h>, but not xdp_priv.h */ #include <net/xdp_priv.h> #define __MEM_TYPE_MAP(FN) \ FN(PAGE_SHARED) \ FN(PAGE_ORDER0) \ FN(PAGE_POOL) \ FN(XSK_BUFF_POOL) #define __MEM_TYPE_TP_FN(x) \ TRACE_DEFINE_ENUM(MEM_TYPE_##x); #define __MEM_TYPE_SYM_FN(x) \ { MEM_TYPE_##x, #x }, #define __MEM_TYPE_SYM_TAB \ __MEM_TYPE_MAP(__MEM_TYPE_SYM_FN) { -1, 0 } __MEM_TYPE_MAP(__MEM_TYPE_TP_FN) TRACE_EVENT(mem_disconnect, TP_PROTO(const struct xdp_mem_allocator *xa), TP_ARGS(xa), TP_STRUCT__entry( __field(const struct xdp_mem_allocator *, xa) __field(u32, mem_id) __field(u32, mem_type) __field(const void *, allocator) ), TP_fast_assign( __entry->xa = xa; __entry->mem_id = xa->mem.id; __entry->mem_type = xa->mem.type; __entry->allocator = xa->allocator; ), TP_printk("mem_id=%d mem_type=%s allocator=%p", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->allocator ) ); TRACE_EVENT(mem_connect, TP_PROTO(const struct xdp_mem_allocator *xa, const struct xdp_rxq_info *rxq), TP_ARGS(xa, rxq), TP_STRUCT__entry( __field(const struct xdp_mem_allocator *, xa) __field(u32, mem_id) __field(u32, mem_type) __field(const void *, allocator) __field(const struct xdp_rxq_info *, rxq) __field(int, ifindex) ), TP_fast_assign( __entry->xa = xa; __entry->mem_id = xa->mem.id; __entry->mem_type = xa->mem.type; __entry->allocator = xa->allocator; __entry->rxq = rxq; __entry->ifindex = rxq->dev->ifindex; ), TP_printk("mem_id=%d mem_type=%s allocator=%p" " ifindex=%d", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->allocator, __entry->ifindex ) ); TRACE_EVENT(mem_return_failed, TP_PROTO(const struct xdp_mem_info *mem, const struct page *page), TP_ARGS(mem, page), TP_STRUCT__entry( __field(const struct page *, page) __field(u32, mem_id) __field(u32, mem_type) ), TP_fast_assign( __entry->page = page; __entry->mem_id = mem->id; __entry->mem_type = mem->type; ), TP_printk("mem_id=%d mem_type=%s page=%p", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->page ) ); TRACE_EVENT(bpf_xdp_link_attach_failed, TP_PROTO(const char *msg), TP_ARGS(msg), TP_STRUCT__entry( __string(msg, msg) ), TP_fast_assign( __assign_str(msg); ), TP_printk("errmsg=%s", __get_str(msg)) ); #endif /* _TRACE_XDP_H */ #include <trace/define_trace.h>
2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 // SPDX-License-Identifier: GPL-2.0-or-later /* * Handle firewalling * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> * Bart De Schuymer <bdschuym@pandora.be> * * Lennert dedicates this file to Kerstin Wurdinger. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/if_pppox.h> #include <linux/ppp_defs.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_arp.h> #include <linux/in_route.h> #include <linux/inetdevice.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/addrconf.h> #include <net/route.h> #include <net/netfilter/br_netfilter.h> #include <linux/uaccess.h> #include "br_private.h" #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif int br_validate_ipv6(struct net *net, struct sk_buff *skb) { const struct ipv6hdr *hdr; struct inet6_dev *idev = __in6_dev_get(skb->dev); u32 pkt_len; u8 ip6h_len = sizeof(struct ipv6hdr); if (!pskb_may_pull(skb, ip6h_len)) goto inhdr_error; if (skb->len < ip6h_len) goto drop; hdr = ipv6_hdr(skb); if (hdr->version != 6) goto inhdr_error; pkt_len = ntohs(hdr->payload_len); if (hdr->nexthdr == NEXTHDR_HOP && nf_ip6_check_hbh_len(skb, &pkt_len)) goto drop; if (pkt_len + ip6h_len > skb->len) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); /* No IP options in IPv6 header; however it should be * checked if some next headers need special treatment */ return 0; inhdr_error: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); drop: return -1; } static inline bool br_nf_ipv6_daddr_was_changed(const struct sk_buff *skb, const struct nf_bridge_info *nf_bridge) { return memcmp(&nf_bridge->ipv6_daddr, &ipv6_hdr(skb)->daddr, sizeof(ipv6_hdr(skb)->daddr)) != 0; } /* PF_BRIDGE/PRE_ROUTING: Undo the changes made for ip6tables * PREROUTING and continue the bridge PRE_ROUTING hook. See comment * for br_nf_pre_routing_finish(), same logic is used here but * equivalent IPv6 function ip6_route_input() called indirectly. */ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct rtable *rt; struct net_device *dev = skb->dev, *br_indev; const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); br_indev = nf_bridge_get_physindev(skb, net); if (!br_indev) { kfree_skb(skb); return 0; } nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } nf_bridge->in_prerouting = 0; if (br_nf_ipv6_daddr_was_changed(skb, nf_bridge)) { skb_dst_drop(skb); v6ops->route_input(skb); if (skb_dst(skb)->error) { kfree_skb(skb); return 0; } if (skb_dst(skb)->dev == dev) { skb->dev = br_indev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, br_nf_pre_routing_finish_bridge); return 0; } ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); skb->pkt_type = PACKET_HOST; } else { rt = bridge_parent_rtable(br_indev); if (!rt) { kfree_skb(skb); return 0; } skb_dst_drop(skb); skb_dst_set_noref(skb, &rt->dst); } skb->dev = br_indev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, br_handle_frame_finish); return 0; } /* Replicate the checks that IPv6 does on packet reception and pass the packet * to ip6tables. */ unsigned int br_nf_pre_routing_ipv6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_bridge_info *nf_bridge; if (br_validate_ipv6(state->net, skb)) return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0); nf_bridge = nf_bridge_alloc(skb); if (!nf_bridge) return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0); if (!setup_pre_routing(skb, state->net)) return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0); nf_bridge = nf_bridge_info_get(skb); nf_bridge->ipv6_daddr = ipv6_hdr(skb)->daddr; skb->protocol = htons(ETH_P_IPV6); skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->net, state->sk, skb, skb->dev, NULL, br_nf_pre_routing_finish_ipv6); return NF_STOLEN; }
5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 22 32 33 3 33 34 34 3 3 3 5 5 5 5 5 5 5 3 5 1 10 4 5 4 1 5 4 4 5 15 10 5 5 2 2 2 2 2 2 2 2 2 2 2 2 1 15 23 15 2 15 1 5 3 4 4 4 4 1 1 8 8 8 6 2 8 8 8 8 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 // SPDX-License-Identifier: GPL-2.0-only /* * IBSS mode implementation * Copyright 2003-2008, Jouni Malinen <j@w1.fi> * Copyright 2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2009, Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH * Copyright(c) 2018-2024 Intel Corporation */ #include <linux/delay.h> #include <linux/slab.h> #include <linux/if_ether.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #define IEEE80211_SCAN_INTERVAL (2 * HZ) #define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ) #define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ) #define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ) #define IEEE80211_IBSS_RSN_INACTIVITY_LIMIT (10 * HZ) #define IEEE80211_IBSS_MAX_STA_ENTRIES 128 static struct beacon_data * ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, const int beacon_int, const u32 basic_rates, const u16 capability, u64 tsf, struct cfg80211_chan_def *chandef, bool *have_higher_than_11mbit, struct cfg80211_csa_settings *csa_settings) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; int rates_n = 0, i, ri; struct ieee80211_mgmt *mgmt; u8 *pos; struct ieee80211_supported_band *sband; u32 rate_flags, rates = 0, rates_added = 0; struct beacon_data *presp; int frame_len; /* Build IBSS probe response */ frame_len = sizeof(struct ieee80211_hdr_3addr) + 12 /* struct ieee80211_mgmt.u.beacon */ + 2 + IEEE80211_MAX_SSID_LEN /* max SSID */ + 2 + 8 /* max Supported Rates */ + 3 /* max DS params */ + 4 /* IBSS params */ + 5 /* Channel Switch Announcement */ + 2 + (IEEE80211_MAX_SUPP_RATES - 8) + 2 + sizeof(struct ieee80211_ht_cap) + 2 + sizeof(struct ieee80211_ht_operation) + 2 + sizeof(struct ieee80211_vht_cap) + 2 + sizeof(struct ieee80211_vht_operation) + ifibss->ie_len; presp = kzalloc(sizeof(*presp) + frame_len, GFP_KERNEL); if (!presp) return NULL; presp->head = (void *)(presp + 1); mgmt = (void *) presp->head; mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP); eth_broadcast_addr(mgmt->da); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN); mgmt->u.beacon.beacon_int = cpu_to_le16(beacon_int); mgmt->u.beacon.timestamp = cpu_to_le64(tsf); mgmt->u.beacon.capab_info = cpu_to_le16(capability); pos = (u8 *)mgmt + offsetof(struct ieee80211_mgmt, u.beacon.variable); *pos++ = WLAN_EID_SSID; *pos++ = ifibss->ssid_len; memcpy(pos, ifibss->ssid, ifibss->ssid_len); pos += ifibss->ssid_len; sband = local->hw.wiphy->bands[chandef->chan->band]; rate_flags = ieee80211_chandef_rate_flags(chandef); rates_n = 0; if (have_higher_than_11mbit) *have_higher_than_11mbit = false; for (i = 0; i < sband->n_bitrates; i++) { if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; if (sband->bitrates[i].bitrate > 110 && have_higher_than_11mbit) *have_higher_than_11mbit = true; rates |= BIT(i); rates_n++; } *pos++ = WLAN_EID_SUPP_RATES; *pos++ = min_t(int, 8, rates_n); for (ri = 0; ri < sband->n_bitrates; ri++) { int rate = DIV_ROUND_UP(sband->bitrates[ri].bitrate, 5); u8 basic = 0; if (!(rates & BIT(ri))) continue; if (basic_rates & BIT(ri)) basic = 0x80; *pos++ = basic | (u8) rate; if (++rates_added == 8) { ri++; /* continue at next rate for EXT_SUPP_RATES */ break; } } if (sband->band == NL80211_BAND_2GHZ) { *pos++ = WLAN_EID_DS_PARAMS; *pos++ = 1; *pos++ = ieee80211_frequency_to_channel( chandef->chan->center_freq); } *pos++ = WLAN_EID_IBSS_PARAMS; *pos++ = 2; /* FIX: set ATIM window based on scan results */ *pos++ = 0; *pos++ = 0; if (csa_settings) { *pos++ = WLAN_EID_CHANNEL_SWITCH; *pos++ = 3; *pos++ = csa_settings->block_tx ? 1 : 0; *pos++ = ieee80211_frequency_to_channel( csa_settings->chandef.chan->center_freq); presp->cntdwn_counter_offsets[0] = (pos - presp->head); *pos++ = csa_settings->count; presp->cntdwn_current_counter = csa_settings->count; } /* put the remaining rates in WLAN_EID_EXT_SUPP_RATES */ if (rates_n > 8) { *pos++ = WLAN_EID_EXT_SUPP_RATES; *pos++ = rates_n - 8; for (; ri < sband->n_bitrates; ri++) { int rate = DIV_ROUND_UP(sband->bitrates[ri].bitrate, 5); u8 basic = 0; if (!(rates & BIT(ri))) continue; if (basic_rates & BIT(ri)) basic = 0x80; *pos++ = basic | (u8) rate; } } if (ifibss->ie_len) { memcpy(pos, ifibss->ie, ifibss->ie_len); pos += ifibss->ie_len; } /* add HT capability and information IEs */ if (chandef->width != NL80211_CHAN_WIDTH_20_NOHT && chandef->width != NL80211_CHAN_WIDTH_5 && chandef->width != NL80211_CHAN_WIDTH_10 && sband->ht_cap.ht_supported) { struct ieee80211_sta_ht_cap ht_cap; memcpy(&ht_cap, &sband->ht_cap, sizeof(ht_cap)); ieee80211_apply_htcap_overrides(sdata, &ht_cap); pos = ieee80211_ie_build_ht_cap(pos, &ht_cap, ht_cap.cap); /* * Note: According to 802.11n-2009 9.13.3.1, HT Protection * field and RIFS Mode are reserved in IBSS mode, therefore * keep them at 0 */ pos = ieee80211_ie_build_ht_oper(pos, &sband->ht_cap, chandef, 0, false); /* add VHT capability and information IEs */ if (chandef->width != NL80211_CHAN_WIDTH_20 && chandef->width != NL80211_CHAN_WIDTH_40 && sband->vht_cap.vht_supported) { pos = ieee80211_ie_build_vht_cap(pos, &sband->vht_cap, sband->vht_cap.cap); pos = ieee80211_ie_build_vht_oper(pos, &sband->vht_cap, chandef); } } if (local->hw.queues >= IEEE80211_NUM_ACS) pos = ieee80211_add_wmm_info_ie(pos, 0); /* U-APSD not in use */ presp->head_len = pos - presp->head; if (WARN_ON(presp->head_len > frame_len)) goto error; return presp; error: kfree(presp); return NULL; } static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, const u8 *bssid, const int beacon_int, struct cfg80211_chan_def *req_chandef, const u32 basic_rates, const u16 capability, u64 tsf, bool creator) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct cfg80211_bss *bss; u64 bss_change; struct ieee80211_chan_req chanreq = {}; struct ieee80211_channel *chan; struct beacon_data *presp; struct cfg80211_inform_bss bss_meta = {}; bool have_higher_than_11mbit; bool radar_required; int err; lockdep_assert_wiphy(local->hw.wiphy); /* Reset own TSF to allow time synchronization work. */ drv_reset_tsf(local, sdata); if (!ether_addr_equal(ifibss->bssid, bssid)) sta_info_flush(sdata, -1); /* if merging, indicate to driver that we leave the old IBSS */ if (sdata->vif.cfg.ibss_joined) { sdata->vif.cfg.ibss_joined = false; sdata->vif.cfg.ibss_creator = false; sdata->vif.bss_conf.enable_beacon = false; netif_carrier_off(sdata->dev); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IBSS | BSS_CHANGED_BEACON_ENABLED); drv_leave_ibss(local, sdata); } presp = sdata_dereference(ifibss->presp, sdata); RCU_INIT_POINTER(ifibss->presp, NULL); if (presp) kfree_rcu(presp, rcu_head); /* make a copy of the chandef, it could be modified below. */ chanreq.oper = *req_chandef; chan = chanreq.oper.chan; if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chanreq.oper, NL80211_IFTYPE_ADHOC)) { if (chanreq.oper.width == NL80211_CHAN_WIDTH_5 || chanreq.oper.width == NL80211_CHAN_WIDTH_10 || chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT || chanreq.oper.width == NL80211_CHAN_WIDTH_20) { sdata_info(sdata, "Failed to join IBSS, beacons forbidden\n"); return; } chanreq.oper.width = NL80211_CHAN_WIDTH_20; chanreq.oper.center_freq1 = chan->center_freq; /* check again for downgraded chandef */ if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chanreq.oper, NL80211_IFTYPE_ADHOC)) { sdata_info(sdata, "Failed to join IBSS, beacons forbidden\n"); return; } } err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy, &chanreq.oper, NL80211_IFTYPE_ADHOC); if (err < 0) { sdata_info(sdata, "Failed to join IBSS, invalid chandef\n"); return; } if (err > 0 && !ifibss->userspace_handles_dfs) { sdata_info(sdata, "Failed to join IBSS, DFS channel without control program\n"); return; } radar_required = err; if (ieee80211_link_use_channel(&sdata->deflink, &chanreq, ifibss->fixed_channel ? IEEE80211_CHANCTX_SHARED : IEEE80211_CHANCTX_EXCLUSIVE)) { sdata_info(sdata, "Failed to join IBSS, no channel context\n"); return; } sdata->deflink.radar_required = radar_required; memcpy(ifibss->bssid, bssid, ETH_ALEN); presp = ieee80211_ibss_build_presp(sdata, beacon_int, basic_rates, capability, tsf, &chanreq.oper, &have_higher_than_11mbit, NULL); if (!presp) return; rcu_assign_pointer(ifibss->presp, presp); mgmt = (void *)presp->head; sdata->vif.bss_conf.enable_beacon = true; sdata->vif.bss_conf.beacon_int = beacon_int; sdata->vif.bss_conf.basic_rates = basic_rates; sdata->vif.cfg.ssid_len = ifibss->ssid_len; memcpy(sdata->vif.cfg.ssid, ifibss->ssid, ifibss->ssid_len); bss_change = BSS_CHANGED_BEACON_INT; bss_change |= ieee80211_reset_erp_info(sdata); bss_change |= BSS_CHANGED_BSSID; bss_change |= BSS_CHANGED_BEACON; bss_change |= BSS_CHANGED_BEACON_ENABLED; bss_change |= BSS_CHANGED_BASIC_RATES; bss_change |= BSS_CHANGED_HT; bss_change |= BSS_CHANGED_IBSS; bss_change |= BSS_CHANGED_SSID; /* * In 5 GHz/802.11a, we can always use short slot time. * (IEEE 802.11-2012 18.3.8.7) * * In 2.4GHz, we must always use long slots in IBSS for compatibility * reasons. * (IEEE 802.11-2012 19.4.5) * * HT follows these specifications (IEEE 802.11-2012 20.3.18) */ sdata->vif.bss_conf.use_short_slot = chan->band == NL80211_BAND_5GHZ; bss_change |= BSS_CHANGED_ERP_SLOT; /* cf. IEEE 802.11 9.2.12 */ sdata->deflink.operating_11g_mode = chan->band == NL80211_BAND_2GHZ && have_higher_than_11mbit; ieee80211_set_wmm_default(&sdata->deflink, true, false); sdata->vif.cfg.ibss_joined = true; sdata->vif.cfg.ibss_creator = creator; err = drv_join_ibss(local, sdata); if (err) { sdata->vif.cfg.ibss_joined = false; sdata->vif.cfg.ibss_creator = false; sdata->vif.bss_conf.enable_beacon = false; sdata->vif.cfg.ssid_len = 0; RCU_INIT_POINTER(ifibss->presp, NULL); kfree_rcu(presp, rcu_head); ieee80211_link_release_channel(&sdata->deflink); sdata_info(sdata, "Failed to join IBSS, driver failure: %d\n", err); return; } ieee80211_bss_info_change_notify(sdata, bss_change); ifibss->state = IEEE80211_IBSS_MLME_JOINED; mod_timer(&ifibss->timer, round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL)); bss_meta.chan = chan; bss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta, mgmt, presp->head_len, GFP_KERNEL); cfg80211_put_bss(local->hw.wiphy, bss); netif_carrier_on(sdata->dev); cfg80211_ibss_joined(sdata->dev, ifibss->bssid, chan, GFP_KERNEL); } static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss *bss) { struct cfg80211_bss *cbss = container_of((void *)bss, struct cfg80211_bss, priv); struct ieee80211_supported_band *sband; struct cfg80211_chan_def chandef; u32 basic_rates; int i, j; u16 beacon_int = cbss->beacon_interval; const struct cfg80211_bss_ies *ies; enum nl80211_channel_type chan_type; u64 tsf; u32 rate_flags; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (beacon_int < 10) beacon_int = 10; switch (sdata->u.ibss.chandef.width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_40: chan_type = cfg80211_get_chandef_type(&sdata->u.ibss.chandef); cfg80211_chandef_create(&chandef, cbss->channel, chan_type); break; case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: cfg80211_chandef_create(&chandef, cbss->channel, NL80211_CHAN_NO_HT); chandef.width = sdata->u.ibss.chandef.width; break; case NL80211_CHAN_WIDTH_80: case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_160: chandef = sdata->u.ibss.chandef; chandef.chan = cbss->channel; break; default: /* fall back to 20 MHz for unsupported modes */ cfg80211_chandef_create(&chandef, cbss->channel, NL80211_CHAN_NO_HT); break; } sband = sdata->local->hw.wiphy->bands[cbss->channel->band]; rate_flags = ieee80211_chandef_rate_flags(&sdata->u.ibss.chandef); basic_rates = 0; for (i = 0; i < bss->supp_rates_len; i++) { int rate = bss->supp_rates[i] & 0x7f; bool is_basic = !!(bss->supp_rates[i] & 0x80); for (j = 0; j < sband->n_bitrates; j++) { int brate; if ((rate_flags & sband->bitrates[j].flags) != rate_flags) continue; brate = DIV_ROUND_UP(sband->bitrates[j].bitrate, 5); if (brate == rate) { if (is_basic) basic_rates |= BIT(j); break; } } } rcu_read_lock(); ies = rcu_dereference(cbss->ies); tsf = ies->tsf; rcu_read_unlock(); __ieee80211_sta_join_ibss(sdata, cbss->bssid, beacon_int, &chandef, basic_rates, cbss->capability, tsf, false); } int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings, u64 *changed) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct beacon_data *presp, *old_presp; struct cfg80211_bss *cbss; const struct cfg80211_bss_ies *ies; u16 capability = WLAN_CAPABILITY_IBSS; u64 tsf; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (ifibss->privacy) capability |= WLAN_CAPABILITY_PRIVACY; cbss = cfg80211_get_bss(sdata->local->hw.wiphy, ifibss->chandef.chan, ifibss->bssid, ifibss->ssid, ifibss->ssid_len, IEEE80211_BSS_TYPE_IBSS, IEEE80211_PRIVACY(ifibss->privacy)); if (unlikely(!cbss)) return -EINVAL; rcu_read_lock(); ies = rcu_dereference(cbss->ies); tsf = ies->tsf; rcu_read_unlock(); cfg80211_put_bss(sdata->local->hw.wiphy, cbss); old_presp = sdata_dereference(ifibss->presp, sdata); presp = ieee80211_ibss_build_presp(sdata, sdata->vif.bss_conf.beacon_int, sdata->vif.bss_conf.basic_rates, capability, tsf, &ifibss->chandef, NULL, csa_settings); if (!presp) return -ENOMEM; rcu_assign_pointer(ifibss->presp, presp); if (old_presp) kfree_rcu(old_presp, rcu_head); *changed |= BSS_CHANGED_BEACON; return 0; } int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata, u64 *changed) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct cfg80211_bss *cbss; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* When not connected/joined, sending CSA doesn't make sense. */ if (ifibss->state != IEEE80211_IBSS_MLME_JOINED) return -ENOLINK; /* update cfg80211 bss information with the new channel */ if (!is_zero_ether_addr(ifibss->bssid)) { cbss = cfg80211_get_bss(sdata->local->hw.wiphy, ifibss->chandef.chan, ifibss->bssid, ifibss->ssid, ifibss->ssid_len, IEEE80211_BSS_TYPE_IBSS, IEEE80211_PRIVACY(ifibss->privacy)); /* XXX: should not really modify cfg80211 data */ if (cbss) { cbss->channel = sdata->deflink.csa.chanreq.oper.chan; cfg80211_put_bss(sdata->local->hw.wiphy, cbss); } } ifibss->chandef = sdata->deflink.csa.chanreq.oper; /* generate the beacon */ return ieee80211_ibss_csa_beacon(sdata, NULL, changed); } void ieee80211_ibss_stop(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; wiphy_work_cancel(sdata->local->hw.wiphy, &ifibss->csa_connection_drop_work); } static struct sta_info *ieee80211_ibss_finish_sta(struct sta_info *sta) __acquires(RCU) { struct ieee80211_sub_if_data *sdata = sta->sdata; u8 addr[ETH_ALEN]; memcpy(addr, sta->sta.addr, ETH_ALEN); ibss_dbg(sdata, "Adding new IBSS station %pM\n", addr); sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); /* authorize the station only if the network is not RSN protected. If * not wait for the userspace to authorize it */ if (!sta->sdata->u.ibss.control_port) sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED); rate_control_rate_init(&sta->deflink); /* If it fails, maybe we raced another insertion? */ if (sta_info_insert_rcu(sta)) return sta_info_get(sdata, addr); return sta; } static struct sta_info * ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, const u8 *addr, u32 supp_rates) __acquires(RCU) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; struct sta_info *sta; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_supported_band *sband; int band; /* * XXX: Consider removing the least recently used entry and * allow new one to be added. */ if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) { net_info_ratelimited("%s: No room for a new IBSS STA entry %pM\n", sdata->name, addr); rcu_read_lock(); return NULL; } if (ifibss->state == IEEE80211_IBSS_MLME_SEARCH) { rcu_read_lock(); return NULL; } if (!ether_addr_equal(bssid, sdata->u.ibss.bssid)) { rcu_read_lock(); return NULL; } rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (WARN_ON_ONCE(!chanctx_conf)) return NULL; band = chanctx_conf->def.chan->band; rcu_read_unlock(); sta = sta_info_alloc(sdata, addr, GFP_KERNEL); if (!sta) { rcu_read_lock(); return NULL; } /* make sure mandatory rates are always added */ sband = local->hw.wiphy->bands[band]; sta->sta.deflink.supp_rates[band] = supp_rates | ieee80211_mandatory_rates(sband); return ieee80211_ibss_finish_sta(sta); } static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; int active = 0; struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { unsigned long last_active = ieee80211_sta_last_active(sta); if (sta->sdata == sdata && time_is_after_jiffies(last_active + IEEE80211_IBSS_MERGE_INTERVAL)) { active++; break; } } rcu_read_unlock(); return active; } static void ieee80211_ibss_disconnect(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; struct cfg80211_bss *cbss; struct beacon_data *presp; struct sta_info *sta; lockdep_assert_wiphy(local->hw.wiphy); if (!is_zero_ether_addr(ifibss->bssid)) { cbss = cfg80211_get_bss(local->hw.wiphy, ifibss->chandef.chan, ifibss->bssid, ifibss->ssid, ifibss->ssid_len, IEEE80211_BSS_TYPE_IBSS, IEEE80211_PRIVACY(ifibss->privacy)); if (cbss) { cfg80211_unlink_bss(local->hw.wiphy, cbss); cfg80211_put_bss(sdata->local->hw.wiphy, cbss); } } ifibss->state = IEEE80211_IBSS_MLME_SEARCH; sta_info_flush(sdata, -1); spin_lock_bh(&ifibss->incomplete_lock); while (!list_empty(&ifibss->incomplete_stations)) { sta = list_first_entry(&ifibss->incomplete_stations, struct sta_info, list); list_del(&sta->list); spin_unlock_bh(&ifibss->incomplete_lock); sta_info_free(local, sta); spin_lock_bh(&ifibss->incomplete_lock); } spin_unlock_bh(&ifibss->incomplete_lock); netif_carrier_off(sdata->dev); sdata->vif.cfg.ibss_joined = false; sdata->vif.cfg.ibss_creator = false; sdata->vif.bss_conf.enable_beacon = false; sdata->vif.cfg.ssid_len = 0; /* remove beacon */ presp = sdata_dereference(ifibss->presp, sdata); RCU_INIT_POINTER(sdata->u.ibss.presp, NULL); if (presp) kfree_rcu(presp, rcu_head); clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED | BSS_CHANGED_IBSS); drv_leave_ibss(local, sdata); ieee80211_link_release_channel(&sdata->deflink); } static void ieee80211_csa_connection_drop_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.ibss.csa_connection_drop_work); ieee80211_ibss_disconnect(sdata); synchronize_rcu(); skb_queue_purge(&sdata->skb_queue); /* trigger a scan to find another IBSS network to join */ wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } static void ieee80211_ibss_csa_mark_radar(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; int err; /* if the current channel is a DFS channel, mark the channel as * unavailable. */ err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy, &ifibss->chandef, NL80211_IFTYPE_ADHOC); if (err > 0) cfg80211_radar_event(sdata->local->hw.wiphy, &ifibss->chandef, GFP_ATOMIC); } static bool ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, bool beacon) { struct cfg80211_csa_settings params; struct ieee80211_csa_ie csa_ie; struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; enum nl80211_channel_type ch_type; int err; struct ieee80211_conn_settings conn = { .mode = IEEE80211_CONN_MODE_HT, .bw_limit = IEEE80211_CONN_BW_LIMIT_40, }; u32 vht_cap_info = 0; lockdep_assert_wiphy(sdata->local->hw.wiphy); switch (ifibss->chandef.width) { case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20_NOHT: conn.mode = IEEE80211_CONN_MODE_LEGACY; fallthrough; case NL80211_CHAN_WIDTH_20: conn.bw_limit = IEEE80211_CONN_BW_LIMIT_20; break; default: break; } if (elems->vht_cap_elem) vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); memset(&params, 0, sizeof(params)); err = ieee80211_parse_ch_switch_ie(sdata, elems, ifibss->chandef.chan->band, vht_cap_info, &conn, ifibss->bssid, false, &csa_ie); /* can't switch to destination channel, fail */ if (err < 0) goto disconnect; /* did not contain a CSA */ if (err) return false; /* channel switch is not supported, disconnect */ if (!(sdata->local->hw.wiphy->flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH)) goto disconnect; params.count = csa_ie.count; params.chandef = csa_ie.chanreq.oper; switch (ifibss->chandef.width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_40: /* keep our current HT mode (HT20/HT40+/HT40-), even if * another mode has been announced. The mode is not adopted * within the beacon while doing CSA and we should therefore * keep the mode which we announce. */ ch_type = cfg80211_get_chandef_type(&ifibss->chandef); cfg80211_chandef_create(&params.chandef, params.chandef.chan, ch_type); break; case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: if (params.chandef.width != ifibss->chandef.width) { sdata_info(sdata, "IBSS %pM received channel switch from incompatible channel width (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n", ifibss->bssid, params.chandef.chan->center_freq, params.chandef.width, params.chandef.center_freq1, params.chandef.center_freq2); goto disconnect; } break; default: /* should not happen, conn_flags should prevent VHT modes. */ WARN_ON(1); goto disconnect; } if (!cfg80211_reg_can_beacon(sdata->local->hw.wiphy, &params.chandef, NL80211_IFTYPE_ADHOC)) { sdata_info(sdata, "IBSS %pM switches to unsupported channel (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n", ifibss->bssid, params.chandef.chan->center_freq, params.chandef.width, params.chandef.center_freq1, params.chandef.center_freq2); goto disconnect; } err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy, &params.chandef, NL80211_IFTYPE_ADHOC); if (err < 0) goto disconnect; if (err > 0 && !ifibss->userspace_handles_dfs) { /* IBSS-DFS only allowed with a control program */ goto disconnect; } params.radar_required = err; if (cfg80211_chandef_identical(&params.chandef, &sdata->vif.bss_conf.chanreq.oper)) { ibss_dbg(sdata, "received csa with an identical chandef, ignoring\n"); return true; } /* all checks done, now perform the channel switch. */ ibss_dbg(sdata, "received channel switch announcement to go to channel %d MHz\n", params.chandef.chan->center_freq); params.block_tx = !!csa_ie.mode; if (ieee80211_channel_switch(sdata->local->hw.wiphy, sdata->dev, &params)) goto disconnect; ieee80211_ibss_csa_mark_radar(sdata); return true; disconnect: ibss_dbg(sdata, "Can't handle channel switch, disconnect\n"); wiphy_work_queue(sdata->local->hw.wiphy, &ifibss->csa_connection_drop_work); ieee80211_ibss_csa_mark_radar(sdata); return true; } static void ieee80211_rx_mgmt_spectrum_mgmt(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status, struct ieee802_11_elems *elems) { int required_len; if (len < IEEE80211_MIN_ACTION_SIZE + 1) return; /* CSA is the only action we handle for now */ if (mgmt->u.action.u.measurement.action_code != WLAN_ACTION_SPCT_CHL_SWITCH) return; required_len = IEEE80211_MIN_ACTION_SIZE + sizeof(mgmt->u.action.u.chan_switch); if (len < required_len) return; if (!sdata->vif.bss_conf.csa_active) ieee80211_ibss_process_chanswitch(sdata, elems, false); } static void ieee80211_rx_mgmt_deauth_ibss(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { u16 reason = le16_to_cpu(mgmt->u.deauth.reason_code); if (len < IEEE80211_DEAUTH_FRAME_LEN) return; ibss_dbg(sdata, "RX DeAuth SA=%pM DA=%pM\n", mgmt->sa, mgmt->da); ibss_dbg(sdata, "\tBSSID=%pM (reason: %d)\n", mgmt->bssid, reason); sta_info_destroy_addr(sdata, mgmt->sa); } static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { u16 auth_alg, auth_transaction; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (len < 24 + 6) return; auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg); auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction); ibss_dbg(sdata, "RX Auth SA=%pM DA=%pM\n", mgmt->sa, mgmt->da); ibss_dbg(sdata, "\tBSSID=%pM (auth_transaction=%d)\n", mgmt->bssid, auth_transaction); if (auth_alg != WLAN_AUTH_OPEN || auth_transaction != 1) return; /* * IEEE 802.11 standard does not require authentication in IBSS * networks and most implementations do not seem to use it. * However, try to reply to authentication attempts if someone * has actually implemented this. */ ieee80211_send_auth(sdata, 2, WLAN_AUTH_OPEN, 0, NULL, 0, mgmt->sa, sdata->u.ibss.bssid, NULL, 0, 0, 0); } static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status, struct ieee802_11_elems *elems, struct ieee80211_channel *channel) { struct sta_info *sta; enum nl80211_band band = rx_status->band; struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; bool rates_updated = false; u32 supp_rates = 0; if (sdata->vif.type != NL80211_IFTYPE_ADHOC) return; if (!ether_addr_equal(mgmt->bssid, sdata->u.ibss.bssid)) return; sband = local->hw.wiphy->bands[band]; if (WARN_ON(!sband)) return; rcu_read_lock(); sta = sta_info_get(sdata, mgmt->sa); if (elems->supp_rates) { supp_rates = ieee80211_sta_get_rates(sdata, elems, band, NULL); if (sta) { u32 prev_rates; prev_rates = sta->sta.deflink.supp_rates[band]; sta->sta.deflink.supp_rates[band] = supp_rates | ieee80211_mandatory_rates(sband); if (sta->sta.deflink.supp_rates[band] != prev_rates) { ibss_dbg(sdata, "updated supp_rates set for %pM based on beacon/probe_resp (0x%x -> 0x%x)\n", sta->sta.addr, prev_rates, sta->sta.deflink.supp_rates[band]); rates_updated = true; } } else { rcu_read_unlock(); sta = ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates); } } if (sta && !sta->sta.wme && (elems->wmm_info || elems->s1g_capab) && local->hw.queues >= IEEE80211_NUM_ACS) { sta->sta.wme = true; ieee80211_check_fast_xmit(sta); } if (sta && elems->ht_operation && elems->ht_cap_elem && sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT && sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_5 && sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_10) { /* we both use HT */ struct ieee80211_ht_cap htcap_ie; struct cfg80211_chan_def chandef; enum ieee80211_sta_rx_bandwidth bw = sta->sta.deflink.bandwidth; cfg80211_chandef_create(&chandef, channel, NL80211_CHAN_NO_HT); ieee80211_chandef_ht_oper(elems->ht_operation, &chandef); memcpy(&htcap_ie, elems->ht_cap_elem, sizeof(htcap_ie)); rates_updated |= ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, &htcap_ie, &sta->deflink); if (elems->vht_operation && elems->vht_cap_elem && sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_20 && sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_40) { /* we both use VHT */ struct ieee80211_vht_cap cap_ie; struct ieee80211_sta_vht_cap cap = sta->sta.deflink.vht_cap; u32 vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); ieee80211_chandef_vht_oper(&local->hw, vht_cap_info, elems->vht_operation, elems->ht_operation, &chandef); memcpy(&cap_ie, elems->vht_cap_elem, sizeof(cap_ie)); ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, &cap_ie, NULL, &sta->deflink); if (memcmp(&cap, &sta->sta.deflink.vht_cap, sizeof(cap))) rates_updated |= true; } if (bw != sta->sta.deflink.bandwidth) rates_updated |= true; if (!cfg80211_chandef_compatible(&sdata->u.ibss.chandef, &chandef)) WARN_ON_ONCE(1); } if (sta && rates_updated) { u32 changed = IEEE80211_RC_SUPP_RATES_CHANGED; u8 rx_nss = sta->sta.deflink.rx_nss; /* Force rx_nss recalculation */ sta->sta.deflink.rx_nss = 0; rate_control_rate_init(&sta->deflink); if (sta->sta.deflink.rx_nss != rx_nss) changed |= IEEE80211_RC_NSS_CHANGED; drv_link_sta_rc_update(local, sdata, &sta->sta.deflink, changed); } rcu_read_unlock(); } static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status, struct ieee802_11_elems *elems) { struct ieee80211_local *local = sdata->local; struct cfg80211_bss *cbss; struct ieee80211_bss *bss; struct ieee80211_channel *channel; u64 beacon_timestamp, rx_timestamp; u32 supp_rates = 0; enum nl80211_band band = rx_status->band; channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq); if (!channel) return; ieee80211_update_sta_info(sdata, mgmt, len, rx_status, elems, channel); bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, channel); if (!bss) return; cbss = container_of((void *)bss, struct cfg80211_bss, priv); /* same for beacon and probe response */ beacon_timestamp = le64_to_cpu(mgmt->u.beacon.timestamp); /* check if we need to merge IBSS */ /* not an IBSS */ if (!(cbss->capability & WLAN_CAPABILITY_IBSS)) goto put_bss; /* different channel */ if (sdata->u.ibss.fixed_channel && sdata->u.ibss.chandef.chan != cbss->channel) goto put_bss; /* different SSID */ if (elems->ssid_len != sdata->u.ibss.ssid_len || memcmp(elems->ssid, sdata->u.ibss.ssid, sdata->u.ibss.ssid_len)) goto put_bss; /* process channel switch */ if (sdata->vif.bss_conf.csa_active || ieee80211_ibss_process_chanswitch(sdata, elems, true)) goto put_bss; /* same BSSID */ if (ether_addr_equal(cbss->bssid, sdata->u.ibss.bssid)) goto put_bss; /* we use a fixed BSSID */ if (sdata->u.ibss.fixed_bssid) goto put_bss; if (ieee80211_have_rx_timestamp(rx_status)) { /* time when timestamp field was received */ rx_timestamp = ieee80211_calculate_rx_timestamp(local, rx_status, len + FCS_LEN, 24); } else { /* * second best option: get current TSF * (will return -1 if not supported) */ rx_timestamp = drv_get_tsf(local, sdata); } ibss_dbg(sdata, "RX beacon SA=%pM BSSID=%pM TSF=0x%llx\n", mgmt->sa, mgmt->bssid, (unsigned long long)rx_timestamp); ibss_dbg(sdata, "\tBCN=0x%llx diff=%lld @%lu\n", (unsigned long long)beacon_timestamp, (unsigned long long)(rx_timestamp - beacon_timestamp), jiffies); if (beacon_timestamp > rx_timestamp) { ibss_dbg(sdata, "beacon TSF higher than local TSF - IBSS merge with BSSID %pM\n", mgmt->bssid); ieee80211_sta_join_ibss(sdata, bss); supp_rates = ieee80211_sta_get_rates(sdata, elems, band, NULL); ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates); rcu_read_unlock(); } put_bss: ieee80211_rx_bss_put(local, bss); } void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, const u8 *addr, u32 supp_rates) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; struct sta_info *sta; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_supported_band *sband; int band; /* * XXX: Consider removing the least recently used entry and * allow new one to be added. */ if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) { net_info_ratelimited("%s: No room for a new IBSS STA entry %pM\n", sdata->name, addr); return; } if (ifibss->state == IEEE80211_IBSS_MLME_SEARCH) return; if (!ether_addr_equal(bssid, sdata->u.ibss.bssid)) return; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (WARN_ON_ONCE(!chanctx_conf)) { rcu_read_unlock(); return; } band = chanctx_conf->def.chan->band; rcu_read_unlock(); sta = sta_info_alloc(sdata, addr, GFP_ATOMIC); if (!sta) return; /* make sure mandatory rates are always added */ sband = local->hw.wiphy->bands[band]; sta->sta.deflink.supp_rates[band] = supp_rates | ieee80211_mandatory_rates(sband); spin_lock(&ifibss->incomplete_lock); list_add(&sta->list, &ifibss->incomplete_stations); spin_unlock(&ifibss->incomplete_lock); wiphy_work_queue(local->hw.wiphy, &sdata->work); } static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; unsigned long exp_time = IEEE80211_IBSS_INACTIVITY_LIMIT; unsigned long exp_rsn = IEEE80211_IBSS_RSN_INACTIVITY_LIMIT; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { unsigned long last_active = ieee80211_sta_last_active(sta); if (sdata != sta->sdata) continue; if (time_is_before_jiffies(last_active + exp_time) || (time_is_before_jiffies(last_active + exp_rsn) && sta->sta_state != IEEE80211_STA_AUTHORIZED)) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; sta_dbg(sta->sdata, "expiring inactive %sSTA %pM\n", sta->sta_state != IEEE80211_STA_AUTHORIZED ? "not authorized " : "", sta->sta.addr); ieee80211_send_deauth_disassoc(sdata, sta->sta.addr, ifibss->bssid, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DEAUTH_LEAVING, true, frame_buf); WARN_ON(__sta_info_destroy(sta)); } } } /* * This function is called with state == IEEE80211_IBSS_MLME_JOINED */ static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; lockdep_assert_wiphy(sdata->local->hw.wiphy); mod_timer(&ifibss->timer, round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL)); ieee80211_ibss_sta_expire(sdata); if (time_before(jiffies, ifibss->last_scan_completed + IEEE80211_IBSS_MERGE_INTERVAL)) return; if (ieee80211_sta_active_ibss(sdata)) return; if (ifibss->fixed_channel) return; sdata_info(sdata, "No active IBSS STAs - trying to scan for other IBSS networks with same SSID (merge)\n"); ieee80211_request_ibss_scan(sdata, ifibss->ssid, ifibss->ssid_len, NULL, 0); } static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; u8 bssid[ETH_ALEN]; u16 capability; int i; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (ifibss->fixed_bssid) { memcpy(bssid, ifibss->bssid, ETH_ALEN); } else { /* Generate random, not broadcast, locally administered BSSID. Mix in * own MAC address to make sure that devices that do not have proper * random number generator get different BSSID. */ get_random_bytes(bssid, ETH_ALEN); for (i = 0; i < ETH_ALEN; i++) bssid[i] ^= sdata->vif.addr[i]; bssid[0] &= ~0x01; bssid[0] |= 0x02; } sdata_info(sdata, "Creating new IBSS network, BSSID %pM\n", bssid); capability = WLAN_CAPABILITY_IBSS; if (ifibss->privacy) capability |= WLAN_CAPABILITY_PRIVACY; __ieee80211_sta_join_ibss(sdata, bssid, sdata->vif.bss_conf.beacon_int, &ifibss->chandef, ifibss->basic_rates, capability, 0, true); } static unsigned int ibss_setup_channels(struct wiphy *wiphy, struct ieee80211_channel **channels, unsigned int channels_max, u32 center_freq, u32 width) { struct ieee80211_channel *chan = NULL; unsigned int n_chan = 0; u32 start_freq, end_freq, freq; if (width <= 20) { start_freq = center_freq; end_freq = center_freq; } else { start_freq = center_freq - width / 2 + 10; end_freq = center_freq + width / 2 - 10; } for (freq = start_freq; freq <= end_freq; freq += 20) { chan = ieee80211_get_channel(wiphy, freq); if (!chan) continue; if (n_chan >= channels_max) return n_chan; channels[n_chan] = chan; n_chan++; } return n_chan; } static unsigned int ieee80211_ibss_setup_scan_channels(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef, struct ieee80211_channel **channels, unsigned int channels_max) { unsigned int n_chan = 0; u32 width, cf1, cf2 = 0; switch (chandef->width) { case NL80211_CHAN_WIDTH_40: width = 40; break; case NL80211_CHAN_WIDTH_80P80: cf2 = chandef->center_freq2; fallthrough; case NL80211_CHAN_WIDTH_80: width = 80; break; case NL80211_CHAN_WIDTH_160: width = 160; break; default: width = 20; break; } cf1 = chandef->center_freq1; n_chan = ibss_setup_channels(wiphy, channels, channels_max, cf1, width); if (cf2) n_chan += ibss_setup_channels(wiphy, &channels[n_chan], channels_max - n_chan, cf2, width); return n_chan; } /* * This function is called with state == IEEE80211_IBSS_MLME_SEARCH */ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; struct cfg80211_bss *cbss; struct ieee80211_channel *chan = NULL; const u8 *bssid = NULL; int active_ibss; lockdep_assert_wiphy(sdata->local->hw.wiphy); active_ibss = ieee80211_sta_active_ibss(sdata); ibss_dbg(sdata, "sta_find_ibss (active_ibss=%d)\n", active_ibss); if (active_ibss) return; if (ifibss->fixed_bssid) bssid = ifibss->bssid; if (ifibss->fixed_channel) chan = ifibss->chandef.chan; if (!is_zero_ether_addr(ifibss->bssid)) bssid = ifibss->bssid; cbss = cfg80211_get_bss(local->hw.wiphy, chan, bssid, ifibss->ssid, ifibss->ssid_len, IEEE80211_BSS_TYPE_IBSS, IEEE80211_PRIVACY(ifibss->privacy)); if (cbss) { struct ieee80211_bss *bss; bss = (void *)cbss->priv; ibss_dbg(sdata, "sta_find_ibss: selected %pM current %pM\n", cbss->bssid, ifibss->bssid); sdata_info(sdata, "Selected IBSS BSSID %pM based on configured SSID\n", cbss->bssid); ieee80211_sta_join_ibss(sdata, bss); ieee80211_rx_bss_put(local, bss); return; } /* if a fixed bssid and a fixed freq have been provided create the IBSS * directly and do not waste time scanning */ if (ifibss->fixed_bssid && ifibss->fixed_channel) { sdata_info(sdata, "Created IBSS using preconfigured BSSID %pM\n", bssid); ieee80211_sta_create_ibss(sdata); return; } ibss_dbg(sdata, "sta_find_ibss: did not try to join ibss\n"); /* Selected IBSS not found in current scan results - try to scan */ if (time_after(jiffies, ifibss->last_scan_completed + IEEE80211_SCAN_INTERVAL)) { struct ieee80211_channel *channels[8]; unsigned int num; sdata_info(sdata, "Trigger new scan to find an IBSS to join\n"); if (ifibss->fixed_channel) { num = ieee80211_ibss_setup_scan_channels(local->hw.wiphy, &ifibss->chandef, channels, ARRAY_SIZE(channels)); ieee80211_request_ibss_scan(sdata, ifibss->ssid, ifibss->ssid_len, channels, num); } else { ieee80211_request_ibss_scan(sdata, ifibss->ssid, ifibss->ssid_len, NULL, 0); } } else { int interval = IEEE80211_SCAN_INTERVAL; if (time_after(jiffies, ifibss->ibss_join_req + IEEE80211_IBSS_JOIN_TIMEOUT)) ieee80211_sta_create_ibss(sdata); mod_timer(&ifibss->timer, round_jiffies(jiffies + interval)); } } static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata, struct sk_buff *req) { struct ieee80211_mgmt *mgmt = (void *)req->data; struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_local *local = sdata->local; int tx_last_beacon, len = req->len; struct sk_buff *skb; struct beacon_data *presp; u8 *pos, *end; lockdep_assert_wiphy(sdata->local->hw.wiphy); presp = sdata_dereference(ifibss->presp, sdata); if (ifibss->state != IEEE80211_IBSS_MLME_JOINED || len < 24 + 2 || !presp) return; tx_last_beacon = drv_tx_last_beacon(local); ibss_dbg(sdata, "RX ProbeReq SA=%pM DA=%pM\n", mgmt->sa, mgmt->da); ibss_dbg(sdata, "\tBSSID=%pM (tx_last_beacon=%d)\n", mgmt->bssid, tx_last_beacon); if (!tx_last_beacon && is_multicast_ether_addr(mgmt->da)) return; if (!ether_addr_equal(mgmt->bssid, ifibss->bssid) && !is_broadcast_ether_addr(mgmt->bssid)) return; end = ((u8 *) mgmt) + len; pos = mgmt->u.probe_req.variable; if (pos[0] != WLAN_EID_SSID || pos + 2 + pos[1] > end) { ibss_dbg(sdata, "Invalid SSID IE in ProbeReq from %pM\n", mgmt->sa); return; } if (pos[1] != 0 && (pos[1] != ifibss->ssid_len || memcmp(pos + 2, ifibss->ssid, ifibss->ssid_len))) { /* Ignore ProbeReq for foreign SSID */ return; } /* Reply with ProbeResp */ skb = dev_alloc_skb(local->tx_headroom + presp->head_len); if (!skb) return; skb_reserve(skb, local->tx_headroom); skb_put_data(skb, presp->head, presp->head_len); memcpy(((struct ieee80211_mgmt *) skb->data)->da, mgmt->sa, ETH_ALEN); ibss_dbg(sdata, "Sending ProbeResp to %pM\n", mgmt->sa); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; /* avoid excessive retries for probe request to wildcard SSIDs */ if (pos[1] == 0) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_NO_ACK; ieee80211_tx_skb(sdata, skb); } static void ieee80211_rx_mgmt_probe_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status) { size_t baselen; struct ieee802_11_elems *elems; BUILD_BUG_ON(offsetof(typeof(mgmt->u.probe_resp), variable) != offsetof(typeof(mgmt->u.beacon), variable)); /* * either beacon or probe_resp but the variable field is at the * same offset */ baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt; if (baselen > len) return; elems = ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen, false, NULL); if (elems) { ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, elems); kfree(elems); } } void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_rx_status *rx_status; struct ieee80211_mgmt *mgmt; u16 fc; struct ieee802_11_elems *elems; int ies_len; rx_status = IEEE80211_SKB_RXCB(skb); mgmt = (struct ieee80211_mgmt *) skb->data; fc = le16_to_cpu(mgmt->frame_control); if (!sdata->u.ibss.ssid_len) return; /* not ready to merge yet */ switch (fc & IEEE80211_FCTL_STYPE) { case IEEE80211_STYPE_PROBE_REQ: ieee80211_rx_mgmt_probe_req(sdata, skb); break; case IEEE80211_STYPE_PROBE_RESP: case IEEE80211_STYPE_BEACON: ieee80211_rx_mgmt_probe_beacon(sdata, mgmt, skb->len, rx_status); break; case IEEE80211_STYPE_AUTH: ieee80211_rx_mgmt_auth_ibss(sdata, mgmt, skb->len); break; case IEEE80211_STYPE_DEAUTH: ieee80211_rx_mgmt_deauth_ibss(sdata, mgmt, skb->len); break; case IEEE80211_STYPE_ACTION: switch (mgmt->u.action.category) { case WLAN_CATEGORY_SPECTRUM_MGMT: ies_len = skb->len - offsetof(struct ieee80211_mgmt, u.action.u.chan_switch.variable); if (ies_len < 0) break; elems = ieee802_11_parse_elems( mgmt->u.action.u.chan_switch.variable, ies_len, true, NULL); if (elems && !elems->parse_error) ieee80211_rx_mgmt_spectrum_mgmt(sdata, mgmt, skb->len, rx_status, elems); kfree(elems); break; } } } void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct sta_info *sta; /* * Work could be scheduled after scan or similar * when we aren't even joined (or trying) with a * network. */ if (!ifibss->ssid_len) return; spin_lock_bh(&ifibss->incomplete_lock); while (!list_empty(&ifibss->incomplete_stations)) { sta = list_first_entry(&ifibss->incomplete_stations, struct sta_info, list); list_del(&sta->list); spin_unlock_bh(&ifibss->incomplete_lock); ieee80211_ibss_finish_sta(sta); rcu_read_unlock(); spin_lock_bh(&ifibss->incomplete_lock); } spin_unlock_bh(&ifibss->incomplete_lock); switch (ifibss->state) { case IEEE80211_IBSS_MLME_SEARCH: ieee80211_sta_find_ibss(sdata); break; case IEEE80211_IBSS_MLME_JOINED: ieee80211_sta_merge_ibss(sdata); break; default: WARN_ON(1); break; } } static void ieee80211_ibss_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = from_timer(sdata, t, u.ibss.timer); wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; timer_setup(&ifibss->timer, ieee80211_ibss_timer, 0); INIT_LIST_HEAD(&ifibss->incomplete_stations); spin_lock_init(&ifibss->incomplete_lock); wiphy_work_init(&ifibss->csa_connection_drop_work, ieee80211_csa_connection_drop_work); } /* scan finished notification */ void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type != NL80211_IFTYPE_ADHOC) continue; sdata->u.ibss.last_scan_completed = jiffies; } } int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, struct cfg80211_ibss_params *params) { u64 changed = 0; u32 rate_flags; struct ieee80211_supported_band *sband; enum ieee80211_chanctx_mode chanmode; struct ieee80211_local *local = sdata->local; int radar_detect_width = 0; int i; int ret; lockdep_assert_wiphy(local->hw.wiphy); if (params->chandef.chan->freq_offset) { /* this may work, but is untested */ return -EOPNOTSUPP; } ret = cfg80211_chandef_dfs_required(local->hw.wiphy, &params->chandef, sdata->wdev.iftype); if (ret < 0) return ret; if (ret > 0) { if (!params->userspace_handles_dfs) return -EINVAL; radar_detect_width = BIT(params->chandef.width); } chanmode = (params->channel_fixed && !ret) ? IEEE80211_CHANCTX_SHARED : IEEE80211_CHANCTX_EXCLUSIVE; ret = ieee80211_check_combinations(sdata, &params->chandef, chanmode, radar_detect_width, -1); if (ret < 0) return ret; if (params->bssid) { memcpy(sdata->u.ibss.bssid, params->bssid, ETH_ALEN); sdata->u.ibss.fixed_bssid = true; } else sdata->u.ibss.fixed_bssid = false; sdata->u.ibss.privacy = params->privacy; sdata->u.ibss.control_port = params->control_port; sdata->u.ibss.userspace_handles_dfs = params->userspace_handles_dfs; sdata->u.ibss.basic_rates = params->basic_rates; sdata->u.ibss.last_scan_completed = jiffies; /* fix basic_rates if channel does not support these rates */ rate_flags = ieee80211_chandef_rate_flags(&params->chandef); sband = local->hw.wiphy->bands[params->chandef.chan->band]; for (i = 0; i < sband->n_bitrates; i++) { if ((rate_flags & sband->bitrates[i].flags) != rate_flags) sdata->u.ibss.basic_rates &= ~BIT(i); } memcpy(sdata->vif.bss_conf.mcast_rate, params->mcast_rate, sizeof(params->mcast_rate)); sdata->vif.bss_conf.beacon_int = params->beacon_interval; sdata->u.ibss.chandef = params->chandef; sdata->u.ibss.fixed_channel = params->channel_fixed; if (params->ie) { sdata->u.ibss.ie = kmemdup(params->ie, params->ie_len, GFP_KERNEL); if (sdata->u.ibss.ie) sdata->u.ibss.ie_len = params->ie_len; } sdata->u.ibss.state = IEEE80211_IBSS_MLME_SEARCH; sdata->u.ibss.ibss_join_req = jiffies; memcpy(sdata->u.ibss.ssid, params->ssid, params->ssid_len); sdata->u.ibss.ssid_len = params->ssid_len; memcpy(&sdata->u.ibss.ht_capa, &params->ht_capa, sizeof(sdata->u.ibss.ht_capa)); memcpy(&sdata->u.ibss.ht_capa_mask, &params->ht_capa_mask, sizeof(sdata->u.ibss.ht_capa_mask)); /* * 802.11n-2009 9.13.3.1: In an IBSS, the HT Protection field is * reserved, but an HT STA shall protect HT transmissions as though * the HT Protection field were set to non-HT mixed mode. * * In an IBSS, the RIFS Mode field of the HT Operation element is * also reserved, but an HT STA shall operate as though this field * were set to 1. */ sdata->vif.bss_conf.ht_operation_mode |= IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED | IEEE80211_HT_PARAM_RIFS_MODE; changed |= BSS_CHANGED_HT | BSS_CHANGED_MCAST_RATE; ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed); sdata->deflink.smps_mode = IEEE80211_SMPS_OFF; sdata->deflink.needed_rx_chains = local->rx_chains; sdata->control_port_over_nl80211 = params->control_port_over_nl80211; wiphy_work_queue(local->hw.wiphy, &sdata->work); return 0; } int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; ieee80211_ibss_disconnect(sdata); ifibss->ssid_len = 0; eth_zero_addr(ifibss->bssid); /* remove beacon */ kfree(sdata->u.ibss.ie); sdata->u.ibss.ie = NULL; sdata->u.ibss.ie_len = 0; /* on the next join, re-program HT parameters */ memset(&ifibss->ht_capa, 0, sizeof(ifibss->ht_capa)); memset(&ifibss->ht_capa_mask, 0, sizeof(ifibss->ht_capa_mask)); synchronize_rcu(); skb_queue_purge(&sdata->skb_queue); del_timer_sync(&sdata->u.ibss.timer); return 0; }
2 2 2 11 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 /* SPDX-License-Identifier: GPL-2.0 */ /* XDP user-space ring structure * Copyright(c) 2018 Intel Corporation. */ #ifndef _LINUX_XSK_QUEUE_H #define _LINUX_XSK_QUEUE_H #include <linux/types.h> #include <linux/if_xdp.h> #include <net/xdp_sock.h> #include <net/xsk_buff_pool.h> #include "xsk.h" struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; /* Hinder the adjacent cache prefetcher to prefetch the consumer * pointer if the producer pointer is touched and vice versa. */ u32 pad1 ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; u32 pad2 ____cacheline_aligned_in_smp; u32 flags; u32 pad3 ____cacheline_aligned_in_smp; }; /* Used for the RX and TX queues for packets */ struct xdp_rxtx_ring { struct xdp_ring ptrs; struct xdp_desc desc[] ____cacheline_aligned_in_smp; }; /* Used for the fill and completion queues for buffers */ struct xdp_umem_ring { struct xdp_ring ptrs; u64 desc[] ____cacheline_aligned_in_smp; }; struct xsk_queue { u32 ring_mask; u32 nentries; u32 cached_prod; u32 cached_cons; struct xdp_ring *ring; u64 invalid_descs; u64 queue_empty_descs; size_t ring_vmalloc_size; }; struct parsed_desc { u32 mb; u32 valid; }; /* The structure of the shared state of the rings are a simple * circular buffer, as outlined in * Documentation/core-api/circular-buffers.rst. For the Rx and * completion ring, the kernel is the producer and user space is the * consumer. For the Tx and fill rings, the kernel is the consumer and * user space is the producer. * * producer consumer * * if (LOAD ->consumer) { (A) LOAD.acq ->producer (C) * STORE $data LOAD $data * STORE.rel ->producer (B) STORE.rel ->consumer (D) * } * * (A) pairs with (D), and (B) pairs with (C). * * Starting with (B), it protects the data from being written after * the producer pointer. If this barrier was missing, the consumer * could observe the producer pointer being set and thus load the data * before the producer has written the new data. The consumer would in * this case load the old data. * * (C) protects the consumer from speculatively loading the data before * the producer pointer actually has been read. If we do not have this * barrier, some architectures could load old data as speculative loads * are not discarded as the CPU does not know there is a dependency * between ->producer and data. * * (A) is a control dependency that separates the load of ->consumer * from the stores of $data. In case ->consumer indicates there is no * room in the buffer to store $data we do not. The dependency will * order both of the stores after the loads. So no barrier is needed. * * (D) protects the load of the data to be observed to happen after the * store of the consumer pointer. If we did not have this memory * barrier, the producer could observe the consumer pointer being set * and overwrite the data with a new value before the consumer got the * chance to read the old value. The consumer would thus miss reading * the old entry and very likely read the new entry twice, once right * now and again after circling through the ring. */ /* The operations on the rings are the following: * * producer consumer * * RESERVE entries PEEK in the ring for entries * WRITE data into the ring READ data from the ring * SUBMIT entries RELEASE entries * * The producer reserves one or more entries in the ring. It can then * fill in these entries and finally submit them so that they can be * seen and read by the consumer. * * The consumer peeks into the ring to see if the producer has written * any new entries. If so, the consumer can then read these entries * and when it is done reading them release them back to the producer * so that the producer can use these slots to fill in new entries. * * The function names below reflect these operations. */ /* Functions that read and validate content from consumer rings. */ static inline void __xskq_cons_read_addr_unchecked(struct xsk_queue *q, u32 cached_cons, u64 *addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; u32 idx = cached_cons & q->ring_mask; *addr = ring->desc[idx]; } static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) { if (q->cached_cons != q->cached_prod) { __xskq_cons_read_addr_unchecked(q, q->cached_cons, addr); return true; } return false; } static inline bool xp_unused_options_set(u32 options) { return options & ~(XDP_PKT_CONTD | XDP_TX_METADATA); } static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { u64 addr = desc->addr - pool->tx_metadata_len; u64 len = desc->len + pool->tx_metadata_len; u64 offset = addr & (pool->chunk_size - 1); if (!desc->len) return false; if (offset + len > pool->chunk_size) return false; if (addr >= pool->addrs_cnt) return false; if (xp_unused_options_set(desc->options)) return false; return true; } static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len; u64 len = desc->len + pool->tx_metadata_len; if (!desc->len) return false; if (len > pool->chunk_size) return false; if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt || xp_desc_crosses_non_contig_pg(pool, addr, len)) return false; if (xp_unused_options_set(desc->options)) return false; return true; } static inline bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) : xp_aligned_validate_desc(pool, desc); } static inline bool xskq_has_descs(struct xsk_queue *q) { return q->cached_cons != q->cached_prod; } static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, struct xsk_buff_pool *pool) { if (!xp_validate_desc(pool, d)) { q->invalid_descs++; return false; } return true; } static inline bool xskq_cons_read_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xsk_buff_pool *pool) { if (q->cached_cons != q->cached_prod) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = q->cached_cons & q->ring_mask; *desc = ring->desc[idx]; return xskq_cons_is_valid_desc(q, desc, pool); } q->queue_empty_descs++; return false; } static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) { q->cached_cons += cnt; } static inline void parse_desc(struct xsk_queue *q, struct xsk_buff_pool *pool, struct xdp_desc *desc, struct parsed_desc *parsed) { parsed->valid = xskq_cons_is_valid_desc(q, desc, pool); parsed->mb = xp_mb_desc(desc); } static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, u32 max) { u32 cached_cons = q->cached_cons, nb_entries = 0; struct xdp_desc *descs = pool->tx_descs; u32 total_descs = 0, nr_frags = 0; /* track first entry, if stumble upon *any* invalid descriptor, rewind * current packet that consists of frags and stop the processing */ while (cached_cons != q->cached_prod && nb_entries < max) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = cached_cons & q->ring_mask; struct parsed_desc parsed; descs[nb_entries] = ring->desc[idx]; cached_cons++; parse_desc(q, pool, &descs[nb_entries], &parsed); if (unlikely(!parsed.valid)) break; if (likely(!parsed.mb)) { total_descs += (nr_frags + 1); nr_frags = 0; } else { nr_frags++; if (nr_frags == pool->xdp_zc_max_segs) { nr_frags = 0; break; } } nb_entries++; } cached_cons -= nr_frags; /* Release valid plus any invalid entries */ xskq_cons_release_n(q, cached_cons - q->cached_cons); return total_descs; } /* Functions for consumers */ static inline void __xskq_cons_release(struct xsk_queue *q) { smp_store_release(&q->ring->consumer, q->cached_cons); /* D, matchees A */ } static inline void __xskq_cons_peek(struct xsk_queue *q) { /* Refresh the local pointer */ q->cached_prod = smp_load_acquire(&q->ring->producer); /* C, matches B */ } static inline void xskq_cons_get_entries(struct xsk_queue *q) { __xskq_cons_release(q); __xskq_cons_peek(q); } static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max) { u32 entries = q->cached_prod - q->cached_cons; if (entries >= max) return max; __xskq_cons_peek(q); entries = q->cached_prod - q->cached_cons; return entries >= max ? max : entries; } static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) { if (q->cached_prod == q->cached_cons) xskq_cons_get_entries(q); return xskq_cons_read_addr_unchecked(q, addr); } static inline bool xskq_cons_peek_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xsk_buff_pool *pool) { if (q->cached_prod == q->cached_cons) xskq_cons_get_entries(q); return xskq_cons_read_desc(q, desc, pool); } /* To improve performance in the xskq_cons_release functions, only update local state here. * Reflect this to global state when we get new entries from the ring in * xskq_cons_get_entries() and whenever Rx or Tx processing are completed in the NAPI loop. */ static inline void xskq_cons_release(struct xsk_queue *q) { q->cached_cons++; } static inline void xskq_cons_cancel_n(struct xsk_queue *q, u32 cnt) { q->cached_cons -= cnt; } static inline u32 xskq_cons_present_entries(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer); } /* Functions for producers */ static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) { u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); if (free_entries >= max) return max; /* Refresh the local tail pointer */ q->cached_cons = READ_ONCE(q->ring->consumer); free_entries = q->nentries - (q->cached_prod - q->cached_cons); return free_entries >= max ? max : free_entries; } static inline bool xskq_prod_is_full(struct xsk_queue *q) { return xskq_prod_nb_free(q, 1) ? false : true; } static inline void xskq_prod_cancel_n(struct xsk_queue *q, u32 cnt) { q->cached_prod -= cnt; } static inline int xskq_prod_reserve(struct xsk_queue *q) { if (xskq_prod_is_full(q)) return -ENOSPC; /* A, matches D */ q->cached_prod++; return 0; } static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; if (xskq_prod_is_full(q)) return -ENOSPC; /* A, matches D */ ring->desc[q->cached_prod++ & q->ring_mask] = addr; return 0; } static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, u32 nb_entries) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; u32 i, cached_prod; /* A, matches D */ cached_prod = q->cached_prod; for (i = 0; i < nb_entries; i++) ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr; q->cached_prod = cached_prod; } static inline int xskq_prod_reserve_desc(struct xsk_queue *q, u64 addr, u32 len, u32 flags) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx; if (xskq_prod_is_full(q)) return -ENOBUFS; /* A, matches D */ idx = q->cached_prod++ & q->ring_mask; ring->desc[idx].addr = addr; ring->desc[idx].len = len; ring->desc[idx].options = flags; return 0; } static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx) { smp_store_release(&q->ring->producer, idx); /* B, matches C */ } static inline void xskq_prod_submit(struct xsk_queue *q) { __xskq_prod_submit(q, q->cached_prod); } static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) { __xskq_prod_submit(q, q->ring->producer + nb_entries); } static inline bool xskq_prod_is_empty(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer); } /* For both producers and consumers */ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) { return q ? q->invalid_descs : 0; } static inline u64 xskq_nb_queue_empty_descs(struct xsk_queue *q) { return q ? q->queue_empty_descs : 0; } struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); #endif /* _LINUX_XSK_QUEUE_H */
92 92 92 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/ptrace.c * * (C) Copyright 1999 Linus Torvalds * * Common interfaces for "ptrace()" which we do not want * to continually duplicate across every architecture. */ #include <linux/capability.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/task.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/ptrace.h> #include <linux/security.h> #include <linux/signal.h> #include <linux/uio.h> #include <linux/audit.h> #include <linux/pid_namespace.h> #include <linux/syscalls.h> #include <linux/uaccess.h> #include <linux/regset.h> #include <linux/hw_breakpoint.h> #include <linux/cn_proc.h> #include <linux/compat.h> #include <linux/sched/signal.h> #include <linux/minmax.h> #include <linux/syscall_user_dispatch.h> #include <asm/syscall.h> /* for syscall_get_* */ /* * Access another process' address space via ptrace. * Source/target buffer must be kernel space, * Do not walk the page table directly, use get_user_pages */ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct mm_struct *mm; int ret; mm = get_task_mm(tsk); if (!mm) return 0; if (!tsk->ptrace || (current != tsk->parent) || ((get_dumpable(mm) != SUID_DUMP_USER) && !ptracer_capable(tsk, mm->user_ns))) { mmput(mm); return 0; } ret = access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return ret; } void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, const struct cred *ptracer_cred) { BUG_ON(!list_empty(&child->ptrace_entry)); list_add(&child->ptrace_entry, &new_parent->ptraced); child->parent = new_parent; child->ptracer_cred = get_cred(ptracer_cred); } /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. * * Must be called with the tasklist lock write-held. */ static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) { __ptrace_link(child, new_parent, current_cred()); } /** * __ptrace_unlink - unlink ptracee and restore its execution state * @child: ptracee to be unlinked * * Remove @child from the ptrace list, move it back to the original parent, * and restore the execution state so that it conforms to the group stop * state. * * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer * exiting. For PTRACE_DETACH, unless the ptracee has been killed between * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED. * If the ptracer is exiting, the ptracee can be in any state. * * After detach, the ptracee should be in a state which conforms to the * group stop. If the group is stopped or in the process of stopping, the * ptracee should be put into TASK_STOPPED; otherwise, it should be woken * up from TASK_TRACED. * * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED, * it goes through TRACED -> RUNNING -> STOPPED transition which is similar * to but in the opposite direction of what happens while attaching to a * stopped task. However, in this direction, the intermediate RUNNING * state is not hidden even from the current ptracer and if it immediately * re-attaches and performs a WNOHANG wait(2), it may fail. * * CONTEXT: * write_lock_irq(tasklist_lock) */ void __ptrace_unlink(struct task_struct *child) { const struct cred *old_cred; BUG_ON(!child->ptrace); clear_task_syscall_work(child, SYSCALL_TRACE); #if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) clear_task_syscall_work(child, SYSCALL_EMU); #endif child->parent = child->real_parent; list_del_init(&child->ptrace_entry); old_cred = child->ptracer_cred; child->ptracer_cred = NULL; put_cred(old_cred); spin_lock(&child->sighand->siglock); child->ptrace = 0; /* * Clear all pending traps and TRAPPING. TRAPPING should be * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly. */ task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK); task_clear_jobctl_trapping(child); /* * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and * @child isn't dead. */ if (!(child->flags & PF_EXITING) && (child->signal->flags & SIGNAL_STOP_STOPPED || child->signal->group_stop_count)) child->jobctl |= JOBCTL_STOP_PENDING; /* * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick * @child in the butt. Note that @resume should be used iff @child * is in TASK_TRACED; otherwise, we might unduly disrupt * TASK_KILLABLE sleeps. */ if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) ptrace_signal_wake_up(child, true); spin_unlock(&child->sighand->siglock); } static bool looks_like_a_spurious_pid(struct task_struct *task) { if (task->exit_code != ((PTRACE_EVENT_EXEC << 8) | SIGTRAP)) return false; if (task_pid_vnr(task) == task->ptrace_message) return false; /* * The tracee changed its pid but the PTRACE_EVENT_EXEC event * was not wait()'ed, most probably debugger targets the old * leader which was destroyed in de_thread(). */ return true; } /* * Ensure that nothing can wake it up, even SIGKILL * * A task is switched to this state while a ptrace operation is in progress; * such that the ptrace operation is uninterruptible. */ static bool ptrace_freeze_traced(struct task_struct *task) { bool ret = false; /* Lockless, nobody but us can set this flag */ if (task->jobctl & JOBCTL_LISTENING) return ret; spin_lock_irq(&task->sighand->siglock); if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && !__fatal_signal_pending(task)) { task->jobctl |= JOBCTL_PTRACE_FROZEN; ret = true; } spin_unlock_irq(&task->sighand->siglock); return ret; } static void ptrace_unfreeze_traced(struct task_struct *task) { unsigned long flags; /* * The child may be awake and may have cleared * JOBCTL_PTRACE_FROZEN (see ptrace_resume). The child will * not set JOBCTL_PTRACE_FROZEN or enter __TASK_TRACED anew. */ if (lock_task_sighand(task, &flags)) { task->jobctl &= ~JOBCTL_PTRACE_FROZEN; if (__fatal_signal_pending(task)) { task->jobctl &= ~JOBCTL_TRACED; wake_up_state(task, __TASK_TRACED); } unlock_task_sighand(task, &flags); } } /** * ptrace_check_attach - check whether ptracee is ready for ptrace operation * @child: ptracee to check for * @ignore_state: don't check whether @child is currently %TASK_TRACED * * Check whether @child is being ptraced by %current and ready for further * ptrace operations. If @ignore_state is %false, @child also should be in * %TASK_TRACED state and on return the child is guaranteed to be traced * and not executing. If @ignore_state is %true, @child can be in any * state. * * CONTEXT: * Grabs and releases tasklist_lock and @child->sighand->siglock. * * RETURNS: * 0 on success, -ESRCH if %child is not ready. */ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) { int ret = -ESRCH; /* * We take the read lock around doing both checks to close a * possible race where someone else was tracing our child and * detached between these two checks. After this locked check, * we are sure that this is our traced child and that can only * be changed by us so it's not changing right after this. */ read_lock(&tasklist_lock); if (child->ptrace && child->parent == current) { /* * child->sighand can't be NULL, release_task() * does ptrace_unlink() before __exit_signal(). */ if (ignore_state || ptrace_freeze_traced(child)) ret = 0; } read_unlock(&tasklist_lock); if (!ret && !ignore_state && WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED|TASK_FROZEN))) ret = -ESRCH; return ret; } static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode) { if (mode & PTRACE_MODE_NOAUDIT) return ns_capable_noaudit(ns, CAP_SYS_PTRACE); return ns_capable(ns, CAP_SYS_PTRACE); } /* Returns 0 on success, -errno on denial. */ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; struct mm_struct *mm; kuid_t caller_uid; kgid_t caller_gid; if (!(mode & PTRACE_MODE_FSCREDS) == !(mode & PTRACE_MODE_REALCREDS)) { WARN(1, "denying ptrace access check without PTRACE_MODE_*CREDS\n"); return -EPERM; } /* May we inspect the given task? * This check is used both for attaching with ptrace * and for allowing access to sensitive information in /proc. * * ptrace_attach denies several cases that /proc allows * because setting up the necessary parent/child relationship * or halting the specified task is impossible. */ /* Don't let security modules deny introspection */ if (same_thread_group(task, current)) return 0; rcu_read_lock(); if (mode & PTRACE_MODE_FSCREDS) { caller_uid = cred->fsuid; caller_gid = cred->fsgid; } else { /* * Using the euid would make more sense here, but something * in userland might rely on the old behavior, and this * shouldn't be a security problem since * PTRACE_MODE_REALCREDS implies that the caller explicitly * used a syscall that requests access to another process * (and not a filesystem syscall to procfs). */ caller_uid = cred->uid; caller_gid = cred->gid; } tcred = __task_cred(task); if (uid_eq(caller_uid, tcred->euid) && uid_eq(caller_uid, tcred->suid) && uid_eq(caller_uid, tcred->uid) && gid_eq(caller_gid, tcred->egid) && gid_eq(caller_gid, tcred->sgid) && gid_eq(caller_gid, tcred->gid)) goto ok; if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; rcu_read_unlock(); return -EPERM; ok: rcu_read_unlock(); /* * If a task drops privileges and becomes nondumpable (through a syscall * like setresuid()) while we are trying to access it, we must ensure * that the dumpability is read after the credentials; otherwise, * we may be able to attach to a task that we shouldn't be able to * attach to (as if the task had dropped privileges without becoming * nondumpable). * Pairs with a write barrier in commit_creds(). */ smp_rmb(); mm = task->mm; if (mm && ((get_dumpable(mm) != SUID_DUMP_USER) && !ptrace_has_cap(mm->user_ns, mode))) return -EPERM; return security_ptrace_access_check(task, mode); } bool ptrace_may_access(struct task_struct *task, unsigned int mode) { int err; task_lock(task); err = __ptrace_may_access(task, mode); task_unlock(task); return !err; } static int check_ptrace_options(unsigned long data) { if (data & ~(unsigned long)PTRACE_O_MASK) return -EINVAL; if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) || !IS_ENABLED(CONFIG_SECCOMP)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED || current->ptrace & PT_SUSPEND_SECCOMP) return -EPERM; } return 0; } static inline void ptrace_set_stopped(struct task_struct *task, bool seize) { guard(spinlock)(&task->sighand->siglock); /* SEIZE doesn't trap tracee on attach */ if (!seize) send_signal_locked(SIGSTOP, SEND_SIG_PRIV, task, PIDTYPE_PID); /* * If the task is already STOPPED, set JOBCTL_TRAP_STOP and * TRAPPING, and kick it so that it transits to TRACED. TRAPPING * will be cleared if the child completes the transition or any * event which clears the group stop states happens. We'll wait * for the transition to complete before returning from this * function. * * This hides STOPPED -> RUNNING -> TRACED transition from the * attaching thread but a different thread in the same group can * still observe the transient RUNNING state. IOW, if another * thread's WNOHANG wait(2) on the stopped tracee races against * ATTACH, the wait(2) may fail due to the transient RUNNING. * * The following task_is_stopped() test is safe as both transitions * in and out of STOPPED are protected by siglock. */ if (task_is_stopped(task) && task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) { task->jobctl &= ~JOBCTL_STOPPED; signal_wake_up_state(task, __TASK_STOPPED); } } static int ptrace_attach(struct task_struct *task, long request, unsigned long addr, unsigned long flags) { bool seize = (request == PTRACE_SEIZE); int retval; if (seize) { if (addr != 0) return -EIO; /* * This duplicates the check in check_ptrace_options() because * ptrace_attach() and ptrace_setoptions() have historically * used different error codes for unknown ptrace options. */ if (flags & ~(unsigned long)PTRACE_O_MASK) return -EIO; retval = check_ptrace_options(flags); if (retval) return retval; flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT); } else { flags = PT_PTRACED; } audit_ptrace(task); if (unlikely(task->flags & PF_KTHREAD)) return -EPERM; if (same_thread_group(task, current)) return -EPERM; /* * Protect exec's credential calculations against our interference; * SUID, SGID and LSM creds get determined differently * under ptrace. */ scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR, &task->signal->cred_guard_mutex) { scoped_guard (task_lock, task) { retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); if (retval) return retval; } scoped_guard (write_lock_irq, &tasklist_lock) { if (unlikely(task->exit_state)) return -EPERM; if (task->ptrace) return -EPERM; task->ptrace = flags; ptrace_link(task, current); ptrace_set_stopped(task, seize); } } /* * We do not bother to change retval or clear JOBCTL_TRAPPING * if wait_on_bit() was interrupted by SIGKILL. The tracer will * not return to user-mode, it will exit and clear this bit in * __ptrace_unlink() if it wasn't already cleared by the tracee; * and until then nobody can ptrace this task. */ wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE); proc_ptrace_connector(task, PTRACE_ATTACH); return 0; } /** * ptrace_traceme -- helper for PTRACE_TRACEME * * Performs checks and sets PT_PTRACED. * Should be used by all ptrace implementations for PTRACE_TRACEME. */ static int ptrace_traceme(void) { int ret = -EPERM; write_lock_irq(&tasklist_lock); /* Are we already being traced? */ if (!current->ptrace) { ret = security_ptrace_traceme(current->parent); /* * Check PF_EXITING to ensure ->real_parent has not passed * exit_ptrace(). Otherwise we don't report the error but * pretend ->real_parent untraces us right after return. */ if (!ret && !(current->real_parent->flags & PF_EXITING)) { current->ptrace = PT_PTRACED; ptrace_link(current, current->real_parent); } } write_unlock_irq(&tasklist_lock); return ret; } /* * Called with irqs disabled, returns true if childs should reap themselves. */ static int ignoring_children(struct sighand_struct *sigh) { int ret; spin_lock(&sigh->siglock); ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) || (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT); spin_unlock(&sigh->siglock); return ret; } /* * Called with tasklist_lock held for writing. * Unlink a traced task, and clean it up if it was a traced zombie. * Return true if it needs to be reaped with release_task(). * (We can't call release_task() here because we already hold tasklist_lock.) * * If it's a zombie, our attachedness prevented normal parent notification * or self-reaping. Do notification now if it would have happened earlier. * If it should reap itself, return true. * * If it's our own child, there is no notification to do. But if our normal * children self-reap, then this child was prevented by ptrace and we must * reap it now, in that case we must also wake up sub-threads sleeping in * do_wait(). */ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) { bool dead; __ptrace_unlink(p); if (p->exit_state != EXIT_ZOMBIE) return false; dead = !thread_group_leader(p); if (!dead && thread_group_empty(p)) { if (!same_thread_group(p->real_parent, tracer)) dead = do_notify_parent(p, p->exit_signal); else if (ignoring_children(tracer->sighand)) { __wake_up_parent(p, tracer); dead = true; } } /* Mark it as in the process of being reaped. */ if (dead) p->exit_state = EXIT_DEAD; return dead; } static int ptrace_detach(struct task_struct *child, unsigned int data) { if (!valid_signal(data)) return -EIO; /* Architecture-specific hardware disable .. */ ptrace_disable(child); write_lock_irq(&tasklist_lock); /* * We rely on ptrace_freeze_traced(). It can't be killed and * untraced by another thread, it can't be a zombie. */ WARN_ON(!child->ptrace || child->exit_state); /* * tasklist_lock avoids the race with wait_task_stopped(), see * the comment in ptrace_resume(). */ child->exit_code = data; __ptrace_detach(current, child); write_unlock_irq(&tasklist_lock); proc_ptrace_connector(child, PTRACE_DETACH); return 0; } /* * Detach all tasks we were using ptrace on. Called with tasklist held * for writing. */ void exit_ptrace(struct task_struct *tracer, struct list_head *dead) { struct task_struct *p, *n; list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { if (unlikely(p->ptrace & PT_EXITKILL)) send_sig_info(SIGKILL, SEND_SIG_PRIV, p); if (__ptrace_detach(tracer, p)) list_add(&p->ptrace_entry, dead); } } int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) { int copied = 0; while (len > 0) { char buf[128]; int this_len, retval; this_len = (len > sizeof(buf)) ? sizeof(buf) : len; retval = ptrace_access_vm(tsk, src, buf, this_len, FOLL_FORCE); if (!retval) { if (copied) break; return -EIO; } if (copy_to_user(dst, buf, retval)) return -EFAULT; copied += retval; src += retval; dst += retval; len -= retval; } return copied; } int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len) { int copied = 0; while (len > 0) { char buf[128]; int this_len, retval; this_len = (len > sizeof(buf)) ? sizeof(buf) : len; if (copy_from_user(buf, src, this_len)) return -EFAULT; retval = ptrace_access_vm(tsk, dst, buf, this_len, FOLL_FORCE | FOLL_WRITE); if (!retval) { if (copied) break; return -EIO; } copied += retval; src += retval; dst += retval; len -= retval; } return copied; } static int ptrace_setoptions(struct task_struct *child, unsigned long data) { unsigned flags; int ret; ret = check_ptrace_options(data); if (ret) return ret; /* Avoid intermediate state when all opts are cleared */ flags = child->ptrace; flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); flags |= (data << PT_OPT_FLAG_SHIFT); child->ptrace = flags; return 0; } static int ptrace_getsiginfo(struct task_struct *child, kernel_siginfo_t *info) { unsigned long flags; int error = -ESRCH; if (lock_task_sighand(child, &flags)) { error = -EINVAL; if (likely(child->last_siginfo != NULL)) { copy_siginfo(info, child->last_siginfo); error = 0; } unlock_task_sighand(child, &flags); } return error; } static int ptrace_setsiginfo(struct task_struct *child, const kernel_siginfo_t *info) { unsigned long flags; int error = -ESRCH; if (lock_task_sighand(child, &flags)) { error = -EINVAL; if (likely(child->last_siginfo != NULL)) { copy_siginfo(child->last_siginfo, info); error = 0; } unlock_task_sighand(child, &flags); } return error; } static int ptrace_peek_siginfo(struct task_struct *child, unsigned long addr, unsigned long data) { struct ptrace_peeksiginfo_args arg; struct sigpending *pending; struct sigqueue *q; int ret, i; ret = copy_from_user(&arg, (void __user *) addr, sizeof(struct ptrace_peeksiginfo_args)); if (ret) return -EFAULT; if (arg.flags & ~PTRACE_PEEKSIGINFO_SHARED) return -EINVAL; /* unknown flags */ if (arg.nr < 0) return -EINVAL; /* Ensure arg.off fits in an unsigned long */ if (arg.off > ULONG_MAX) return 0; if (arg.flags & PTRACE_PEEKSIGINFO_SHARED) pending = &child->signal->shared_pending; else pending = &child->pending; for (i = 0; i < arg.nr; ) { kernel_siginfo_t info; unsigned long off = arg.off + i; bool found = false; spin_lock_irq(&child->sighand->siglock); list_for_each_entry(q, &pending->list, list) { if (!off--) { found = true; copy_siginfo(&info, &q->info); break; } } spin_unlock_irq(&child->sighand->siglock); if (!found) /* beyond the end of the list */ break; #ifdef CONFIG_COMPAT if (unlikely(in_compat_syscall())) { compat_siginfo_t __user *uinfo = compat_ptr(data); if (copy_siginfo_to_user32(uinfo, &info)) { ret = -EFAULT; break; } } else #endif { siginfo_t __user *uinfo = (siginfo_t __user *) data; if (copy_siginfo_to_user(uinfo, &info)) { ret = -EFAULT; break; } } data += sizeof(siginfo_t); i++; if (signal_pending(current)) break; cond_resched(); } if (i > 0) return i; return ret; } #ifdef CONFIG_RSEQ static long ptrace_get_rseq_configuration(struct task_struct *task, unsigned long size, void __user *data) { struct ptrace_rseq_configuration conf = { .rseq_abi_pointer = (u64)(uintptr_t)task->rseq, .rseq_abi_size = task->rseq_len, .signature = task->rseq_sig, .flags = 0, }; size = min_t(unsigned long, size, sizeof(conf)); if (copy_to_user(data, &conf, size)) return -EFAULT; return sizeof(conf); } #endif #define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) #ifdef PTRACE_SINGLEBLOCK #define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK) #else #define is_singleblock(request) 0 #endif #ifdef PTRACE_SYSEMU #define is_sysemu_singlestep(request) ((request) == PTRACE_SYSEMU_SINGLESTEP) #else #define is_sysemu_singlestep(request) 0 #endif static int ptrace_resume(struct task_struct *child, long request, unsigned long data) { if (!valid_signal(data)) return -EIO; if (request == PTRACE_SYSCALL) set_task_syscall_work(child, SYSCALL_TRACE); else clear_task_syscall_work(child, SYSCALL_TRACE); #if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) set_task_syscall_work(child, SYSCALL_EMU); else clear_task_syscall_work(child, SYSCALL_EMU); #endif if (is_singleblock(request)) { if (unlikely(!arch_has_block_step())) return -EIO; user_enable_block_step(child); } else if (is_singlestep(request) || is_sysemu_singlestep(request)) { if (unlikely(!arch_has_single_step())) return -EIO; user_enable_single_step(child); } else { user_disable_single_step(child); } /* * Change ->exit_code and ->state under siglock to avoid the race * with wait_task_stopped() in between; a non-zero ->exit_code will * wrongly look like another report from tracee. * * Note that we need siglock even if ->exit_code == data and/or this * status was not reported yet, the new status must not be cleared by * wait_task_stopped() after resume. */ spin_lock_irq(&child->sighand->siglock); child->exit_code = data; child->jobctl &= ~JOBCTL_TRACED; wake_up_state(child, __TASK_TRACED); spin_unlock_irq(&child->sighand->siglock); return 0; } #ifdef CONFIG_HAVE_ARCH_TRACEHOOK static const struct user_regset * find_regset(const struct user_regset_view *view, unsigned int type) { const struct user_regset *regset; int n; for (n = 0; n < view->n; ++n) { regset = view->regsets + n; if (regset->core_note_type == type) return regset; } return NULL; } static int ptrace_regset(struct task_struct *task, int req, unsigned int type, struct iovec *kiov) { const struct user_regset_view *view = task_user_regset_view(task); const struct user_regset *regset = find_regset(view, type); int regset_no; if (!regset || (kiov->iov_len % regset->size) != 0) return -EINVAL; regset_no = regset - view->regsets; kiov->iov_len = min(kiov->iov_len, (__kernel_size_t) (regset->n * regset->size)); if (req == PTRACE_GETREGSET) return copy_regset_to_user(task, view, regset_no, 0, kiov->iov_len, kiov->iov_base); else return copy_regset_from_user(task, view, regset_no, 0, kiov->iov_len, kiov->iov_base); } /* * This is declared in linux/regset.h and defined in machine-dependent * code. We put the export here, near the primary machine-neutral use, * to ensure no machine forgets it. */ EXPORT_SYMBOL_GPL(task_user_regset_view); static unsigned long ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, struct ptrace_syscall_info *info) { unsigned long args[ARRAY_SIZE(info->entry.args)]; int i; info->op = PTRACE_SYSCALL_INFO_ENTRY; info->entry.nr = syscall_get_nr(child, regs); syscall_get_arguments(child, regs, args); for (i = 0; i < ARRAY_SIZE(args); i++) info->entry.args[i] = args[i]; /* args is the last field in struct ptrace_syscall_info.entry */ return offsetofend(struct ptrace_syscall_info, entry.args); } static unsigned long ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, struct ptrace_syscall_info *info) { /* * As struct ptrace_syscall_info.entry is currently a subset * of struct ptrace_syscall_info.seccomp, it makes sense to * initialize that subset using ptrace_get_syscall_info_entry(). * This can be reconsidered in the future if these structures * diverge significantly enough. */ ptrace_get_syscall_info_entry(child, regs, info); info->op = PTRACE_SYSCALL_INFO_SECCOMP; info->seccomp.ret_data = child->ptrace_message; /* ret_data is the last field in struct ptrace_syscall_info.seccomp */ return offsetofend(struct ptrace_syscall_info, seccomp.ret_data); } static unsigned long ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, struct ptrace_syscall_info *info) { info->op = PTRACE_SYSCALL_INFO_EXIT; info->exit.rval = syscall_get_error(child, regs); info->exit.is_error = !!info->exit.rval; if (!info->exit.is_error) info->exit.rval = syscall_get_return_value(child, regs); /* is_error is the last field in struct ptrace_syscall_info.exit */ return offsetofend(struct ptrace_syscall_info, exit.is_error); } static int ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, void __user *datavp) { struct pt_regs *regs = task_pt_regs(child); struct ptrace_syscall_info info = { .op = PTRACE_SYSCALL_INFO_NONE, .arch = syscall_get_arch(child), .instruction_pointer = instruction_pointer(regs), .stack_pointer = user_stack_pointer(regs), }; unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); unsigned long write_size; /* * This does not need lock_task_sighand() to access * child->last_siginfo because ptrace_freeze_traced() * called earlier by ptrace_check_attach() ensures that * the tracee cannot go away and clear its last_siginfo. */ switch (child->last_siginfo ? child->last_siginfo->si_code : 0) { case SIGTRAP | 0x80: switch (child->ptrace_message) { case PTRACE_EVENTMSG_SYSCALL_ENTRY: actual_size = ptrace_get_syscall_info_entry(child, regs, &info); break; case PTRACE_EVENTMSG_SYSCALL_EXIT: actual_size = ptrace_get_syscall_info_exit(child, regs, &info); break; } break; case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8): actual_size = ptrace_get_syscall_info_seccomp(child, regs, &info); break; } write_size = min(actual_size, user_size); return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size; } #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data) { bool seized = child->ptrace & PT_SEIZED; int ret = -EIO; kernel_siginfo_t siginfo, *si; void __user *datavp = (void __user *) data; unsigned long __user *datalp = datavp; unsigned long flags; switch (request) { case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: return generic_ptrace_peekdata(child, addr, data); case PTRACE_POKETEXT: case PTRACE_POKEDATA: return generic_ptrace_pokedata(child, addr, data); #ifdef PTRACE_OLDSETOPTIONS case PTRACE_OLDSETOPTIONS: #endif case PTRACE_SETOPTIONS: ret = ptrace_setoptions(child, data); break; case PTRACE_GETEVENTMSG: ret = put_user(child->ptrace_message, datalp); break; case PTRACE_PEEKSIGINFO: ret = ptrace_peek_siginfo(child, addr, data); break; case PTRACE_GETSIGINFO: ret = ptrace_getsiginfo(child, &siginfo); if (!ret) ret = copy_siginfo_to_user(datavp, &siginfo); break; case PTRACE_SETSIGINFO: ret = copy_siginfo_from_user(&siginfo, datavp); if (!ret) ret = ptrace_setsiginfo(child, &siginfo); break; case PTRACE_GETSIGMASK: { sigset_t *mask; if (addr != sizeof(sigset_t)) { ret = -EINVAL; break; } if (test_tsk_restore_sigmask(child)) mask = &child->saved_sigmask; else mask = &child->blocked; if (copy_to_user(datavp, mask, sizeof(sigset_t))) ret = -EFAULT; else ret = 0; break; } case PTRACE_SETSIGMASK: { sigset_t new_set; if (addr != sizeof(sigset_t)) { ret = -EINVAL; break; } if (copy_from_user(&new_set, datavp, sizeof(sigset_t))) { ret = -EFAULT; break; } sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); /* * Every thread does recalc_sigpending() after resume, so * retarget_shared_pending() and recalc_sigpending() are not * called here. */ spin_lock_irq(&child->sighand->siglock); child->blocked = new_set; spin_unlock_irq(&child->sighand->siglock); clear_tsk_restore_sigmask(child); ret = 0; break; } case PTRACE_INTERRUPT: /* * Stop tracee without any side-effect on signal or job * control. At least one trap is guaranteed to happen * after this request. If @child is already trapped, the * current trap is not disturbed and another trap will * happen after the current trap is ended with PTRACE_CONT. * * The actual trap might not be PTRACE_EVENT_STOP trap but * the pending condition is cleared regardless. */ if (unlikely(!seized || !lock_task_sighand(child, &flags))) break; /* * INTERRUPT doesn't disturb existing trap sans one * exception. If ptracer issued LISTEN for the current * STOP, this INTERRUPT should clear LISTEN and re-trap * tracee into STOP. */ if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) ptrace_signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); unlock_task_sighand(child, &flags); ret = 0; break; case PTRACE_LISTEN: /* * Listen for events. Tracee must be in STOP. It's not * resumed per-se but is not considered to be in TRACED by * wait(2) or ptrace(2). If an async event (e.g. group * stop state change) happens, tracee will enter STOP trap * again. Alternatively, ptracer can issue INTERRUPT to * finish listening and re-trap tracee into STOP. */ if (unlikely(!seized || !lock_task_sighand(child, &flags))) break; si = child->last_siginfo; if (likely(si && (si->si_code >> 8) == PTRACE_EVENT_STOP)) { child->jobctl |= JOBCTL_LISTENING; /* * If NOTIFY is set, it means event happened between * start of this trap and now. Trigger re-trap. */ if (child->jobctl & JOBCTL_TRAP_NOTIFY) ptrace_signal_wake_up(child, true); ret = 0; } unlock_task_sighand(child, &flags); break; case PTRACE_DETACH: /* detach a process that was attached. */ ret = ptrace_detach(child, data); break; #ifdef CONFIG_BINFMT_ELF_FDPIC case PTRACE_GETFDPIC: { struct mm_struct *mm = get_task_mm(child); unsigned long tmp = 0; ret = -ESRCH; if (!mm) break; switch (addr) { case PTRACE_GETFDPIC_EXEC: tmp = mm->context.exec_fdpic_loadmap; break; case PTRACE_GETFDPIC_INTERP: tmp = mm->context.interp_fdpic_loadmap; break; default: break; } mmput(mm); ret = put_user(tmp, datalp); break; } #endif case PTRACE_SINGLESTEP: #ifdef PTRACE_SINGLEBLOCK case PTRACE_SINGLEBLOCK: #endif #ifdef PTRACE_SYSEMU case PTRACE_SYSEMU: case PTRACE_SYSEMU_SINGLESTEP: #endif case PTRACE_SYSCALL: case PTRACE_CONT: return ptrace_resume(child, request, data); case PTRACE_KILL: send_sig_info(SIGKILL, SEND_SIG_NOINFO, child); return 0; #ifdef CONFIG_HAVE_ARCH_TRACEHOOK case PTRACE_GETREGSET: case PTRACE_SETREGSET: { struct iovec kiov; struct iovec __user *uiov = datavp; if (!access_ok(uiov, sizeof(*uiov))) return -EFAULT; if (__get_user(kiov.iov_base, &uiov->iov_base) || __get_user(kiov.iov_len, &uiov->iov_len)) return -EFAULT; ret = ptrace_regset(child, request, addr, &kiov); if (!ret) ret = __put_user(kiov.iov_len, &uiov->iov_len); break; } case PTRACE_GET_SYSCALL_INFO: ret = ptrace_get_syscall_info(child, addr, datavp); break; #endif case PTRACE_SECCOMP_GET_FILTER: ret = seccomp_get_filter(child, addr, datavp); break; case PTRACE_SECCOMP_GET_METADATA: ret = seccomp_get_metadata(child, addr, datavp); break; #ifdef CONFIG_RSEQ case PTRACE_GET_RSEQ_CONFIGURATION: ret = ptrace_get_rseq_configuration(child, addr, datavp); break; #endif case PTRACE_SET_SYSCALL_USER_DISPATCH_CONFIG: ret = syscall_user_dispatch_set_config(child, addr, datavp); break; case PTRACE_GET_SYSCALL_USER_DISPATCH_CONFIG: ret = syscall_user_dispatch_get_config(child, addr, datavp); break; default: break; } return ret; } SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, unsigned long, data) { struct task_struct *child; long ret; if (request == PTRACE_TRACEME) { ret = ptrace_traceme(); goto out; } child = find_get_task_by_vpid(pid); if (!child) { ret = -ESRCH; goto out; } if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { ret = ptrace_attach(child, request, addr, data); goto out_put_task_struct; } ret = ptrace_check_attach(child, request == PTRACE_KILL || request == PTRACE_INTERRUPT); if (ret < 0) goto out_put_task_struct; ret = arch_ptrace(child, request, addr, data); if (ret || request != PTRACE_DETACH) ptrace_unfreeze_traced(child); out_put_task_struct: put_task_struct(child); out: return ret; } int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long data) { unsigned long tmp; int copied; copied = ptrace_access_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) return -EIO; return put_user(tmp, (unsigned long __user *)data); } int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, unsigned long data) { int copied; copied = ptrace_access_vm(tsk, addr, &data, sizeof(data), FOLL_FORCE | FOLL_WRITE); return (copied == sizeof(data)) ? 0 : -EIO; } #if defined CONFIG_COMPAT int compat_ptrace_request(struct task_struct *child, compat_long_t request, compat_ulong_t addr, compat_ulong_t data) { compat_ulong_t __user *datap = compat_ptr(data); compat_ulong_t word; kernel_siginfo_t siginfo; int ret; switch (request) { case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: ret = ptrace_access_vm(child, addr, &word, sizeof(word), FOLL_FORCE); if (ret != sizeof(word)) ret = -EIO; else ret = put_user(word, datap); break; case PTRACE_POKETEXT: case PTRACE_POKEDATA: ret = ptrace_access_vm(child, addr, &data, sizeof(data), FOLL_FORCE | FOLL_WRITE); ret = (ret != sizeof(data) ? -EIO : 0); break; case PTRACE_GETEVENTMSG: ret = put_user((compat_ulong_t) child->ptrace_message, datap); break; case PTRACE_GETSIGINFO: ret = ptrace_getsiginfo(child, &siginfo); if (!ret) ret = copy_siginfo_to_user32( (struct compat_siginfo __user *) datap, &siginfo); break; case PTRACE_SETSIGINFO: ret = copy_siginfo_from_user32( &siginfo, (struct compat_siginfo __user *) datap); if (!ret) ret = ptrace_setsiginfo(child, &siginfo); break; #ifdef CONFIG_HAVE_ARCH_TRACEHOOK case PTRACE_GETREGSET: case PTRACE_SETREGSET: { struct iovec kiov; struct compat_iovec __user *uiov = (struct compat_iovec __user *) datap; compat_uptr_t ptr; compat_size_t len; if (!access_ok(uiov, sizeof(*uiov))) return -EFAULT; if (__get_user(ptr, &uiov->iov_base) || __get_user(len, &uiov->iov_len)) return -EFAULT; kiov.iov_base = compat_ptr(ptr); kiov.iov_len = len; ret = ptrace_regset(child, request, addr, &kiov); if (!ret) ret = __put_user(kiov.iov_len, &uiov->iov_len); break; } #endif default: ret = ptrace_request(child, request, addr, data); } return ret; } COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid, compat_long_t, addr, compat_long_t, data) { struct task_struct *child; long ret; if (request == PTRACE_TRACEME) { ret = ptrace_traceme(); goto out; } child = find_get_task_by_vpid(pid); if (!child) { ret = -ESRCH; goto out; } if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { ret = ptrace_attach(child, request, addr, data); goto out_put_task_struct; } ret = ptrace_check_attach(child, request == PTRACE_KILL || request == PTRACE_INTERRUPT); if (!ret) { ret = compat_arch_ptrace(child, request, addr, data); if (ret || request != PTRACE_DETACH) ptrace_unfreeze_traced(child); } out_put_task_struct: put_task_struct(child); out: return ret; } #endif /* CONFIG_COMPAT */
3 3 3 2 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 // SPDX-License-Identifier: GPL-2.0-or-later /* * cgroups support for the BFQ I/O scheduler. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/cgroup.h> #include <linux/ktime.h> #include <linux/rbtree.h> #include <linux/ioprio.h> #include <linux/sbitmap.h> #include <linux/delay.h> #include "elevator.h" #include "bfq-iosched.h" #ifdef CONFIG_BFQ_CGROUP_DEBUG static int bfq_stat_init(struct bfq_stat *stat, gfp_t gfp) { int ret; ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp); if (ret) return ret; atomic64_set(&stat->aux_cnt, 0); return 0; } static void bfq_stat_exit(struct bfq_stat *stat) { percpu_counter_destroy(&stat->cpu_cnt); } /** * bfq_stat_add - add a value to a bfq_stat * @stat: target bfq_stat * @val: value to add * * Add @val to @stat. The caller must ensure that IRQ on the same CPU * don't re-enter this function for the same counter. */ static inline void bfq_stat_add(struct bfq_stat *stat, uint64_t val) { percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH); } /** * bfq_stat_read - read the current value of a bfq_stat * @stat: bfq_stat to read */ static inline uint64_t bfq_stat_read(struct bfq_stat *stat) { return percpu_counter_sum_positive(&stat->cpu_cnt); } /** * bfq_stat_reset - reset a bfq_stat * @stat: bfq_stat to reset */ static inline void bfq_stat_reset(struct bfq_stat *stat) { percpu_counter_set(&stat->cpu_cnt, 0); atomic64_set(&stat->aux_cnt, 0); } /** * bfq_stat_add_aux - add a bfq_stat into another's aux count * @to: the destination bfq_stat * @from: the source * * Add @from's count including the aux one to @to's aux count. */ static inline void bfq_stat_add_aux(struct bfq_stat *to, struct bfq_stat *from) { atomic64_add(bfq_stat_read(from) + atomic64_read(&from->aux_cnt), &to->aux_cnt); } /** * blkg_prfill_stat - prfill callback for bfq_stat * @sf: seq_file to print to * @pd: policy private data of interest * @off: offset to the bfq_stat in @pd * * prfill callback for printing a bfq_stat. */ static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off) { return __blkg_prfill_u64(sf, pd, bfq_stat_read((void *)pd + off)); } /* bfqg stats flags */ enum bfqg_stats_flags { BFQG_stats_waiting = 0, BFQG_stats_idling, BFQG_stats_empty, }; #define BFQG_FLAG_FNS(name) \ static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ { \ stats->flags |= (1 << BFQG_stats_##name); \ } \ static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ { \ stats->flags &= ~(1 << BFQG_stats_##name); \ } \ static int bfqg_stats_##name(struct bfqg_stats *stats) \ { \ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ } \ BFQG_FLAG_FNS(waiting) BFQG_FLAG_FNS(idling) BFQG_FLAG_FNS(empty) #undef BFQG_FLAG_FNS /* This should be called with the scheduler lock held. */ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) { u64 now; if (!bfqg_stats_waiting(stats)) return; now = blk_time_get_ns(); if (now > stats->start_group_wait_time) bfq_stat_add(&stats->group_wait_time, now - stats->start_group_wait_time); bfqg_stats_clear_waiting(stats); } /* This should be called with the scheduler lock held. */ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, struct bfq_group *curr_bfqg) { struct bfqg_stats *stats = &bfqg->stats; if (bfqg_stats_waiting(stats)) return; if (bfqg == curr_bfqg) return; stats->start_group_wait_time = blk_time_get_ns(); bfqg_stats_mark_waiting(stats); } /* This should be called with the scheduler lock held. */ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { u64 now; if (!bfqg_stats_empty(stats)) return; now = blk_time_get_ns(); if (now > stats->start_empty_time) bfq_stat_add(&stats->empty_time, now - stats->start_empty_time); bfqg_stats_clear_empty(stats); } void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { bfq_stat_add(&bfqg->stats.dequeue, 1); } void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; if (blkg_rwstat_total(&stats->queued)) return; /* * group is already marked empty. This can happen if bfqq got new * request in parent group and moved to this group while being added * to service tree. Just ignore the event and move on. */ if (bfqg_stats_empty(stats)) return; stats->start_empty_time = blk_time_get_ns(); bfqg_stats_mark_empty(stats); } void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; if (bfqg_stats_idling(stats)) { u64 now = blk_time_get_ns(); if (now > stats->start_idle_time) bfq_stat_add(&stats->idle_time, now - stats->start_idle_time); bfqg_stats_clear_idling(stats); } } void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; stats->start_idle_time = blk_time_get_ns(); bfqg_stats_mark_idling(stats); } void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; bfq_stat_add(&stats->avg_queue_size_sum, blkg_rwstat_total(&stats->queued)); bfq_stat_add(&stats->avg_queue_size_samples, 1); bfqg_stats_update_group_wait_time(stats); } void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, blk_opf_t opf) { blkg_rwstat_add(&bfqg->stats.queued, opf, 1); bfqg_stats_end_empty_time(&bfqg->stats); if (!(bfqq == bfqg->bfqd->in_service_queue)) bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); } void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { blkg_rwstat_add(&bfqg->stats.queued, opf, -1); } void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { blkg_rwstat_add(&bfqg->stats.merged, opf, 1); } void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf) { struct bfqg_stats *stats = &bfqg->stats; u64 now = blk_time_get_ns(); if (now > io_start_time_ns) blkg_rwstat_add(&stats->service_time, opf, now - io_start_time_ns); if (io_start_time_ns > start_time_ns) blkg_rwstat_add(&stats->wait_time, opf, io_start_time_ns - start_time_ns); } #else /* CONFIG_BFQ_CGROUP_DEBUG */ void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { } void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { } void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf) { } void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } #endif /* CONFIG_BFQ_CGROUP_DEBUG */ #ifdef CONFIG_BFQ_GROUP_IOSCHED /* * blk-cgroup policy-related handlers * The following functions help in converting between blk-cgroup * internal structures and BFQ-specific structures. */ static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) { return pd ? container_of(pd, struct bfq_group, pd) : NULL; } struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) { return pd_to_blkg(&bfqg->pd); } static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) { return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq)); } /* * bfq_group handlers * The following functions help in navigating the bfq_group hierarchy * by allowing to find the parent of a bfq_group or the bfq_group * associated to a bfq_queue. */ static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) { struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; return pblkg ? blkg_to_bfqg(pblkg) : NULL; } struct bfq_group *bfqq_group(struct bfq_queue *bfqq) { struct bfq_entity *group_entity = bfqq->entity.parent; return group_entity ? container_of(group_entity, struct bfq_group, entity) : bfqq->bfqd->root_group; } /* * The following two functions handle get and put of a bfq_group by * wrapping the related blk-cgroup hooks. */ static void bfqg_get(struct bfq_group *bfqg) { refcount_inc(&bfqg->ref); } static void bfqg_put(struct bfq_group *bfqg) { if (refcount_dec_and_test(&bfqg->ref)) kfree(bfqg); } static void bfqg_and_blkg_get(struct bfq_group *bfqg) { /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ bfqg_get(bfqg); blkg_get(bfqg_to_blkg(bfqg)); } void bfqg_and_blkg_put(struct bfq_group *bfqg) { blkg_put(bfqg_to_blkg(bfqg)); bfqg_put(bfqg); } void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq) { struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg); if (!bfqg) return; blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq)); blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1); } /* @stats = 0 */ static void bfqg_stats_reset(struct bfqg_stats *stats) { #ifdef CONFIG_BFQ_CGROUP_DEBUG /* queued stats shouldn't be cleared */ blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); blkg_rwstat_reset(&stats->wait_time); bfq_stat_reset(&stats->time); bfq_stat_reset(&stats->avg_queue_size_sum); bfq_stat_reset(&stats->avg_queue_size_samples); bfq_stat_reset(&stats->dequeue); bfq_stat_reset(&stats->group_wait_time); bfq_stat_reset(&stats->idle_time); bfq_stat_reset(&stats->empty_time); #endif } /* @to += @from */ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) { if (!to || !from) return; #ifdef CONFIG_BFQ_CGROUP_DEBUG /* queued stats shouldn't be cleared */ blkg_rwstat_add_aux(&to->merged, &from->merged); blkg_rwstat_add_aux(&to->service_time, &from->service_time); blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); bfq_stat_add_aux(&from->time, &from->time); bfq_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); bfq_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); bfq_stat_add_aux(&to->dequeue, &from->dequeue); bfq_stat_add_aux(&to->group_wait_time, &from->group_wait_time); bfq_stat_add_aux(&to->idle_time, &from->idle_time); bfq_stat_add_aux(&to->empty_time, &from->empty_time); #endif } /* * Transfer @bfqg's stats to its parent's aux counts so that the ancestors' * recursive stats can still account for the amount used by this bfqg after * it's gone. */ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) { struct bfq_group *parent; if (!bfqg) /* root_group */ return; parent = bfqg_parent(bfqg); lockdep_assert_held(&bfqg_to_blkg(bfqg)->q->queue_lock); if (unlikely(!parent)) return; bfqg_stats_add_aux(&parent->stats, &bfqg->stats); bfqg_stats_reset(&bfqg->stats); } void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); entity->weight = entity->new_weight; entity->orig_weight = entity->new_weight; if (bfqq) { bfqq->ioprio = bfqq->new_ioprio; bfqq->ioprio_class = bfqq->new_ioprio_class; /* * Make sure that bfqg and its associated blkg do not * disappear before entity. */ bfqg_and_blkg_get(bfqg); } entity->parent = bfqg->my_entity; /* NULL for root group */ entity->sched_data = &bfqg->sched_data; } static void bfqg_stats_exit(struct bfqg_stats *stats) { blkg_rwstat_exit(&stats->bytes); blkg_rwstat_exit(&stats->ios); #ifdef CONFIG_BFQ_CGROUP_DEBUG blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); blkg_rwstat_exit(&stats->wait_time); blkg_rwstat_exit(&stats->queued); bfq_stat_exit(&stats->time); bfq_stat_exit(&stats->avg_queue_size_sum); bfq_stat_exit(&stats->avg_queue_size_samples); bfq_stat_exit(&stats->dequeue); bfq_stat_exit(&stats->group_wait_time); bfq_stat_exit(&stats->idle_time); bfq_stat_exit(&stats->empty_time); #endif } static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { if (blkg_rwstat_init(&stats->bytes, gfp) || blkg_rwstat_init(&stats->ios, gfp)) goto error; #ifdef CONFIG_BFQ_CGROUP_DEBUG if (blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || blkg_rwstat_init(&stats->wait_time, gfp) || blkg_rwstat_init(&stats->queued, gfp) || bfq_stat_init(&stats->time, gfp) || bfq_stat_init(&stats->avg_queue_size_sum, gfp) || bfq_stat_init(&stats->avg_queue_size_samples, gfp) || bfq_stat_init(&stats->dequeue, gfp) || bfq_stat_init(&stats->group_wait_time, gfp) || bfq_stat_init(&stats->idle_time, gfp) || bfq_stat_init(&stats->empty_time, gfp)) goto error; #endif return 0; error: bfqg_stats_exit(stats); return -ENOMEM; } static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) { return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; } static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) { return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); } static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) { struct bfq_group_data *bgd; bgd = kzalloc(sizeof(*bgd), gfp); if (!bgd) return NULL; bgd->weight = CGROUP_WEIGHT_DFL; return &bgd->pd; } static void bfq_cpd_free(struct blkcg_policy_data *cpd) { kfree(cpd_to_bfqgd(cpd)); } static struct blkg_policy_data *bfq_pd_alloc(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { struct bfq_group *bfqg; bfqg = kzalloc_node(sizeof(*bfqg), gfp, disk->node_id); if (!bfqg) return NULL; if (bfqg_stats_init(&bfqg->stats, gfp)) { kfree(bfqg); return NULL; } /* see comments in bfq_bic_update_cgroup for why refcounting */ refcount_set(&bfqg->ref, 1); return &bfqg->pd; } static void bfq_pd_init(struct blkg_policy_data *pd) { struct blkcg_gq *blkg = pd_to_blkg(pd); struct bfq_group *bfqg = blkg_to_bfqg(blkg); struct bfq_data *bfqd = blkg->q->elevator->elevator_data; struct bfq_entity *entity = &bfqg->entity; struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); entity->orig_weight = entity->weight = entity->new_weight = d->weight; entity->my_sched_data = &bfqg->sched_data; entity->last_bfqq_created = NULL; bfqg->my_entity = entity; /* * the root_group's will be set to NULL * in bfq_init_queue() */ bfqg->bfqd = bfqd; bfqg->active_entities = 0; bfqg->num_queues_with_pending_reqs = 0; bfqg->rq_pos_tree = RB_ROOT; } static void bfq_pd_free(struct blkg_policy_data *pd) { struct bfq_group *bfqg = pd_to_bfqg(pd); bfqg_stats_exit(&bfqg->stats); bfqg_put(bfqg); } static void bfq_pd_reset_stats(struct blkg_policy_data *pd) { struct bfq_group *bfqg = pd_to_bfqg(pd); bfqg_stats_reset(&bfqg->stats); } static void bfq_group_set_parent(struct bfq_group *bfqg, struct bfq_group *parent) { struct bfq_entity *entity; entity = &bfqg->entity; entity->parent = parent->my_entity; entity->sched_data = &parent->sched_data; } static void bfq_link_bfqg(struct bfq_data *bfqd, struct bfq_group *bfqg) { struct bfq_group *parent; struct bfq_entity *entity; /* * Update chain of bfq_groups as we might be handling a leaf group * which, along with some of its relatives, has not been hooked yet * to the private hierarchy of BFQ. */ entity = &bfqg->entity; for_each_entity(entity) { struct bfq_group *curr_bfqg = container_of(entity, struct bfq_group, entity); if (curr_bfqg != bfqd->root_group) { parent = bfqg_parent(curr_bfqg); if (!parent) parent = bfqd->root_group; bfq_group_set_parent(curr_bfqg, parent); } } } struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) { struct blkcg_gq *blkg = bio->bi_blkg; struct bfq_group *bfqg; while (blkg) { if (!blkg->online) { blkg = blkg->parent; continue; } bfqg = blkg_to_bfqg(blkg); if (bfqg->pd.online) { bio_associate_blkg_from_css(bio, &blkg->blkcg->css); return bfqg; } blkg = blkg->parent; } bio_associate_blkg_from_css(bio, &bfqg_to_blkg(bfqd->root_group)->blkcg->css); return bfqd->root_group; } /** * bfq_bfqq_move - migrate @bfqq to @bfqg. * @bfqd: queue descriptor. * @bfqq: the queue to move. * @bfqg: the group to move to. * * Move @bfqq to @bfqg, deactivating it from its old group and reactivating * it on the new one. Avoid putting the entity on the old group idle tree. * * Must be called under the scheduler lock, to make sure that the blkg * owning @bfqg does not disappear (see comments in * bfq_bic_update_cgroup on guaranteeing the consistency of blkg * objects). */ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg) { struct bfq_entity *entity = &bfqq->entity; struct bfq_group *old_parent = bfqq_group(bfqq); bool has_pending_reqs = false; /* * No point to move bfqq to the same group, which can happen when * root group is offlined */ if (old_parent == bfqg) return; /* * oom_bfqq is not allowed to move, oom_bfqq will hold ref to root_group * until elevator exit. */ if (bfqq == &bfqd->oom_bfqq) return; /* * Get extra reference to prevent bfqq from being freed in * next possible expire or deactivate. */ bfqq->ref++; if (entity->in_groups_with_pending_reqs) { has_pending_reqs = true; bfq_del_bfqq_in_groups_with_pending_reqs(bfqq); } /* If bfqq is empty, then bfq_bfqq_expire also invokes * bfq_del_bfqq_busy, thereby removing bfqq and its entity * from data structures related to current group. Otherwise we * need to remove bfqq explicitly with bfq_deactivate_bfqq, as * we do below. */ if (bfqq == bfqd->in_service_queue) bfq_bfqq_expire(bfqd, bfqd->in_service_queue, false, BFQQE_PREEMPTED); if (bfq_bfqq_busy(bfqq)) bfq_deactivate_bfqq(bfqd, bfqq, false, false); else if (entity->on_st_or_in_serv) bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); bfqg_and_blkg_put(old_parent); bfq_reassign_last_bfqq(bfqq, NULL); entity->parent = bfqg->my_entity; entity->sched_data = &bfqg->sched_data; /* pin down bfqg and its associated blkg */ bfqg_and_blkg_get(bfqg); if (has_pending_reqs) bfq_add_bfqq_in_groups_with_pending_reqs(bfqq); if (bfq_bfqq_busy(bfqq)) { if (unlikely(!bfqd->nonrot_with_queueing)) bfq_pos_tree_add_move(bfqd, bfqq); bfq_activate_bfqq(bfqd, bfqq); } if (!bfqd->in_service_queue && !bfqd->tot_rq_in_driver) bfq_schedule_dispatch(bfqd); /* release extra ref taken above, bfqq may happen to be freed now */ bfq_put_queue(bfqq); } static void bfq_sync_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *sync_bfqq, struct bfq_io_cq *bic, struct bfq_group *bfqg, unsigned int act_idx) { struct bfq_queue *bfqq; if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { /* We are the only user of this bfqq, just move it */ if (sync_bfqq->entity.sched_data != &bfqg->sched_data) bfq_bfqq_move(bfqd, sync_bfqq, bfqg); return; } /* * The queue was merged to a different queue. Check * that the merge chain still belongs to the same * cgroup. */ for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) if (bfqq->entity.sched_data != &bfqg->sched_data) break; if (bfqq) { /* * Some queue changed cgroup so the merge is not valid * anymore. We cannot easily just cancel the merge (by * clearing new_bfqq) as there may be other processes * using this queue and holding refs to all queues * below sync_bfqq->new_bfqq. Similarly if the merge * already happened, we need to detach from bfqq now * so that we cannot merge bio to a request from the * old cgroup. */ bfq_put_cooperator(sync_bfqq); bic_set_bfqq(bic, NULL, true, act_idx); bfq_release_process_ref(bfqd, sync_bfqq); } } /** * __bfq_bic_change_cgroup - move @bic to @bfqg. * @bfqd: the queue descriptor. * @bic: the bic to move. * @bfqg: the group to move to. * * Move bic to blkcg, assuming that bfqd->lock is held; which makes * sure that the reference to cgroup is valid across the call (see * comments in bfq_bic_update_cgroup on this issue) */ static void __bfq_bic_change_cgroup(struct bfq_data *bfqd, struct bfq_io_cq *bic, struct bfq_group *bfqg) { unsigned int act_idx; for (act_idx = 0; act_idx < bfqd->num_actuators; act_idx++) { struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false, act_idx); struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true, act_idx); if (async_bfqq && async_bfqq->entity.sched_data != &bfqg->sched_data) { bic_set_bfqq(bic, NULL, false, act_idx); bfq_release_process_ref(bfqd, async_bfqq); } if (sync_bfqq) bfq_sync_bfqq_move(bfqd, sync_bfqq, bic, bfqg, act_idx); } } void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) { struct bfq_data *bfqd = bic_to_bfqd(bic); struct bfq_group *bfqg = bfq_bio_bfqg(bfqd, bio); uint64_t serial_nr; serial_nr = bfqg_to_blkg(bfqg)->blkcg->css.serial_nr; /* * Check whether blkcg has changed. The condition may trigger * spuriously on a newly created cic but there's no harm. */ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) return; /* * New cgroup for this process. Make sure it is linked to bfq internal * cgroup hierarchy. */ bfq_link_bfqg(bfqd, bfqg); __bfq_bic_change_cgroup(bfqd, bic, bfqg); bic->blkcg_serial_nr = serial_nr; } /** * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. * @st: the service tree being flushed. */ static void bfq_flush_idle_tree(struct bfq_service_tree *st) { struct bfq_entity *entity = st->first_idle; for (; entity ; entity = st->first_idle) __bfq_deactivate_entity(entity, false); } /** * bfq_reparent_leaf_entity - move leaf entity to the root_group. * @bfqd: the device data structure with the root group. * @entity: the entity to move, if entity is a leaf; or the parent entity * of an active leaf entity to move, if entity is not a leaf. * @ioprio_class: I/O priority class to reparent. */ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, struct bfq_entity *entity, int ioprio_class) { struct bfq_queue *bfqq; struct bfq_entity *child_entity = entity; while (child_entity->my_sched_data) { /* leaf not reached yet */ struct bfq_sched_data *child_sd = child_entity->my_sched_data; struct bfq_service_tree *child_st = child_sd->service_tree + ioprio_class; struct rb_root *child_active = &child_st->active; child_entity = bfq_entity_of(rb_first(child_active)); if (!child_entity) child_entity = child_sd->in_service_entity; } bfqq = bfq_entity_to_bfqq(child_entity); bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); } /** * bfq_reparent_active_queues - move to the root group all active queues. * @bfqd: the device data structure with the root group. * @bfqg: the group to move from. * @st: the service tree to start the search from. * @ioprio_class: I/O priority class to reparent. */ static void bfq_reparent_active_queues(struct bfq_data *bfqd, struct bfq_group *bfqg, struct bfq_service_tree *st, int ioprio_class) { struct rb_root *active = &st->active; struct bfq_entity *entity; while ((entity = bfq_entity_of(rb_first(active)))) bfq_reparent_leaf_entity(bfqd, entity, ioprio_class); if (bfqg->sched_data.in_service_entity) bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.in_service_entity, ioprio_class); } /** * bfq_pd_offline - deactivate the entity associated with @pd, * and reparent its children entities. * @pd: descriptor of the policy going offline. * * blkio already grabs the queue_lock for us, so no need to use * RCU-based magic */ static void bfq_pd_offline(struct blkg_policy_data *pd) { struct bfq_service_tree *st; struct bfq_group *bfqg = pd_to_bfqg(pd); struct bfq_data *bfqd = bfqg->bfqd; struct bfq_entity *entity = bfqg->my_entity; unsigned long flags; int i; spin_lock_irqsave(&bfqd->lock, flags); if (!entity) /* root group */ goto put_async_queues; /* * Empty all service_trees belonging to this group before * deactivating the group itself. */ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { st = bfqg->sched_data.service_tree + i; /* * It may happen that some queues are still active * (busy) upon group destruction (if the corresponding * processes have been forced to terminate). We move * all the leaf entities corresponding to these queues * to the root_group. * Also, it may happen that the group has an entity * in service, which is disconnected from the active * tree: it must be moved, too. * There is no need to put the sync queues, as the * scheduler has taken no reference. */ bfq_reparent_active_queues(bfqd, bfqg, st, i); /* * The idle tree may still contain bfq_queues * belonging to exited task because they never * migrated to a different cgroup from the one being * destroyed now. In addition, even * bfq_reparent_active_queues() may happen to add some * entities to the idle tree. It happens if, in some * of the calls to bfq_bfqq_move() performed by * bfq_reparent_active_queues(), the queue to move is * empty and gets expired. */ bfq_flush_idle_tree(st); } __bfq_deactivate_entity(entity, false); put_async_queues: bfq_put_async_queues(bfqd, bfqg); spin_unlock_irqrestore(&bfqd->lock, flags); /* * @blkg is going offline and will be ignored by * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so * that they don't get lost. If IOs complete after this point, the * stats for them will be lost. Oh well... */ bfqg_stats_xfer_dead(bfqg); } void bfq_end_wr_async(struct bfq_data *bfqd) { struct blkcg_gq *blkg; list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { struct bfq_group *bfqg = blkg_to_bfqg(blkg); bfq_end_wr_async_queues(bfqd, bfqg); } bfq_end_wr_async_queues(bfqd, bfqd->root_group); } static int bfq_io_show_weight_legacy(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); unsigned int val = 0; if (bfqgd) val = bfqgd->weight; seq_printf(sf, "%u\n", val); return 0; } static u64 bfqg_prfill_weight_device(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct bfq_group *bfqg = pd_to_bfqg(pd); if (!bfqg->entity.dev_weight) return 0; return __blkg_prfill_u64(sf, pd, bfqg->entity.dev_weight); } static int bfq_io_show_weight(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); seq_printf(sf, "default %u\n", bfqgd->weight); blkcg_print_blkgs(sf, blkcg, bfqg_prfill_weight_device, &blkcg_policy_bfq, 0, false); return 0; } static void bfq_group_set_weight(struct bfq_group *bfqg, u64 weight, u64 dev_weight) { weight = dev_weight ?: weight; bfqg->entity.dev_weight = dev_weight; /* * Setting the prio_changed flag of the entity * to 1 with new_weight == weight would re-set * the value of the weight to its ioprio mapping. * Set the flag only if necessary. */ if ((unsigned short)weight != bfqg->entity.new_weight) { bfqg->entity.new_weight = (unsigned short)weight; /* * Make sure that the above new value has been * stored in bfqg->entity.new_weight before * setting the prio_changed flag. In fact, * this flag may be read asynchronously (in * critical sections protected by a different * lock than that held here), and finding this * flag set may cause the execution of the code * for updating parameters whose value may * depend also on bfqg->entity.new_weight (in * __bfq_entity_update_weight_prio). * This barrier makes sure that the new value * of bfqg->entity.new_weight is correctly * seen in that code. */ smp_wmb(); bfqg->entity.prio_changed = 1; } } static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, struct cftype *cftype, u64 val) { struct blkcg *blkcg = css_to_blkcg(css); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); struct blkcg_gq *blkg; int ret = -ERANGE; if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) return ret; ret = 0; spin_lock_irq(&blkcg->lock); bfqgd->weight = (unsigned short)val; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct bfq_group *bfqg = blkg_to_bfqg(blkg); if (bfqg) bfq_group_set_weight(bfqg, val, 0); } spin_unlock_irq(&blkcg->lock); return ret; } static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { int ret; struct blkg_conf_ctx ctx; struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct bfq_group *bfqg; u64 v; blkg_conf_init(&ctx, buf); ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, &ctx); if (ret) goto out; if (sscanf(ctx.body, "%llu", &v) == 1) { /* require "default" on dfl */ ret = -ERANGE; if (!v) goto out; } else if (!strcmp(strim(ctx.body), "default")) { v = 0; } else { ret = -EINVAL; goto out; } bfqg = blkg_to_bfqg(ctx.blkg); ret = -ERANGE; if (!v || (v >= BFQ_MIN_WEIGHT && v <= BFQ_MAX_WEIGHT)) { bfq_group_set_weight(bfqg, bfqg->entity.weight, v); ret = 0; } out: blkg_conf_exit(&ctx); return ret ?: nbytes; } static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { char *endp; int ret; u64 v; buf = strim(buf); /* "WEIGHT" or "default WEIGHT" sets the default weight */ v = simple_strtoull(buf, &endp, 0); if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { ret = bfq_io_set_weight_legacy(of_css(of), NULL, v); return ret ?: nbytes; } return bfq_io_set_device_weight(of, buf, nbytes, off); } static int bfqg_print_rwstat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, &blkcg_policy_bfq, seq_cft(sf)->private, true); return 0; } static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct blkg_rwstat_sample sum; blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum); return __blkg_prfill_rwstat(sf, pd, &sum); } static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, seq_cft(sf)->private, true); return 0; } #ifdef CONFIG_BFQ_CGROUP_DEBUG static int bfqg_print_stat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, &blkcg_policy_bfq, seq_cft(sf)->private, false); return 0; } static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct blkcg_gq *blkg = pd_to_blkg(pd); struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; u64 sum = 0; lockdep_assert_held(&blkg->q->queue_lock); rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { struct bfq_stat *stat; if (!pos_blkg->online) continue; stat = (void *)blkg_to_pd(pos_blkg, &blkcg_policy_bfq) + off; sum += bfq_stat_read(stat) + atomic64_read(&stat->aux_cnt); } rcu_read_unlock(); return __blkg_prfill_u64(sf, pd, sum); } static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_stat_recursive, &blkcg_policy_bfq, seq_cft(sf)->private, false); return 0; } static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct bfq_group *bfqg = blkg_to_bfqg(pd->blkg); u64 sum = blkg_rwstat_total(&bfqg->stats.bytes); return __blkg_prfill_u64(sf, pd, sum >> 9); } static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); return 0; } static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct blkg_rwstat_sample tmp; blkg_rwstat_recursive_sum(pd->blkg, &blkcg_policy_bfq, offsetof(struct bfq_group, stats.bytes), &tmp); return __blkg_prfill_u64(sf, pd, (tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9); } static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, false); return 0; } static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct bfq_group *bfqg = pd_to_bfqg(pd); u64 samples = bfq_stat_read(&bfqg->stats.avg_queue_size_samples); u64 v = 0; if (samples) { v = bfq_stat_read(&bfqg->stats.avg_queue_size_sum); v = div64_u64(v, samples); } __blkg_prfill_u64(sf, pd, v); return 0; } /* print avg_queue_size */ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, 0, false); return 0; } #endif /* CONFIG_BFQ_CGROUP_DEBUG */ struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) { int ret; ret = blkcg_activate_policy(bfqd->queue->disk, &blkcg_policy_bfq); if (ret) return NULL; return blkg_to_bfqg(bfqd->queue->root_blkg); } struct blkcg_policy blkcg_policy_bfq = { .dfl_cftypes = bfq_blkg_files, .legacy_cftypes = bfq_blkcg_legacy_files, .cpd_alloc_fn = bfq_cpd_alloc, .cpd_free_fn = bfq_cpd_free, .pd_alloc_fn = bfq_pd_alloc, .pd_init_fn = bfq_pd_init, .pd_offline_fn = bfq_pd_offline, .pd_free_fn = bfq_pd_free, .pd_reset_stats_fn = bfq_pd_reset_stats, }; struct cftype bfq_blkcg_legacy_files[] = { { .name = "bfq.weight", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = bfq_io_show_weight_legacy, .write_u64 = bfq_io_set_weight_legacy, }, { .name = "bfq.weight_device", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = bfq_io_show_weight, .write = bfq_io_set_weight, }, /* statistics, covers only the tasks in the bfqg */ { .name = "bfq.io_service_bytes", .private = offsetof(struct bfq_group, stats.bytes), .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_serviced", .private = offsetof(struct bfq_group, stats.ios), .seq_show = bfqg_print_rwstat, }, #ifdef CONFIG_BFQ_CGROUP_DEBUG { .name = "bfq.time", .private = offsetof(struct bfq_group, stats.time), .seq_show = bfqg_print_stat, }, { .name = "bfq.sectors", .seq_show = bfqg_print_stat_sectors, }, { .name = "bfq.io_service_time", .private = offsetof(struct bfq_group, stats.service_time), .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_wait_time", .private = offsetof(struct bfq_group, stats.wait_time), .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_merged", .private = offsetof(struct bfq_group, stats.merged), .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_queued", .private = offsetof(struct bfq_group, stats.queued), .seq_show = bfqg_print_rwstat, }, #endif /* CONFIG_BFQ_CGROUP_DEBUG */ /* the same statistics which cover the bfqg and its descendants */ { .name = "bfq.io_service_bytes_recursive", .private = offsetof(struct bfq_group, stats.bytes), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_serviced_recursive", .private = offsetof(struct bfq_group, stats.ios), .seq_show = bfqg_print_rwstat_recursive, }, #ifdef CONFIG_BFQ_CGROUP_DEBUG { .name = "bfq.time_recursive", .private = offsetof(struct bfq_group, stats.time), .seq_show = bfqg_print_stat_recursive, }, { .name = "bfq.sectors_recursive", .seq_show = bfqg_print_stat_sectors_recursive, }, { .name = "bfq.io_service_time_recursive", .private = offsetof(struct bfq_group, stats.service_time), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_wait_time_recursive", .private = offsetof(struct bfq_group, stats.wait_time), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_merged_recursive", .private = offsetof(struct bfq_group, stats.merged), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_queued_recursive", .private = offsetof(struct bfq_group, stats.queued), .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.avg_queue_size", .seq_show = bfqg_print_avg_queue_size, }, { .name = "bfq.group_wait_time", .private = offsetof(struct bfq_group, stats.group_wait_time), .seq_show = bfqg_print_stat, }, { .name = "bfq.idle_time", .private = offsetof(struct bfq_group, stats.idle_time), .seq_show = bfqg_print_stat, }, { .name = "bfq.empty_time", .private = offsetof(struct bfq_group, stats.empty_time), .seq_show = bfqg_print_stat, }, { .name = "bfq.dequeue", .private = offsetof(struct bfq_group, stats.dequeue), .seq_show = bfqg_print_stat, }, #endif /* CONFIG_BFQ_CGROUP_DEBUG */ { } /* terminate */ }; struct cftype bfq_blkg_files[] = { { .name = "bfq.weight", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = bfq_io_show_weight, .write = bfq_io_set_weight, }, {} /* terminate */ }; #else /* CONFIG_BFQ_GROUP_IOSCHED */ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg) {} void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); entity->weight = entity->new_weight; entity->orig_weight = entity->new_weight; if (bfqq) { bfqq->ioprio = bfqq->new_ioprio; bfqq->ioprio_class = bfqq->new_ioprio_class; } entity->sched_data = &bfqg->sched_data; } void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} void bfq_end_wr_async(struct bfq_data *bfqd) { bfq_end_wr_async_queues(bfqd, bfqd->root_group); } struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) { return bfqd->root_group; } struct bfq_group *bfqq_group(struct bfq_queue *bfqq) { return bfqq->bfqd->root_group; } void bfqg_and_blkg_put(struct bfq_group *bfqg) {} struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) { struct bfq_group *bfqg; int i; bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); if (!bfqg) return NULL; for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; return bfqg; } #endif /* CONFIG_BFQ_GROUP_IOSCHED */
4 1 1 1 1 1 1 1 1 82 1 3 3 1 1 1 1 44 44 44 44 44 44 44 25 25 23 23 23 23 23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/sch_drr.c Deficit Round Robin scheduler * * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/pkt_sched.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> struct drr_class { struct Qdisc_class_common common; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct list_head alist; struct Qdisc *qdisc; u32 quantum; u32 deficit; }; struct drr_sched { struct list_head active; struct tcf_proto __rcu *filter_list; struct tcf_block *block; struct Qdisc_class_hash clhash; }; static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid) { struct drr_sched *q = qdisc_priv(sch); struct Qdisc_class_common *clc; clc = qdisc_class_find(&q->clhash, classid); if (clc == NULL) return NULL; return container_of(clc, struct drr_class, common); } static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = { [TCA_DRR_QUANTUM] = { .type = NLA_U32 }, }; static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg, struct netlink_ext_ack *extack) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl = (struct drr_class *)*arg; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_DRR_MAX + 1]; u32 quantum; int err; if (!opt) { NL_SET_ERR_MSG(extack, "DRR options are required for this operation"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_DRR_MAX, opt, drr_policy, extack); if (err < 0) return err; if (tb[TCA_DRR_QUANTUM]) { quantum = nla_get_u32(tb[TCA_DRR_QUANTUM]); if (quantum == 0) { NL_SET_ERR_MSG(extack, "Specified DRR quantum cannot be zero"); return -EINVAL; } } else quantum = psched_mtu(qdisc_dev(sch)); if (cl != NULL) { if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace estimator"); return err; } } sch_tree_lock(sch); if (tb[TCA_DRR_QUANTUM]) cl->quantum = quantum; sch_tree_unlock(sch); return 0; } cl = kzalloc(sizeof(struct drr_class), GFP_KERNEL); if (cl == NULL) return -ENOBUFS; gnet_stats_basic_sync_init(&cl->bstats); cl->common.classid = classid; cl->quantum = quantum; cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid, NULL); if (cl->qdisc == NULL) cl->qdisc = &noop_qdisc; else qdisc_hash_add(cl->qdisc, true); if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace estimator"); qdisc_put(cl->qdisc); kfree(cl); return err; } } sch_tree_lock(sch); qdisc_class_hash_insert(&q->clhash, &cl->common); sch_tree_unlock(sch); qdisc_class_hash_grow(sch, &q->clhash); *arg = (unsigned long)cl; return 0; } static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl) { gen_kill_estimator(&cl->rate_est); qdisc_put(cl->qdisc); kfree(cl); } static int drr_delete_class(struct Qdisc *sch, unsigned long arg, struct netlink_ext_ack *extack) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl = (struct drr_class *)arg; if (qdisc_class_in_use(&cl->common)) { NL_SET_ERR_MSG(extack, "DRR class is in use"); return -EBUSY; } sch_tree_lock(sch); qdisc_purge_queue(cl->qdisc); qdisc_class_hash_remove(&q->clhash, &cl->common); sch_tree_unlock(sch); drr_destroy_class(sch, cl); return 0; } static unsigned long drr_search_class(struct Qdisc *sch, u32 classid) { return (unsigned long)drr_find_class(sch, classid); } static struct tcf_block *drr_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct drr_sched *q = qdisc_priv(sch); if (cl) { NL_SET_ERR_MSG(extack, "DRR classid must be zero"); return NULL; } return q->block; } static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid) { struct drr_class *cl = drr_find_class(sch, classid); if (cl) qdisc_class_get(&cl->common); return (unsigned long)cl; } static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg) { struct drr_class *cl = (struct drr_class *)arg; qdisc_class_put(&cl->common); } static int drr_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct drr_class *cl = (struct drr_class *)arg; if (new == NULL) { new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, cl->common.classid, NULL); if (new == NULL) new = &noop_qdisc; } *old = qdisc_replace(sch, new, &cl->qdisc); return 0; } static struct Qdisc *drr_class_leaf(struct Qdisc *sch, unsigned long arg) { struct drr_class *cl = (struct drr_class *)arg; return cl->qdisc; } static void drr_qlen_notify(struct Qdisc *csh, unsigned long arg) { struct drr_class *cl = (struct drr_class *)arg; list_del(&cl->alist); } static int drr_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { struct drr_class *cl = (struct drr_class *)arg; struct nlattr *nest; tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle = cl->common.classid; tcm->tcm_info = cl->qdisc->handle; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, TCA_DRR_QUANTUM, cl->quantum)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct drr_class *cl = (struct drr_class *)arg; __u32 qlen = qdisc_qlen_sum(cl->qdisc); struct Qdisc *cl_q = cl->qdisc; struct tc_drr_stats xstats; memset(&xstats, 0, sizeof(xstats)); if (qlen) xstats.deficit = cl->deficit; if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, cl_q->cpu_qstats, &cl_q->qstats, qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); } static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; unsigned int i; if (arg->stop) return; for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg)) return; } } } static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; struct tcf_result res; struct tcf_proto *fl; int result; if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) { cl = drr_find_class(sch, skb->priority); if (cl != NULL) return cl; } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return NULL; } #endif cl = (struct drr_class *)res.class; if (cl == NULL) cl = drr_find_class(sch, res.classid); return cl; } return NULL; } static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb); struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; int err = 0; bool first; cl = drr_classify(skb, sch, &err); if (cl == NULL) { if (err & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return err; } first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; qdisc_qstats_drop(sch); } return err; } if (first) { list_add_tail(&cl->alist, &q->active); cl->deficit = cl->quantum; } sch->qstats.backlog += len; sch->q.qlen++; return err; } static struct sk_buff *drr_dequeue(struct Qdisc *sch) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; struct sk_buff *skb; unsigned int len; if (list_empty(&q->active)) goto out; while (1) { cl = list_first_entry(&q->active, struct drr_class, alist); skb = cl->qdisc->ops->peek(cl->qdisc); if (skb == NULL) { qdisc_warn_nonwc(__func__, cl->qdisc); goto out; } len = qdisc_pkt_len(skb); if (len <= cl->deficit) { cl->deficit -= len; skb = qdisc_dequeue_peeked(cl->qdisc); if (unlikely(skb == NULL)) goto out; if (cl->qdisc->q.qlen == 0) list_del(&cl->alist); bstats_update(&cl->bstats, skb); qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } cl->deficit += cl->quantum; list_move_tail(&cl->alist, &q->active); } out: return NULL; } static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct drr_sched *q = qdisc_priv(sch); int err; err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; err = qdisc_class_hash_init(&q->clhash); if (err < 0) return err; INIT_LIST_HEAD(&q->active); return 0; } static void drr_reset_qdisc(struct Qdisc *sch) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->qdisc->q.qlen) list_del(&cl->alist); qdisc_reset(cl->qdisc); } } } static void drr_destroy_qdisc(struct Qdisc *sch) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; struct hlist_node *next; unsigned int i; tcf_block_put(q->block); for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], common.hnode) drr_destroy_class(sch, cl); } qdisc_class_hash_destroy(&q->clhash); } static const struct Qdisc_class_ops drr_class_ops = { .change = drr_change_class, .delete = drr_delete_class, .find = drr_search_class, .tcf_block = drr_tcf_block, .bind_tcf = drr_bind_tcf, .unbind_tcf = drr_unbind_tcf, .graft = drr_graft_class, .leaf = drr_class_leaf, .qlen_notify = drr_qlen_notify, .dump = drr_dump_class, .dump_stats = drr_dump_class_stats, .walk = drr_walk, }; static struct Qdisc_ops drr_qdisc_ops __read_mostly = { .cl_ops = &drr_class_ops, .id = "drr", .priv_size = sizeof(struct drr_sched), .enqueue = drr_enqueue, .dequeue = drr_dequeue, .peek = qdisc_peek_dequeued, .init = drr_init_qdisc, .reset = drr_reset_qdisc, .destroy = drr_destroy_qdisc, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("drr"); static int __init drr_init(void) { return register_qdisc(&drr_qdisc_ops); } static void __exit drr_exit(void) { unregister_qdisc(&drr_qdisc_ops); } module_init(drr_init); module_exit(drr_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Deficit Round Robin scheduler");
105 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Cryptographic API for algorithms (i.e., low-level API). * * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_ALGAPI_H #define _CRYPTO_ALGAPI_H #include <crypto/utils.h> #include <linux/align.h> #include <linux/cache.h> #include <linux/crypto.h> #include <linux/types.h> #include <linux/workqueue.h> /* * Maximum values for blocksize and alignmask, used to allocate * static buffers that are big enough for any combination of * algs and architectures. Ciphers have a lower maximum size. */ #define MAX_ALGAPI_BLOCKSIZE 160 #define MAX_ALGAPI_ALIGNMASK 127 #define MAX_CIPHER_BLOCKSIZE 16 #define MAX_CIPHER_ALIGNMASK 15 #ifdef ARCH_DMA_MINALIGN #define CRYPTO_DMA_ALIGN ARCH_DMA_MINALIGN #else #define CRYPTO_DMA_ALIGN CRYPTO_MINALIGN #endif #define CRYPTO_DMA_PADDING ((CRYPTO_DMA_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1)) /* * Autoloaded crypto modules should only use a prefixed name to avoid allowing * arbitrary modules to be loaded. Loading from userspace may still need the * unprefixed names, so retains those aliases as well. * This uses __MODULE_INFO directly instead of MODULE_ALIAS because pre-4.3 * gcc (e.g. avr32 toolchain) uses __LINE__ for uniqueness, and this macro * expands twice on the same line. Instead, use a separate base name for the * alias. */ #define MODULE_ALIAS_CRYPTO(name) \ __MODULE_INFO(alias, alias_userspace, name); \ __MODULE_INFO(alias, alias_crypto, "crypto-" name) struct crypto_aead; struct crypto_instance; struct module; struct notifier_block; struct rtattr; struct scatterlist; struct seq_file; struct sk_buff; struct crypto_type { unsigned int (*ctxsize)(struct crypto_alg *alg, u32 type, u32 mask); unsigned int (*extsize)(struct crypto_alg *alg); int (*init_tfm)(struct crypto_tfm *tfm); void (*show)(struct seq_file *m, struct crypto_alg *alg); int (*report)(struct sk_buff *skb, struct crypto_alg *alg); void (*free)(struct crypto_instance *inst); unsigned int type; unsigned int maskclear; unsigned int maskset; unsigned int tfmsize; }; struct crypto_instance { struct crypto_alg alg; struct crypto_template *tmpl; union { /* Node in list of instances after registration. */ struct hlist_node list; /* List of attached spawns before registration. */ struct crypto_spawn *spawns; }; struct work_struct free_work; void *__ctx[] CRYPTO_MINALIGN_ATTR; }; struct crypto_template { struct list_head list; struct hlist_head instances; struct module *module; int (*create)(struct crypto_template *tmpl, struct rtattr **tb); char name[CRYPTO_MAX_ALG_NAME]; }; struct crypto_spawn { struct list_head list; struct crypto_alg *alg; union { /* Back pointer to instance after registration.*/ struct crypto_instance *inst; /* Spawn list pointer prior to registration. */ struct crypto_spawn *next; }; const struct crypto_type *frontend; u32 mask; bool dead; bool registered; }; struct crypto_queue { struct list_head list; struct list_head *backlog; unsigned int qlen; unsigned int max_qlen; }; struct scatter_walk { struct scatterlist *sg; unsigned int offset; }; struct crypto_attr_alg { char name[CRYPTO_MAX_ALG_NAME]; }; struct crypto_attr_type { u32 type; u32 mask; }; /* * Algorithm registration interface. */ int crypto_register_alg(struct crypto_alg *alg); void crypto_unregister_alg(struct crypto_alg *alg); int crypto_register_algs(struct crypto_alg *algs, int count); void crypto_unregister_algs(struct crypto_alg *algs, int count); void crypto_mod_put(struct crypto_alg *alg); int crypto_register_template(struct crypto_template *tmpl); int crypto_register_templates(struct crypto_template *tmpls, int count); void crypto_unregister_template(struct crypto_template *tmpl); void crypto_unregister_templates(struct crypto_template *tmpls, int count); struct crypto_template *crypto_lookup_template(const char *name); int crypto_register_instance(struct crypto_template *tmpl, struct crypto_instance *inst); void crypto_unregister_instance(struct crypto_instance *inst); int crypto_grab_spawn(struct crypto_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask); void crypto_drop_spawn(struct crypto_spawn *spawn); struct crypto_tfm *crypto_spawn_tfm(struct crypto_spawn *spawn, u32 type, u32 mask); void *crypto_spawn_tfm2(struct crypto_spawn *spawn); struct crypto_attr_type *crypto_get_attr_type(struct rtattr **tb); int crypto_check_attr_type(struct rtattr **tb, u32 type, u32 *mask_ret); const char *crypto_attr_alg_name(struct rtattr *rta); int crypto_inst_setname(struct crypto_instance *inst, const char *name, struct crypto_alg *alg); void crypto_init_queue(struct crypto_queue *queue, unsigned int max_qlen); int crypto_enqueue_request(struct crypto_queue *queue, struct crypto_async_request *request); void crypto_enqueue_request_head(struct crypto_queue *queue, struct crypto_async_request *request); struct crypto_async_request *crypto_dequeue_request(struct crypto_queue *queue); static inline unsigned int crypto_queue_len(struct crypto_queue *queue) { return queue->qlen; } void crypto_inc(u8 *a, unsigned int size); static inline void *crypto_tfm_ctx(struct crypto_tfm *tfm) { return tfm->__crt_ctx; } static inline void *crypto_tfm_ctx_align(struct crypto_tfm *tfm, unsigned int align) { if (align <= crypto_tfm_ctx_alignment()) align = 1; return PTR_ALIGN(crypto_tfm_ctx(tfm), align); } static inline unsigned int crypto_dma_align(void) { return CRYPTO_DMA_ALIGN; } static inline unsigned int crypto_dma_padding(void) { return (crypto_dma_align() - 1) & ~(crypto_tfm_ctx_alignment() - 1); } static inline void *crypto_tfm_ctx_dma(struct crypto_tfm *tfm) { return crypto_tfm_ctx_align(tfm, crypto_dma_align()); } static inline struct crypto_instance *crypto_tfm_alg_instance( struct crypto_tfm *tfm) { return container_of(tfm->__crt_alg, struct crypto_instance, alg); } static inline void *crypto_instance_ctx(struct crypto_instance *inst) { return inst->__ctx; } static inline struct crypto_async_request *crypto_get_backlog( struct crypto_queue *queue) { return queue->backlog == &queue->list ? NULL : container_of(queue->backlog, struct crypto_async_request, list); } static inline u32 crypto_requires_off(struct crypto_attr_type *algt, u32 off) { return (algt->type ^ off) & algt->mask & off; } /* * When an algorithm uses another algorithm (e.g., if it's an instance of a * template), these are the flags that should always be set on the "outer" * algorithm if any "inner" algorithm has them set. */ #define CRYPTO_ALG_INHERITED_FLAGS \ (CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK | \ CRYPTO_ALG_ALLOCATES_MEMORY) /* * Given the type and mask that specify the flags restrictions on a template * instance being created, return the mask that should be passed to * crypto_grab_*() (along with type=0) to honor any request the user made to * have any of the CRYPTO_ALG_INHERITED_FLAGS clear. */ static inline u32 crypto_algt_inherited_mask(struct crypto_attr_type *algt) { return crypto_requires_off(algt, CRYPTO_ALG_INHERITED_FLAGS); } int crypto_register_notifier(struct notifier_block *nb); int crypto_unregister_notifier(struct notifier_block *nb); /* Crypto notification events. */ enum { CRYPTO_MSG_ALG_REQUEST, CRYPTO_MSG_ALG_REGISTER, CRYPTO_MSG_ALG_LOADED, }; static inline void crypto_request_complete(struct crypto_async_request *req, int err) { req->complete(req->data, err); } static inline u32 crypto_tfm_alg_type(struct crypto_tfm *tfm) { return tfm->__crt_alg->cra_flags & CRYPTO_ALG_TYPE_MASK; } #endif /* _CRYPTO_ALGAPI_H */
15306 5 184 15306 5 184 4323 15306 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _X86_IRQFLAGS_H_ #define _X86_IRQFLAGS_H_ #include <asm/processor-flags.h> #ifndef __ASSEMBLY__ #include <asm/nospec-branch.h> /* * Interrupt control: */ /* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */ extern inline unsigned long native_save_fl(void); extern __always_inline unsigned long native_save_fl(void) { unsigned long flags; /* * "=rm" is safe here, because "pop" adjusts the stack before * it evaluates its effective address -- this is part of the * documented behavior of the "pop" instruction. */ asm volatile("# __raw_save_flags\n\t" "pushf ; pop %0" : "=rm" (flags) : /* no input */ : "memory"); return flags; } static __always_inline void native_irq_disable(void) { asm volatile("cli": : :"memory"); } static __always_inline void native_irq_enable(void) { asm volatile("sti": : :"memory"); } static __always_inline void native_safe_halt(void) { mds_idle_clear_cpu_buffers(); asm volatile("sti; hlt": : :"memory"); } static __always_inline void native_halt(void) { mds_idle_clear_cpu_buffers(); asm volatile("hlt": : :"memory"); } static __always_inline int native_irqs_disabled_flags(unsigned long flags) { return !(flags & X86_EFLAGS_IF); } static __always_inline unsigned long native_local_irq_save(void) { unsigned long flags = native_save_fl(); native_irq_disable(); return flags; } static __always_inline void native_local_irq_restore(unsigned long flags) { if (!native_irqs_disabled_flags(flags)) native_irq_enable(); } #endif #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else #ifndef __ASSEMBLY__ #include <linux/types.h> static __always_inline unsigned long arch_local_save_flags(void) { return native_save_fl(); } static __always_inline void arch_local_irq_disable(void) { native_irq_disable(); } static __always_inline void arch_local_irq_enable(void) { native_irq_enable(); } /* * Used in the idle loop; sti takes one instruction cycle * to complete: */ static __always_inline void arch_safe_halt(void) { native_safe_halt(); } /* * Used when interrupts are already enabled or to * shutdown the processor: */ static __always_inline void halt(void) { native_halt(); } /* * For spinlocks, etc: */ static __always_inline unsigned long arch_local_irq_save(void) { unsigned long flags = arch_local_save_flags(); arch_local_irq_disable(); return flags; } #else #ifdef CONFIG_X86_64 #ifdef CONFIG_DEBUG_ENTRY #define SAVE_FLAGS pushfq; popq %rax #endif #endif #endif /* __ASSEMBLY__ */ #endif /* CONFIG_PARAVIRT_XXL */ #ifndef __ASSEMBLY__ static __always_inline int arch_irqs_disabled_flags(unsigned long flags) { return !(flags & X86_EFLAGS_IF); } static __always_inline int arch_irqs_disabled(void) { unsigned long flags = arch_local_save_flags(); return arch_irqs_disabled_flags(flags); } static __always_inline void arch_local_irq_restore(unsigned long flags) { if (!arch_irqs_disabled_flags(flags)) arch_local_irq_enable(); } #endif /* !__ASSEMBLY__ */ #endif
12 12 12 12 12 12 12 1 12 12 12 12 12 12 1 11 11 12 1 12 12 1 12 12 32 32 32 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2016 Qualcomm Atheros, Inc * * Based on net/sched/sch_fq_codel.c */ #ifndef __NET_SCHED_FQ_IMPL_H #define __NET_SCHED_FQ_IMPL_H #include <net/fq.h> /* functions that are embedded into includer */ static void __fq_adjust_removal(struct fq *fq, struct fq_flow *flow, unsigned int packets, unsigned int bytes, unsigned int truesize) { struct fq_tin *tin = flow->tin; int idx; tin->backlog_bytes -= bytes; tin->backlog_packets -= packets; flow->backlog -= bytes; fq->backlog -= packets; fq->memory_usage -= truesize; if (flow->backlog) return; if (flow == &tin->default_flow) { list_del_init(&tin->tin_list); return; } idx = flow - fq->flows; __clear_bit(idx, fq->flows_bitmap); } static void fq_adjust_removal(struct fq *fq, struct fq_flow *flow, struct sk_buff *skb) { __fq_adjust_removal(fq, flow, 1, skb->len, skb->truesize); } static struct sk_buff *fq_flow_dequeue(struct fq *fq, struct fq_flow *flow) { struct sk_buff *skb; lockdep_assert_held(&fq->lock); skb = __skb_dequeue(&flow->queue); if (!skb) return NULL; fq_adjust_removal(fq, flow, skb); return skb; } static int fq_flow_drop(struct fq *fq, struct fq_flow *flow, fq_skb_free_t free_func) { unsigned int packets = 0, bytes = 0, truesize = 0; struct fq_tin *tin = flow->tin; struct sk_buff *skb; int pending; lockdep_assert_held(&fq->lock); pending = min_t(int, 32, skb_queue_len(&flow->queue) / 2); do { skb = __skb_dequeue(&flow->queue); if (!skb) break; packets++; bytes += skb->len; truesize += skb->truesize; free_func(fq, tin, flow, skb); } while (packets < pending); __fq_adjust_removal(fq, flow, packets, bytes, truesize); return packets; } static struct sk_buff *fq_tin_dequeue(struct fq *fq, struct fq_tin *tin, fq_tin_dequeue_t dequeue_func) { struct fq_flow *flow; struct list_head *head; struct sk_buff *skb; lockdep_assert_held(&fq->lock); begin: head = &tin->new_flows; if (list_empty(head)) { head = &tin->old_flows; if (list_empty(head)) return NULL; } flow = list_first_entry(head, struct fq_flow, flowchain); if (flow->deficit <= 0) { flow->deficit += fq->quantum; list_move_tail(&flow->flowchain, &tin->old_flows); goto begin; } skb = dequeue_func(fq, tin, flow); if (!skb) { /* force a pass through old_flows to prevent starvation */ if ((head == &tin->new_flows) && !list_empty(&tin->old_flows)) { list_move_tail(&flow->flowchain, &tin->old_flows); } else { list_del_init(&flow->flowchain); flow->tin = NULL; } goto begin; } flow->deficit -= skb->len; tin->tx_bytes += skb->len; tin->tx_packets++; return skb; } static u32 fq_flow_idx(struct fq *fq, struct sk_buff *skb) { u32 hash = skb_get_hash(skb); return reciprocal_scale(hash, fq->flows_cnt); } static struct fq_flow *fq_flow_classify(struct fq *fq, struct fq_tin *tin, u32 idx, struct sk_buff *skb) { struct fq_flow *flow; lockdep_assert_held(&fq->lock); flow = &fq->flows[idx]; if (flow->tin && flow->tin != tin) { flow = &tin->default_flow; tin->collisions++; fq->collisions++; } if (!flow->tin) tin->flows++; return flow; } static struct fq_flow *fq_find_fattest_flow(struct fq *fq) { struct fq_tin *tin; struct fq_flow *flow = NULL; u32 len = 0; int i; for_each_set_bit(i, fq->flows_bitmap, fq->flows_cnt) { struct fq_flow *cur = &fq->flows[i]; unsigned int cur_len; cur_len = cur->backlog; if (cur_len <= len) continue; flow = cur; len = cur_len; } list_for_each_entry(tin, &fq->tin_backlog, tin_list) { unsigned int cur_len = tin->default_flow.backlog; if (cur_len <= len) continue; flow = &tin->default_flow; len = cur_len; } return flow; } static void fq_tin_enqueue(struct fq *fq, struct fq_tin *tin, u32 idx, struct sk_buff *skb, fq_skb_free_t free_func) { struct fq_flow *flow; struct sk_buff *next; bool oom; lockdep_assert_held(&fq->lock); flow = fq_flow_classify(fq, tin, idx, skb); if (!flow->backlog) { if (flow != &tin->default_flow) __set_bit(idx, fq->flows_bitmap); else if (list_empty(&tin->tin_list)) list_add(&tin->tin_list, &fq->tin_backlog); } flow->tin = tin; skb_list_walk_safe(skb, skb, next) { skb_mark_not_on_list(skb); flow->backlog += skb->len; tin->backlog_bytes += skb->len; tin->backlog_packets++; fq->memory_usage += skb->truesize; fq->backlog++; __skb_queue_tail(&flow->queue, skb); } if (list_empty(&flow->flowchain)) { flow->deficit = fq->quantum; list_add_tail(&flow->flowchain, &tin->new_flows); } oom = (fq->memory_usage > fq->memory_limit); while (fq->backlog > fq->limit || oom) { flow = fq_find_fattest_flow(fq); if (!flow) return; if (!fq_flow_drop(fq, flow, free_func)) return; flow->tin->overlimit++; fq->overlimit++; if (oom) { fq->overmemory++; oom = (fq->memory_usage > fq->memory_limit); } } } static void fq_flow_filter(struct fq *fq, struct fq_flow *flow, fq_skb_filter_t filter_func, void *filter_data, fq_skb_free_t free_func) { struct fq_tin *tin = flow->tin; struct sk_buff *skb, *tmp; lockdep_assert_held(&fq->lock); skb_queue_walk_safe(&flow->queue, skb, tmp) { if (!filter_func(fq, tin, flow, skb, filter_data)) continue; __skb_unlink(skb, &flow->queue); fq_adjust_removal(fq, flow, skb); free_func(fq, tin, flow, skb); } } static void fq_tin_filter(struct fq *fq, struct fq_tin *tin, fq_skb_filter_t filter_func, void *filter_data, fq_skb_free_t free_func) { struct fq_flow *flow; lockdep_assert_held(&fq->lock); list_for_each_entry(flow, &tin->new_flows, flowchain) fq_flow_filter(fq, flow, filter_func, filter_data, free_func); list_for_each_entry(flow, &tin->old_flows, flowchain) fq_flow_filter(fq, flow, filter_func, filter_data, free_func); } static void fq_flow_reset(struct fq *fq, struct fq_flow *flow, fq_skb_free_t free_func) { struct fq_tin *tin = flow->tin; struct sk_buff *skb; while ((skb = fq_flow_dequeue(fq, flow))) free_func(fq, tin, flow, skb); if (!list_empty(&flow->flowchain)) { list_del_init(&flow->flowchain); if (list_empty(&tin->new_flows) && list_empty(&tin->old_flows)) list_del_init(&tin->tin_list); } flow->tin = NULL; WARN_ON_ONCE(flow->backlog); } static void fq_tin_reset(struct fq *fq, struct fq_tin *tin, fq_skb_free_t free_func) { struct list_head *head; struct fq_flow *flow; for (;;) { head = &tin->new_flows; if (list_empty(head)) { head = &tin->old_flows; if (list_empty(head)) break; } flow = list_first_entry(head, struct fq_flow, flowchain); fq_flow_reset(fq, flow, free_func); } WARN_ON_ONCE(!list_empty(&tin->tin_list)); WARN_ON_ONCE(tin->backlog_bytes); WARN_ON_ONCE(tin->backlog_packets); } static void fq_flow_init(struct fq_flow *flow) { INIT_LIST_HEAD(&flow->flowchain); __skb_queue_head_init(&flow->queue); } static void fq_tin_init(struct fq_tin *tin) { INIT_LIST_HEAD(&tin->new_flows); INIT_LIST_HEAD(&tin->old_flows); INIT_LIST_HEAD(&tin->tin_list); fq_flow_init(&tin->default_flow); } static int fq_init(struct fq *fq, int flows_cnt) { int i; memset(fq, 0, sizeof(fq[0])); spin_lock_init(&fq->lock); INIT_LIST_HEAD(&fq->tin_backlog); fq->flows_cnt = max_t(u32, flows_cnt, 1); fq->quantum = 300; fq->limit = 8192; fq->memory_limit = 16 << 20; /* 16 MBytes */ fq->flows = kvcalloc(fq->flows_cnt, sizeof(fq->flows[0]), GFP_KERNEL); if (!fq->flows) return -ENOMEM; fq->flows_bitmap = bitmap_zalloc(fq->flows_cnt, GFP_KERNEL); if (!fq->flows_bitmap) { kvfree(fq->flows); fq->flows = NULL; return -ENOMEM; } for (i = 0; i < fq->flows_cnt; i++) fq_flow_init(&fq->flows[i]); return 0; } static void fq_reset(struct fq *fq, fq_skb_free_t free_func) { int i; for (i = 0; i < fq->flows_cnt; i++) fq_flow_reset(fq, &fq->flows[i], free_func); kvfree(fq->flows); fq->flows = NULL; bitmap_free(fq->flows_bitmap); fq->flows_bitmap = NULL; } #endif
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/icmpv6.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/spinlock.h> #include <net/ipv6.h> #if IS_ENABLED(CONFIG_IPV6) #if !IS_BUILTIN(CONFIG_IPV6) static ip6_icmp_send_t __rcu *ip6_icmp_send; int inet6_register_icmp_sender(ip6_icmp_send_t *fn) { return (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, NULL, fn) == NULL) ? 0 : -EBUSY; } EXPORT_SYMBOL(inet6_register_icmp_sender); int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) { int ret; ret = (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, fn, NULL) == fn) ? 0 : -EINVAL; synchronize_net(); return ret; } EXPORT_SYMBOL(inet6_unregister_icmp_sender); void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct inet6_skb_parm *parm) { ip6_icmp_send_t *send; rcu_read_lock(); send = rcu_dereference(ip6_icmp_send); if (send) send(skb, type, code, info, NULL, parm); rcu_read_unlock(); } EXPORT_SYMBOL(__icmpv6_send); #endif #if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_conntrack.h> void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) { struct inet6_skb_parm parm = { 0 }; struct sk_buff *cloned_skb = NULL; enum ip_conntrack_info ctinfo; struct in6_addr orig_ip; struct nf_conn *ct; ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(ct->status & IPS_SRC_NAT)) { __icmpv6_send(skb_in, type, code, info, &parm); return; } if (skb_shared(skb_in)) skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC); if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head || (skb_network_header(skb_in) + sizeof(struct ipv6hdr)) > skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in, skb_network_offset(skb_in) + sizeof(struct ipv6hdr)))) goto out; orig_ip = ipv6_hdr(skb_in)->saddr; ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6; __icmpv6_send(skb_in, type, code, info, &parm); ipv6_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); } EXPORT_SYMBOL(icmpv6_ndo_send); #endif #endif
75 1947 1991 1996 1996 201 1929 43 44 2011 1858 1873 40 40 40 40 40 40 23 40 40 37 4 93 40 40 171 172 170 79 73 24 7 86 2 1 1994 2011 1985 1996 1858 1860 9 1862 29 1988 1859 1997 1995 1500 1366 2010 14 1994 1997 2011 170 2129 2145 1846 400 398 1057 1058 1057 1058 1026 1305 2134 1850 1607 44 45 1 5 5 5 5 4 1 1 2 5 4 2 6 1 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 // SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * High-resolution kernel timers * * In contrast to the low-resolution timeout API, aka timer wheel, * hrtimers provide finer resolution and accuracy depending on system * configuration and capabilities. * * Started by: Thomas Gleixner and Ingo Molnar * * Credits: * Based on the original timer wheel code * * Help, testing, suggestions, bugfixes, improvements were * provided by: * * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel * et. al. */ #include <linux/cpu.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/hrtimer.h> #include <linux/notifier.h> #include <linux/syscalls.h> #include <linux/interrupt.h> #include <linux/tick.h> #include <linux/err.h> #include <linux/debugobjects.h> #include <linux/sched/signal.h> #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> #include <linux/sched/deadline.h> #include <linux/sched/nohz.h> #include <linux/sched/debug.h> #include <linux/sched/isolation.h> #include <linux/timer.h> #include <linux/freezer.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <trace/events/timer.h> #include "tick-internal.h" /* * Masks for selecting the soft and hard context timers from * cpu_base->active */ #define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT) #define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1) #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) /* * The timer bases: * * There are more clockids than hrtimer bases. Thus, we index * into the timer bases by the hrtimer_base_type enum. When trying * to reach a base using a clockid, hrtimer_clockid_to_base() * is used to convert from clockid to the proper hrtimer_base_type. */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), .clock_base = { { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, { .index = HRTIMER_BASE_MONOTONIC_SOFT, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME_SOFT, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME_SOFT, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, } }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { /* Make sure we catch unsupported clockids */ [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; /* * Functions and macros which are different for UP/SMP systems are kept in a * single place */ #ifdef CONFIG_SMP /* * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() * such that hrtimer_callback_running() can unconditionally dereference * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { .clock_base = { { .cpu_base = &migration_cpu_base, .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, &migration_cpu_base.lock), }, }, }; #define migration_base migration_cpu_base.clock_base[0] static inline bool is_migration_base(struct hrtimer_clock_base *base) { return base == &migration_base; } /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are * locked, and the base itself is locked too. * * So __run_timers/migrate_timers can safely modify all timers which could * be found on the lists/queues. * * When the timer's base is locked, and the timer removed from list, it is * possible to set timer->base = &migration_base and drop the lock: the timer * remains locked. */ static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __acquires(&timer->base->lock) { struct hrtimer_clock_base *base; for (;;) { base = READ_ONCE(timer->base); if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* The timer has migrated to another CPU: */ raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); } } /* * We do not migrate the timer when it is expiring before the next * event on the target cpu. When high resolution is enabled, we cannot * reprogram the target cpu hardware and we would cause it to fire * late. To keep it simple, we handle the high resolution enabled and * disabled case similar. * * Called with cpu_base->lock of target cpu held. */ static int hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) { ktime_t expires; expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); return expires < new_base->cpu_base->expires_next; } static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); #endif return base; } /* * We switch the timer base to a power-optimized selected CPU target, * if: * - NO_HZ_COMMON is enabled * - timer migration is enabled * - the timer callback is not running * - the timer is not the first expiring timer on the new target * * If one of the above requirements is not fulfilled we move the timer * to the current CPU or leave it on the previously assigned CPU if * the timer callback is currently running. */ static inline struct hrtimer_clock_base * switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, int pinned) { struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; struct hrtimer_clock_base *new_base; int basenum = base->index; this_cpu_base = this_cpu_ptr(&hrtimer_bases); new_cpu_base = get_target_base(this_cpu_base, pinned); again: new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { /* * We are trying to move timer to new_base. * However we can't change timer's base while it is running, * so we keep it on the same CPU. No hassle vs. reprogramming * the event source in the high resolution case. The softirq * code will take care of this when the timer function has * completed. There is no conflict as we hold the lock until * the timer is enqueued. */ if (unlikely(hrtimer_callback_running(timer))) return base; /* See the comment in lock_hrtimer_base() */ WRITE_ONCE(timer->base, &migration_base); raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; WRITE_ONCE(timer->base, base); goto again; } WRITE_ONCE(timer->base, new_base); } else { if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { new_cpu_base = this_cpu_base; goto again; } } return new_base; } #else /* CONFIG_SMP */ static inline bool is_migration_base(struct hrtimer_clock_base *base) { return false; } static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __acquires(&timer->base->cpu_base->lock) { struct hrtimer_clock_base *base = timer->base; raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); return base; } # define switch_hrtimer_base(t, b, p) (b) #endif /* !CONFIG_SMP */ /* * Functions for the union type storage format of ktime_t which are * too large for inlining: */ #if BITS_PER_LONG < 64 /* * Divide a ktime value by a nanosecond value */ s64 __ktime_divns(const ktime_t kt, s64 div) { int sft = 0; s64 dclc; u64 tmp; dclc = ktime_to_ns(kt); tmp = dclc < 0 ? -dclc : dclc; /* Make sure the divisor is less than 2^32: */ while (div >> 32) { sft++; div >>= 1; } tmp >>= sft; do_div(tmp, (u32) div); return dclc < 0 ? -tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); #endif /* BITS_PER_LONG >= 64 */ /* * Add two ktime values and do a safety check for overflow: */ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) { ktime_t res = ktime_add_unsafe(lhs, rhs); /* * We use KTIME_SEC_MAX here, the maximum timeout which we can * return to user space in a timespec: */ if (res < 0 || res < lhs || res < rhs) res = ktime_set(KTIME_SEC_MAX, 0); return res; } EXPORT_SYMBOL_GPL(ktime_add_safe); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { return ((struct hrtimer *) addr)->function; } /* * fixup_init is called when: * - an active object is initialized */ static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_init(timer, &hrtimer_debug_descr); return true; default: return false; } } /* * fixup_activate is called when: * - an active object is activated * - an unknown non-static object is activated */ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { case ODEBUG_STATE_ACTIVE: WARN_ON(1); fallthrough; default: return false; } } /* * fixup_free is called when: * - an active object is freed */ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_free(timer, &hrtimer_debug_descr); return true; default: return false; } } static const struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, .fixup_activate = hrtimer_fixup_activate, .fixup_free = hrtimer_fixup_free, }; static inline void debug_hrtimer_init(struct hrtimer *timer) { debug_object_init(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { debug_object_init_on_stack(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_object_activate(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { debug_object_deactivate(timer, &hrtimer_debug_descr); } void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); } EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else static inline void debug_hrtimer_init(struct hrtimer *timer) { } static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif static inline void debug_init(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init(timer); trace_hrtimer_init(timer, clockid, mode); } static inline void debug_init_on_stack(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init_on_stack(timer); trace_hrtimer_init(timer, clockid, mode); } static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_hrtimer_activate(timer, mode); trace_hrtimer_start(timer, mode); } static inline void debug_deactivate(struct hrtimer *timer) { debug_hrtimer_deactivate(timer); trace_hrtimer_cancel(timer); } static struct hrtimer_clock_base * __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) { unsigned int idx; if (!*active) return NULL; idx = __ffs(*active); *active &= ~(1U << idx); return &cpu_base->clock_base[idx]; } #define for_each_active_base(base, cpu_base, active) \ while ((base = __next_base((cpu_base), &(active)))) static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, const struct hrtimer *exclude, unsigned int active, ktime_t expires_next) { struct hrtimer_clock_base *base; ktime_t expires; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; struct hrtimer *timer; next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); if (timer == exclude) { /* Get to the next timer in the queue. */ next = timerqueue_iterate_next(next); if (!next) continue; timer = container_of(next, struct hrtimer, node); } expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; /* Skip cpu_base update if a timer is being excluded. */ if (exclude) continue; if (timer->is_soft) cpu_base->softirq_next_timer = timer; else cpu_base->next_timer = timer; } } /* * clock_was_set() might have changed base->offset of any of * the clock bases so the result might be negative. Fix it up * to prevent a false positive in clockevents_program_event(). */ if (expires_next < 0) expires_next = 0; return expires_next; } /* * Recomputes cpu_base::*next_timer and returns the earliest expires_next * but does not set cpu_base::*expires_next, that is done by * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating * cpu_base::*expires_next right away, reprogramming logic would no longer * work. * * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases, * those timers will get run whenever the softirq gets handled, at the end of * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases. * * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases. * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD. * * @active_mask must be one of: * - HRTIMER_ACTIVE_ALL, * - HRTIMER_ACTIVE_SOFT, or * - HRTIMER_ACTIVE_HARD. */ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) { unsigned int active; struct hrtimer *next_timer = NULL; ktime_t expires_next = KTIME_MAX; if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; cpu_base->softirq_next_timer = NULL; expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, KTIME_MAX); next_timer = cpu_base->softirq_next_timer; } if (active_mask & HRTIMER_ACTIVE_HARD) { active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; cpu_base->next_timer = next_timer; expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, expires_next); } return expires_next; } static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) { ktime_t expires_next, soft = KTIME_MAX; /* * If the soft interrupt has already been activated, ignore the * soft bases. They will be handled in the already raised soft * interrupt. */ if (!cpu_base->softirq_activated) { soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* * Update the soft expiry time. clock_settime() might have * affected it. */ cpu_base->softirq_expires_next = soft; } expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); /* * If a softirq timer is expiring first, update cpu_base->next_timer * and program the hardware with the soft expiry time. */ if (expires_next > soft) { cpu_base->next_timer = cpu_base->softirq_next_timer; expires_next = soft; } return expires_next; } static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real, offs_boot, offs_tai); base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; return now; } /* * Is the high resolution mode active ? */ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? cpu_base->hres_active : 0; } static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer, ktime_t expires_next) { cpu_base->expires_next = expires_next; /* * If hres is not active, hardware does not have to be * reprogrammed yet. * * If a hang was detected in the last timer interrupt then we * leave the hang delay active in the hardware. We want the * system to make progress. That also prevents the following * scenario: * T1 expires 50ms from now * T2 expires 5s from now * * T1 is removed, so this code is called and would reprogram * the hardware to 5s from now. Any hrtimer_start after that * will not reprogram the hardware due to hang_detected being * set. So we'd effectively block all timers until the T2 event * fires. */ if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; tick_program_event(expires_next, 1); } /* * Reprogram the event source with checking both queues for the * next event * Called with interrupts disabled and base->lock held */ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { ktime_t expires_next; expires_next = hrtimer_update_next_event(cpu_base); if (skip_equal && expires_next == cpu_base->expires_next) return; __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next); } /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS /* * High resolution timer enabled ? */ static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); /* * Enable / Disable high resolution mode */ static int __init setup_hrtimer_hres(char *str) { return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } __setup("highres=", setup_hrtimer_hres); /* * hrtimer_high_res_enabled - query, if the highres mode is enabled */ static inline int hrtimer_is_hres_enabled(void) { return hrtimer_hres_enabled; } static void retrigger_next_event(void *arg); /* * Switch to high resolution mode */ static void hrtimer_switch_to_hres(void) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu); return; } base->hres_active = 1; hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(true); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); } #else static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } #endif /* CONFIG_HIGH_RES_TIMERS */ /* * Retrigger next event is called after clock was set with interrupts * disabled through an SMP function call or directly from low level * resume code. * * This is only invoked when: * - CONFIG_HIGH_RES_TIMERS is enabled. * - CONFIG_NOHZ_COMMON is enabled * * For the other cases this function is empty and because the call sites * are optimized out it vanishes as well, i.e. no need for lots of * #ifdeffery. */ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); /* * When high resolution mode or nohz is active, then the offsets of * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the * next tick will take care of that. * * If high resolution mode is active then the next expiring timer * must be reevaluated and the clock event device reprogrammed if * necessary. * * In the NOHZ case the update of the offset and the reevaluation * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. */ if (!hrtimer_hres_active(base) && !tick_nohz_active) return; raw_spin_lock(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) hrtimer_force_reprogram(base, 0); else hrtimer_update_next_event(base); raw_spin_unlock(&base->lock); } /* * When a timer is enqueued and expires earlier than the already enqueued * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * * Called with interrupts disabled and base->cpu_base.lock held */ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base = timer->base; ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); /* * CLOCK_REALTIME timer might be requested with an absolute * expiry time which is less than base->offset. Set it to 0. */ if (expires < 0) expires = 0; if (timer->is_soft) { /* * soft hrtimer could be started on a remote CPU. In this * case softirq_expires_next needs to be updated on the * remote CPU. The soft hrtimer will not expire before the * first hard hrtimer on the remote CPU - * hrtimer_check_target() prevents this case. */ struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base; if (timer_cpu_base->softirq_activated) return; if (!ktime_before(expires, timer_cpu_base->softirq_expires_next)) return; timer_cpu_base->softirq_next_timer = timer; timer_cpu_base->softirq_expires_next = expires; if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram) return; } /* * If the timer is not on the current cpu, we cannot reprogram * the other cpus clock event device. */ if (base->cpu_base != cpu_base) return; if (expires >= cpu_base->expires_next) return; /* * If the hrtimer interrupt is running, then it will reevaluate the * clock bases and reprogram the clock event device. */ if (cpu_base->in_hrtirq) return; cpu_base->next_timer = timer; __hrtimer_reprogram(cpu_base, timer, expires); } static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active) { struct hrtimer_clock_base *base; unsigned int seq; ktime_t expires; /* * Update the base offsets unconditionally so the following * checks whether the SMP function call is required works. * * The update is safe even when the remote CPU is in the hrtimer * interrupt or the hrtimer soft interrupt and expiring affected * bases. Either it will see the update before handling a base or * it will see it when it finishes the processing and reevaluates * the next expiring timer. */ seq = cpu_base->clock_was_set_seq; hrtimer_update_base(cpu_base); /* * If the sequence did not change over the update then the * remote CPU already handled it. */ if (seq == cpu_base->clock_was_set_seq) return false; /* * If the remote CPU is currently handling an hrtimer interrupt, it * will reevaluate the first expiring timer of all clock bases * before reprogramming. Nothing to do here. */ if (cpu_base->in_hrtirq) return false; /* * Walk the affected clock bases and check whether the first expiring * timer in a clock base is moving ahead of the first expiring timer of * @cpu_base. If so, the IPI must be invoked because per CPU clock * event devices cannot be remotely reprogrammed. */ active &= cpu_base->active_bases; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; next = timerqueue_getnext(&base->active); expires = ktime_sub(next->expires, base->offset); if (expires < cpu_base->expires_next) return true; /* Extra check for softirq clock bases */ if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT) continue; if (cpu_base->softirq_activated) continue; if (expires < cpu_base->softirq_expires_next) return true; } return false; } /* * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and * CLOCK_BOOTTIME (for late sleep time injection). * * This requires to update the offsets for these clocks * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this * also requires to eventually reprogram the per CPU clock event devices * when the change moves an affected timer ahead of the first expiring * timer on that CPU. Obviously remote per CPU clock event devices cannot * be reprogrammed. The other reason why an IPI has to be sent is when the * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets * in the tick, which obviously might be stopped, so this has to bring out * the remote CPU which might sleep in idle to get this sorted. */ void clock_was_set(unsigned int bases) { struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); cpumask_var_t mask; int cpu; if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { on_each_cpu(retrigger_next_event, NULL, 1); goto out_timerfd; } /* Avoid interrupting CPUs if possible */ cpus_read_lock(); for_each_online_cpu(cpu) { unsigned long flags; cpu_base = &per_cpu(hrtimer_bases, cpu); raw_spin_lock_irqsave(&cpu_base->lock, flags); if (update_needs_ipi(cpu_base, bases)) cpumask_set_cpu(cpu, mask); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } preempt_disable(); smp_call_function_many(mask, retrigger_next_event, NULL, 1); preempt_enable(); cpus_read_unlock(); free_cpumask_var(mask); out_timerfd: timerfd_clock_was_set(); } static void clock_was_set_work(struct work_struct *work) { clock_was_set(CLOCK_SET_WALL); } static DECLARE_WORK(hrtimer_work, clock_was_set_work); /* * Called from timekeeping code to reprogram the hrtimer interrupt device * on all cpus and to notify timerfd. */ void clock_was_set_delayed(void) { schedule_work(&hrtimer_work); } /* * Called during resume either directly from via timekeeping_resume() * or in the case of s2idle from tick_unfreeze() to ensure that the * hrtimers are up to date. */ void hrtimers_resume_local(void) { lockdep_assert_irqs_disabled(); /* Retrigger on the local CPU */ retrigger_next_event(NULL); } /* * Counterpart to lock_hrtimer_base above: */ static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __releases(&timer->base->cpu_base->lock) { raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** * hrtimer_forward() - forward the timer expiry * @timer: hrtimer to forward * @now: forward past this time * @interval: the interval to forward * * Forward the timer expiry so it will expire in the future. * * .. note:: * This only updates the timer expiry value and does not requeue the timer. * * There is also a variant of the function hrtimer_forward_now(). * * Context: Can be safely called from the callback function of @timer. If called * from other contexts @timer must neither be enqueued nor running the * callback and the caller needs to take care of serialization. * * Return: The number of overruns are returned. */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { u64 orun = 1; ktime_t delta; delta = ktime_sub(now, hrtimer_get_expires(timer)); if (delta < 0) return 0; if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) return 0; if (interval < hrtimer_resolution) interval = hrtimer_resolution; if (unlikely(delta >= interval)) { s64 incr = ktime_to_ns(interval); orun = ktime_divns(delta, incr); hrtimer_add_expires_ns(timer, incr * orun); if (hrtimer_get_expires_tv64(timer) > now) return orun; /* * This (and the ktime_add() below) is the * correction for exact: */ orun++; } hrtimer_add_expires(timer, interval); return orun; } EXPORT_SYMBOL_GPL(hrtimer_forward); /* * enqueue_hrtimer - internal function to (re)start a timer * * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. * * Returns 1 when the new timer is the leftmost timer in the tree. */ static int enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, enum hrtimer_mode mode) { debug_activate(timer, mode); WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } /* * __remove_hrtimer - internal function to remove a timer * * Caller must hold the base lock. * * High resolution timer mode reprograms the clock event device when the * timer is the one which expires next. The caller can disable this by setting * reprogram to zero. This is useful, when the context does a reprogramming * anyway (e.g. timer interrupt) */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; u8 state = timer->state; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->state, newstate); if (!(state & HRTIMER_STATE_ENQUEUED)) return; if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); /* * Note: If reprogram is false we do not update * cpu_base->next_timer. This happens when we remove the first * timer on a remote cpu. No harm as we never dereference * cpu_base->next_timer. So the worst thing what can happen is * an superfluous call to hrtimer_force_reprogram() on the * remote cpu later on if the same timer gets enqueued again. */ if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); } /* * remove hrtimer, called with base lock held */ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart, bool keep_local) { u8 state = timer->state; if (state & HRTIMER_STATE_ENQUEUED) { bool reprogram; /* * Remove the timer and force reprogramming when high * resolution mode is active and the timer is on the current * CPU. If we remove a timer on another CPU, reprogramming is * skipped. The interrupt event on this CPU is fired and * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); /* * If the timer is not restarted then reprogramming is * required if the timer is local. If it is local and about * to be restarted, avoid programming it twice (on removal * and a moment later when it's requeued). */ if (!restart) state = HRTIMER_STATE_INACTIVE; else reprogram &= !keep_local; __remove_hrtimer(timer, base, state, reprogram); return 1; } return 0; } static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { #ifdef CONFIG_TIME_LOW_RES /* * CONFIG_TIME_LOW_RES indicates that the system has no way to return * granular time values. For relative timers we add hrtimer_resolution * (i.e. one jiffy) to prevent short timeouts. */ timer->is_rel = mode & HRTIMER_MODE_REL; if (timer->is_rel) tim = ktime_add_safe(tim, hrtimer_resolution); #endif return tim; } static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) { ktime_t expires; /* * Find the next SOFT expiration. */ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* * reprogramming needs to be triggered, even if the next soft * hrtimer expires at the same time than the next hard * hrtimer. cpu_base->softirq_expires_next needs to be updated! */ if (expires == KTIME_MAX) return; /* * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() * cpu_base->*expires_next is only set by hrtimer_reprogram() */ hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); } static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { struct hrtimer_clock_base *new_base; bool force_local, first; /* * If the timer is on the local cpu base and is the first expiring * timer then this might end up reprogramming the hardware twice * (on removal and on enqueue). To avoid that by prevent the * reprogram on removal, keep the timer local to the current CPU * and enforce reprogramming after it is queued no matter whether * it is the new first expiring timer again or not. */ force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); force_local &= base->cpu_base->next_timer == timer; /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the * remote data correctly. * * If it's on the current CPU and the first expiring timer, then * skip reprogramming, keep the timer local and enforce * reprogramming later if it was the first expiring timer. This * avoids programming the underlying clock event twice (once at * removal and once after enqueue). */ remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); tim = hrtimer_update_lowres(timer, tim, mode); hrtimer_set_expires_range_ns(timer, tim, delta_ns); /* Switch the timer base, if necessary: */ if (!force_local) { new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); } else { new_base = base; } first = enqueue_hrtimer(timer, new_base, mode); if (!force_local) return first; /* * Timer was forced to stay on the current CPU to avoid * reprogramming on removal and enqueue. Force reprogram the * hardware by evaluating the new first expiring timer. */ hrtimer_force_reprogram(new_base->cpu_base, 1); return 0; } /** * hrtimer_start_range_ns - (re)start an hrtimer * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); * softirq based mode is considered for debug purpose only! */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base; unsigned long flags; if (WARN_ON_ONCE(!timer->function)) return; /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard * expiry mode because unmarked timers are moved to softirq expiry. */ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); else WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); base = lock_hrtimer_base(timer, &flags); if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) hrtimer_reprogram(timer, true); unlock_hrtimer_base(timer, &flags); } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** * hrtimer_try_to_cancel - try to deactivate a timer * @timer: hrtimer to stop * * Returns: * * * 0 when the timer was not active * * 1 when the timer was active * * -1 when the timer is currently executing the callback function and * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) { struct hrtimer_clock_base *base; unsigned long flags; int ret = -1; /* * Check lockless first. If the timer is not active (neither * enqueued nor running the callback, nothing to do here. The * base lock does not serialize against a concurrent enqueue, * so we can avoid taking it. */ if (!hrtimer_active(timer)) return 0; base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) ret = remove_hrtimer(timer, base, false, false); unlock_hrtimer_base(timer, &flags); return ret; } EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); #ifdef CONFIG_PREEMPT_RT static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { spin_lock_init(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) __acquires(&base->softirq_expiry_lock) { spin_lock(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) __releases(&base->softirq_expiry_lock) { spin_unlock(&base->softirq_expiry_lock); } /* * The counterpart to hrtimer_cancel_wait_running(). * * If there is a waiter for cpu_base->expiry_lock, then it was waiting for * the timer callback to finish. Drop expiry_lock and reacquire it. That * allows the waiter to acquire the lock and make progress. */ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags) { if (atomic_read(&cpu_base->timer_waiters)) { raw_spin_unlock_irqrestore(&cpu_base->lock, flags); spin_unlock(&cpu_base->softirq_expiry_lock); spin_lock(&cpu_base->softirq_expiry_lock); raw_spin_lock_irq(&cpu_base->lock); } } /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was * running. * * This prevents priority inversion: if the soft irq thread is preempted * in the middle of a timer callback, then calling del_timer_sync() can * lead to two issues: * * - If the caller is on a remote CPU then it has to spin wait for the timer * handler to complete. This can result in unbound priority inversion. * * - If the caller originates from the task which preempted the timer * handler on the same CPU, then spin waiting for the timer handler to * complete is never going to end. */ void hrtimer_cancel_wait_running(const struct hrtimer *timer) { /* Lockless read. Prevent the compiler from reloading it below */ struct hrtimer_clock_base *base = READ_ONCE(timer->base); /* * Just relax if the timer expires in hard interrupt context or if * it is currently on the migration base. */ if (!timer->is_soft || is_migration_base(base)) { cpu_relax(); return; } /* * Mark the base as contended and grab the expiry lock, which is * held by the softirq across the timer callback. Drop the lock * immediately so the softirq can expire the next timer. In theory * the timer could already be running again, but that's more than * unlikely and just causes another wait loop. */ atomic_inc(&base->cpu_base->timer_waiters); spin_lock_bh(&base->cpu_base->softirq_expiry_lock); atomic_dec(&base->cpu_base->timer_waiters); spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); } #else static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long flags) { } #endif /** * hrtimer_cancel - cancel a timer and wait for the handler to finish. * @timer: the timer to be cancelled * * Returns: * 0 when the timer was not active * 1 when the timer was active */ int hrtimer_cancel(struct hrtimer *timer) { int ret; do { ret = hrtimer_try_to_cancel(timer); if (ret < 0) hrtimer_cancel_wait_running(timer); } while (ret < 0); return ret; } EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * __hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) { unsigned long flags; ktime_t rem; lock_hrtimer_base(timer, &flags); if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) rem = hrtimer_expires_remaining_adjusted(timer); else rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; } EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); #ifdef CONFIG_NO_HZ_COMMON /** * hrtimer_get_next_event - get the time until next expiry event * * Returns the next expiry time or KTIME_MAX if no timer is pending. */ u64 hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!hrtimer_hres_active(cpu_base)) expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); return expires; } /** * hrtimer_next_event_without - time until next expiry event w/o one timer * @exclude: timer to exclude * * Returns the next expiry time over all timers except for the @exclude one or * KTIME_MAX if none of them is pending. */ u64 hrtimer_next_event_without(const struct hrtimer *exclude) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (hrtimer_hres_active(cpu_base)) { unsigned int active; if (!cpu_base->softirq_activated) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; expires = __hrtimer_next_event_base(cpu_base, exclude, active, KTIME_MAX); } active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; expires = __hrtimer_next_event_base(cpu_base, exclude, active, expires); } raw_spin_unlock_irqrestore(&cpu_base->lock, flags); return expires; } #endif static inline int hrtimer_clockid_to_base(clockid_t clock_id) { if (likely(clock_id < MAX_CLOCKS)) { int base = hrtimer_clock_to_base_table[clock_id]; if (likely(base != HRTIMER_MAX_CLOCK_BASES)) return base; } WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); return HRTIMER_BASE_MONOTONIC; } static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) { return HRTIMER_NORESTART; } static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); struct hrtimer_cpu_base *cpu_base; int base; /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context for latency reasons and because the callbacks * can invoke functions which might sleep on RT, e.g. spin_lock(). */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD)) softtimer = true; memset(timer, 0, sizeof(struct hrtimer)); cpu_base = raw_cpu_ptr(&hrtimer_bases); /* * POSIX magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they needs to become CLOCK_MONOTONIC to * ensure POSIX compliance. */ if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) clock_id = CLOCK_MONOTONIC; base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { __hrtimer_init(timer, clock_id, mode); if (WARN_ON_ONCE(!function)) timer->function = hrtimer_dummy_timeout; else timer->function = function; } /** * hrtimer_init - initialize a timer to the given clock * @timer: the timer to be initialized * @clock_id: the clock to be used * @mode: The modes which are relevant for initialization: * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, * HRTIMER_MODE_REL_SOFT * * The PINNED variants of the above can be handed in, * but the PINNED bit is ignored as pinning happens * when the hrtimer is started */ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { debug_init(timer, clock_id, mode); __hrtimer_init(timer, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init); /** * hrtimer_setup - initialize a timer to the given clock * @timer: the timer to be initialized * @function: the callback function * @clock_id: the clock to be used * @mode: The modes which are relevant for initialization: * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, * HRTIMER_MODE_REL_SOFT * * The PINNED variants of the above can be handed in, * but the PINNED bit is ignored as pinning happens * when the hrtimer is started */ void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { debug_init(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup); /** * hrtimer_setup_on_stack - initialize a timer on stack memory * @timer: The timer to be initialized * @function: the callback function * @clock_id: The clock to be used * @mode: The timer mode * * Similar to hrtimer_setup(), except that this one must be used if struct hrtimer is in stack * memory. */ void hrtimer_setup_on_stack(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { debug_init_on_stack(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack); /* * A timer is active, when it is enqueued into the rbtree or the * callback function is running or it's in the state of being migrated * to another cpu. * * It is important for this function to not return a false negative. */ bool hrtimer_active(const struct hrtimer *timer) { struct hrtimer_clock_base *base; unsigned int seq; do { base = READ_ONCE(timer->base); seq = raw_read_seqcount_begin(&base->seq); if (timer->state != HRTIMER_STATE_INACTIVE || base->running == timer) return true; } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base)); return false; } EXPORT_SYMBOL_GPL(hrtimer_active); /* * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3 * distinct sections: * * - queued: the timer is queued * - callback: the timer is being ran * - post: the timer is inactive or (re)queued * * On the read side we ensure we observe timer->state and cpu_base->running * from the same section, if anything changed while we looked at it, we retry. * This includes timer->base changing because sequence numbers alone are * insufficient for that. * * The sequence numbers are required because otherwise we could still observe * a false negative if the read side got smeared over multiple consecutive * __run_hrtimer() invocations. */ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, struct hrtimer *timer, ktime_t *now, unsigned long flags) __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); bool expires_in_hardirq; int restart; lockdep_assert_held(&cpu_base->lock); debug_deactivate(timer); base->running = timer; /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running == NULL && * timer->state == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); fn = timer->function; /* * Clear the 'is relative' flag for the TIME_LOW_RES case. If the * timer is restarted with a period then it becomes an absolute * timer. If its not restarted it does not matter. */ if (IS_ENABLED(CONFIG_TIME_LOW_RES)) timer->is_rel = false; /* * The timer is marked as running in the CPU base, so it is * protected against migration to a different CPU even if the lock * is dropped. */ raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); expires_in_hardirq = lockdep_hrtimer_enter(timer); restart = fn(timer); lockdep_hrtimer_exit(expires_in_hardirq); trace_hrtimer_expire_exit(timer); raw_spin_lock_irq(&cpu_base->lock); /* * Note: We clear the running state after enqueue_hrtimer and * we do not reprogram the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() * * Note: Because we dropped the cpu_base->lock above, * hrtimer_start_range_ns() can have popped in and enqueued the timer * for us already. */ if (restart != HRTIMER_NORESTART && !(timer->state & HRTIMER_STATE_ENQUEUED)) enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running.timer == NULL && * timer->state == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); WARN_ON_ONCE(base->running != timer); base->running = NULL; } static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, unsigned long flags, unsigned int active_mask) { struct hrtimer_clock_base *base; unsigned int active = cpu_base->active_bases & active_mask; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *node; ktime_t basenow; basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; timer = container_of(node, struct hrtimer, node); /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the * earliest interrupt after their soft expiration. * This allows us to avoid using a Priority Search * Tree, which can answer a stabbing query for * overlapping intervals and instead use the simple * BST we already have. * We don't add extra wakeups by delaying timers that * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. */ if (basenow < hrtimer_get_softexpires_tv64(timer)) break; __run_hrtimer(cpu_base, base, timer, &basenow, flags); if (active_mask == HRTIMER_ACTIVE_SOFT) hrtimer_sync_wait_running(cpu_base, flags); } } } static __latent_entropy void hrtimer_run_softirq(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; ktime_t now; hrtimer_cpu_base_lock_expiry(cpu_base); raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); cpu_base->softirq_activated = 0; hrtimer_update_softirq_timer(cpu_base, true); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); hrtimer_cpu_base_unlock_expiry(cpu_base); } #ifdef CONFIG_HIGH_RES_TIMERS /* * High resolution timer interrupt * Called with interrupts disabled */ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires_next, now, entry_time, delta; unsigned long flags; int retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event = KTIME_MAX; raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: cpu_base->in_hrtirq = 1; /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via * the migration code. This does not affect enqueueing of * timers which run their callback and need to be requeued on * this CPU. */ cpu_base->expires_next = KTIME_MAX; if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_timer_softirq(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); /* Reevaluate the clock bases for the [soft] next expiry */ expires_next = hrtimer_update_next_event(cpu_base); /* * Store the new expiry value so the migration code can verify * against it. */ cpu_base->expires_next = expires_next; cpu_base->in_hrtirq = 0; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); /* Reprogramming necessary ? */ if (!tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; return; } /* * The next timer was already expired due to: * - tracing * - long lasting callbacks * - being scheduled away when running in a VM * * We need to prevent that we loop forever in the hrtimer * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. * * Acquire base lock for updating the offsets and retrieving * the current time. */ raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); cpu_base->nr_retries++; if (++retries < 3) goto retry; /* * Give the system a chance to do something else than looping * here. We stored the entry time, so we know exactly how long * we spent here. We schedule the next event this amount of * time away. */ cpu_base->nr_hangs++; cpu_base->hang_detected = 1; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); delta = ktime_sub(now, entry_time); if ((unsigned int)delta > cpu_base->max_hang_time) cpu_base->max_hang_time = (unsigned int) delta; /* * Limit it to a sensible value as we enforce a longer * delay. Give the CPU at least 100ms to catch up. */ if (delta > 100 * NSEC_PER_MSEC) expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); else expires_next = ktime_add(now, delta); tick_program_event(expires_next, 1); pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); } #endif /* !CONFIG_HIGH_RES_TIMERS */ /* * Called from run_local_timers in hardirq context every jiffy */ void hrtimer_run_queues(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; ktime_t now; if (hrtimer_hres_active(cpu_base)) return; /* * This _is_ ugly: We have to check periodically, whether we * can switch to highres and / or nohz mode. The clocksource * switch happens with xtime_lock held. Notification from * there only sets the check bit in the tick_oneshot code, * otherwise we might deadlock vs. xtime_lock. */ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) { hrtimer_switch_to_hres(); return; } raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_timer_softirq(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } /* * Sleep related functions: */ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) { struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer); struct task_struct *task = t->task; t->task = NULL; if (task) wake_up_process(task); return HRTIMER_NORESTART; } /** * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer * @sl: sleeper to be started * @mode: timer mode abs/rel * * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) */ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode) { /* * Make the enqueue delivery mode check work on RT. If the sleeper * was initialized for hard interrupt delivery, force the mode bit. * This is a special case for hrtimer_sleepers because * __hrtimer_init_sleeper() determines the delivery mode on RT so the * fiddling with this decision is avoided at the call sites. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) mode |= HRTIMER_MODE_HARD; hrtimer_start_expires(&sl->timer, mode); } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context either for latency reasons or because the * hrtimer callback takes regular spinlocks or invokes other * functions which are not suitable for hard interrupt context on * PREEMPT_RT. * * The hrtimer_sleeper callback is RT compatible in hard interrupt * context, but there is a latency concern: Untrusted userspace can * spawn many threads which arm timers for the same expiry time on * the same CPU. That causes a latency spike due to the wakeup of * a gazillion threads. * * OTOH, privileged real-time user space applications rely on the * low latency of hard interrupt wakeups. If the current task is in * a real-time scheduling class, mark the mode for hard interrupt * expiry. */ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT)) mode |= HRTIMER_MODE_HARD; } __hrtimer_init(&sl->timer, clock_id, mode); sl->timer.function = hrtimer_wakeup; sl->task = current; } /** * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory * @sl: sleeper to be initialized * @clock_id: the clock to be used * @mode: timer mode abs/rel */ void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { debug_init_on_stack(&sl->timer, clock_id, mode); __hrtimer_init_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack); int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) { switch(restart->nanosleep.type) { #ifdef CONFIG_COMPAT_32BIT_TIME case TT_COMPAT: if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp)) return -EFAULT; break; #endif case TT_NATIVE: if (put_timespec64(ts, restart->nanosleep.rmtp)) return -EFAULT; break; default: BUG(); } return -ERESTART_RESTARTBLOCK; } static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { struct restart_block *restart; do { set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); hrtimer_sleeper_start_expires(t, mode); if (likely(t->task)) schedule(); hrtimer_cancel(&t->timer); mode = HRTIMER_MODE_ABS; } while (t->task && !signal_pending(current)); __set_current_state(TASK_RUNNING); if (!t->task) return 0; restart = &current->restart_block; if (restart->nanosleep.type != TT_NONE) { ktime_t rem = hrtimer_expires_remaining(&t->timer); struct timespec64 rmt; if (rem <= 0) return 0; rmt = ktime_to_timespec64(rem); return nanosleep_copyout(restart, &rmt); } return -ERESTART_RESTARTBLOCK; } static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; int ret; hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; } long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; hrtimer_setup_sleeper_on_stack(&t, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; /* Absolute timers do not update the rmtp value and restart: */ if (mode == HRTIMER_MODE_ABS) { ret = -ERESTARTNOHAND; goto out; } restart = &current->restart_block; restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); set_restart_fn(restart, hrtimer_nanosleep_restart); out: destroy_hrtimer_on_stack(&t.timer); return ret; } #ifdef CONFIG_64BIT SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) { struct timespec64 tu; if (get_timespec64(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, struct old_timespec32 __user *, rmtp) { struct timespec64 tu; if (get_old_timespec32(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif /* * Functions related to boot-time initialization: */ int hrtimers_prepare_cpu(unsigned int cpu) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; clock_b->cpu_base = cpu_base; seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); timerqueue_init_head(&clock_b->active); } cpu_base->cpu = cpu; cpu_base->active_bases = 0; cpu_base->hres_active = 0; cpu_base->hang_detected = 0; cpu_base->next_timer = NULL; cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->online = 1; hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } #ifdef CONFIG_HOTPLUG_CPU static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { struct hrtimer *timer; struct timerqueue_node *node; while ((node = timerqueue_getnext(&old_base->active))) { timer = container_of(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_deactivate(timer); /* * Mark it as ENQUEUED not INACTIVE otherwise the * timer could be seen as !active and just vanish away * under us on another CPU */ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not * reprogram the event device in case the timer * expires before the earliest on this CPU, but we run * hrtimer_interrupt after we migrated everything to * sort out already expired timers and reprogram the * event device. */ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); } } int hrtimers_cpu_dying(unsigned int dying_cpu) { int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER)); struct hrtimer_cpu_base *old_base, *new_base; old_base = this_cpu_ptr(&hrtimer_bases); new_base = &per_cpu(hrtimer_bases, ncpu); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ raw_spin_lock(&old_base->lock); raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); } /* * The migration might have changed the first expiring softirq * timer on this CPU. Update it. */ __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); raw_spin_unlock(&new_base->lock); old_base->online = 0; raw_spin_unlock(&old_base->lock); return 0; } #endif /* CONFIG_HOTPLUG_CPU */ void __init hrtimers_init(void) { hrtimers_prepare_cpu(smp_processor_id()); open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); }
9 7 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ #include "hash.h" #include "main.h" #include <linux/gfp.h> #include <linux/lockdep.h> #include <linux/slab.h> /* clears the hash */ static void batadv_hash_init(struct batadv_hashtable *hash) { u32 i; for (i = 0; i < hash->size; i++) { INIT_HLIST_HEAD(&hash->table[i]); spin_lock_init(&hash->list_locks[i]); } atomic_set(&hash->generation, 0); } /** * batadv_hash_destroy() - Free only the hashtable and the hash itself * @hash: hash object to destroy */ void batadv_hash_destroy(struct batadv_hashtable *hash) { kfree(hash->list_locks); kfree(hash->table); kfree(hash); } /** * batadv_hash_new() - Allocates and clears the hashtable * @size: number of hash buckets to allocate * * Return: newly allocated hashtable, NULL on errors */ struct batadv_hashtable *batadv_hash_new(u32 size) { struct batadv_hashtable *hash; hash = kmalloc(sizeof(*hash), GFP_ATOMIC); if (!hash) return NULL; hash->table = kmalloc_array(size, sizeof(*hash->table), GFP_ATOMIC); if (!hash->table) goto free_hash; hash->list_locks = kmalloc_array(size, sizeof(*hash->list_locks), GFP_ATOMIC); if (!hash->list_locks) goto free_table; hash->size = size; batadv_hash_init(hash); return hash; free_table: kfree(hash->table); free_hash: kfree(hash); return NULL; } /** * batadv_hash_set_lock_class() - Set specific lockdep class for hash spinlocks * @hash: hash object to modify * @key: lockdep class key address */ void batadv_hash_set_lock_class(struct batadv_hashtable *hash, struct lock_class_key *key) { u32 i; for (i = 0; i < hash->size; i++) lockdep_set_class(&hash->list_locks[i], key); }
20 2331 146 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM maple_tree #if !defined(_TRACE_MM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MM_H #include <linux/tracepoint.h> struct ma_state; TRACE_EVENT(ma_op, TP_PROTO(const char *fn, struct ma_state *mas), TP_ARGS(fn, mas), TP_STRUCT__entry( __field(const char *, fn) __field(unsigned long, min) __field(unsigned long, max) __field(unsigned long, index) __field(unsigned long, last) __field(void *, node) ), TP_fast_assign( __entry->fn = fn; __entry->min = mas->min; __entry->max = mas->max; __entry->index = mas->index; __entry->last = mas->last; __entry->node = mas->node; ), TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu", __entry->fn, (void *) __entry->node, (unsigned long) __entry->min, (unsigned long) __entry->max, (unsigned long) __entry->index, (unsigned long) __entry->last ) ) TRACE_EVENT(ma_read, TP_PROTO(const char *fn, struct ma_state *mas), TP_ARGS(fn, mas), TP_STRUCT__entry( __field(const char *, fn) __field(unsigned long, min) __field(unsigned long, max) __field(unsigned long, index) __field(unsigned long, last) __field(void *, node) ), TP_fast_assign( __entry->fn = fn; __entry->min = mas->min; __entry->max = mas->max; __entry->index = mas->index; __entry->last = mas->last; __entry->node = mas->node; ), TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu", __entry->fn, (void *) __entry->node, (unsigned long) __entry->min, (unsigned long) __entry->max, (unsigned long) __entry->index, (unsigned long) __entry->last ) ) TRACE_EVENT(ma_write, TP_PROTO(const char *fn, struct ma_state *mas, unsigned long piv, void *val), TP_ARGS(fn, mas, piv, val), TP_STRUCT__entry( __field(const char *, fn) __field(unsigned long, min) __field(unsigned long, max) __field(unsigned long, index) __field(unsigned long, last) __field(unsigned long, piv) __field(void *, val) __field(void *, node) ), TP_fast_assign( __entry->fn = fn; __entry->min = mas->min; __entry->max = mas->max; __entry->index = mas->index; __entry->last = mas->last; __entry->piv = piv; __entry->val = val; __entry->node = mas->node; ), TP_printk("%s\tNode %p (%lu %lu) range:%lu-%lu piv (%lu) val %p", __entry->fn, (void *) __entry->node, (unsigned long) __entry->min, (unsigned long) __entry->max, (unsigned long) __entry->index, (unsigned long) __entry->last, (unsigned long) __entry->piv, (void *) __entry->val ) ) #endif /* _TRACE_MM_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 /* SPDX-License-Identifier: GPL-2.0 */ /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Definitions for SMC Connections, Link Groups and Links * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #ifndef _SMC_CORE_H #define _SMC_CORE_H #include <linux/atomic.h> #include <linux/smc.h> #include <linux/pci.h> #include <rdma/ib_verbs.h> #include <net/genetlink.h> #include <net/smc.h> #include "smc.h" #include "smc_ib.h" #include "smc_clc.h" #define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ #define SMC_CONN_PER_LGR_MIN 16 /* min. # of connections per link group */ #define SMC_CONN_PER_LGR_MAX 255 /* max. # of connections per link group, * also is the default value for SMC-R v1 and v2.0 */ #define SMC_CONN_PER_LGR_PREFER 255 /* Preferred connections per link group used for * SMC-R v2.1 and later negotiation, vendors or * distributions may modify it to a value between * 16-255 as needed. */ struct smc_lgr_list { /* list of link group definition */ struct list_head list; spinlock_t lock; /* protects list of link groups */ u32 num; /* unique link group number */ }; enum smc_lgr_role { /* possible roles of a link group */ SMC_CLNT, /* client */ SMC_SERV /* server */ }; enum smc_link_state { /* possible states of a link */ SMC_LNK_UNUSED, /* link is unused */ SMC_LNK_INACTIVE, /* link is inactive */ SMC_LNK_ACTIVATING, /* link is being activated */ SMC_LNK_ACTIVE, /* link is active */ }; #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ #define SMC_WR_BUF_V2_SIZE 8192 /* size of v2 work request buffer */ struct smc_wr_buf { u8 raw[SMC_WR_BUF_SIZE]; }; struct smc_wr_v2_buf { u8 raw[SMC_WR_BUF_V2_SIZE]; }; #define SMC_WR_REG_MR_WAIT_TIME (5 * HZ)/* wait time for ib_wr_reg_mr result */ enum smc_wr_reg_state { POSTED, /* ib_wr_reg_mr request posted */ CONFIRMED, /* ib_wr_reg_mr response: successful */ FAILED /* ib_wr_reg_mr response: failure */ }; struct smc_rdma_sge { /* sges for RDMA writes */ struct ib_sge wr_tx_rdma_sge[SMC_IB_MAX_SEND_SGE]; }; #define SMC_MAX_RDMA_WRITES 2 /* max. # of RDMA writes per * message send */ struct smc_rdma_sges { /* sges per message send */ struct smc_rdma_sge tx_rdma_sge[SMC_MAX_RDMA_WRITES]; }; struct smc_rdma_wr { /* work requests per message * send */ struct ib_rdma_wr wr_tx_rdma[SMC_MAX_RDMA_WRITES]; }; #define SMC_LGR_ID_SIZE 4 struct smc_link { struct smc_ib_device *smcibdev; /* ib-device */ u8 ibport; /* port - values 1 | 2 */ struct ib_pd *roce_pd; /* IB protection domain, * unique for every RoCE QP */ struct ib_qp *roce_qp; /* IB queue pair */ struct ib_qp_attr qp_attr; /* IB queue pair attributes */ struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */ struct ib_send_wr *wr_tx_ibs; /* WR send meta data */ struct ib_sge *wr_tx_sges; /* WR send gather meta data */ struct smc_rdma_sges *wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/ struct smc_rdma_wr *wr_tx_rdmas; /* WR RDMA WRITE */ struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */ struct completion *wr_tx_compl; /* WR send CQE completion */ /* above four vectors have wr_tx_cnt elements and use the same index */ struct ib_send_wr *wr_tx_v2_ib; /* WR send v2 meta data */ struct ib_sge *wr_tx_v2_sge; /* WR send v2 gather meta data*/ struct smc_wr_tx_pend *wr_tx_v2_pend; /* WR send v2 waiting for CQE */ dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */ dma_addr_t wr_tx_v2_dma_addr; /* DMA address of v2 tx buf*/ atomic_long_t wr_tx_id; /* seq # of last sent WR */ unsigned long *wr_tx_mask; /* bit mask of used indexes */ u32 wr_tx_cnt; /* number of WR send buffers */ wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */ struct { struct percpu_ref wr_tx_refs; } ____cacheline_aligned_in_smp; struct completion tx_ref_comp; struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */ /* above three vectors have wr_rx_cnt elements and use the same index */ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/ u64 wr_rx_id; /* seq # of last recv WR */ u64 wr_rx_id_compl; /* seq # of last completed WR */ u32 wr_rx_cnt; /* number of WR recv buffers */ unsigned long wr_rx_tstamp; /* jiffies when last buf rx */ wait_queue_head_t wr_rx_empty_wait; /* wait for RQ empty */ struct ib_reg_wr wr_reg; /* WR register memory region */ wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ struct { struct percpu_ref wr_reg_refs; } ____cacheline_aligned_in_smp; struct completion reg_ref_comp; enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ u8 sgid_index; /* gid index for vlan id */ u32 peer_qpn; /* QP number of peer */ enum ib_mtu path_mtu; /* used mtu */ enum ib_mtu peer_mtu; /* mtu size of peer */ u32 psn_initial; /* QP tx initial packet seqno */ u32 peer_psn; /* QP rx initial packet seqno */ u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ u8 link_uid[SMC_LGR_ID_SIZE]; /* unique lnk id */ u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */ u8 link_idx; /* index in lgr link array */ u8 link_is_asym; /* is link asymmetric? */ u8 clearing : 1; /* link is being cleared */ refcount_t refcnt; /* link reference count */ struct smc_link_group *lgr; /* parent link group */ struct work_struct link_down_wrk; /* wrk to bring link down */ char ibname[IB_DEVICE_NAME_MAX]; /* ib device name */ int ndev_ifidx; /* network device ifindex */ enum smc_link_state state; /* state of link */ struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* connections on this link */ }; /* For now we just allow one parallel link per link group. The SMC protocol * allows more (up to 8). */ #define SMC_LINKS_PER_LGR_MAX 3 #define SMC_SINGLE_LINK 0 #define SMC_LINKS_ADD_LNK_MIN 1 /* min. # of links per link group */ #define SMC_LINKS_ADD_LNK_MAX 2 /* max. # of links per link group, also is the * default value for smc-r v1.0 and v2.0 */ #define SMC_LINKS_PER_LGR_MAX_PREFER 2 /* Preferred max links per link group used for * SMC-R v2.1 and later negotiation, vendors or * distributions may modify it to a value between * 1-2 as needed. */ /* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */ struct smc_buf_desc { struct list_head list; void *cpu_addr; /* virtual address of buffer */ struct page *pages; int len; /* length of buffer */ u32 used; /* currently used / unused */ union { struct { /* SMC-R */ struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; /* virtual buffer */ struct ib_mr *mr[SMC_LINKS_PER_LGR_MAX]; /* memory region: for rmb and * vzalloced sndbuf * incl. rkey provided to peer * and lkey provided to local */ u32 order; /* allocation order */ u8 is_conf_rkey; /* confirm_rkey done */ u8 is_reg_mr[SMC_LINKS_PER_LGR_MAX]; /* mem region registered */ u8 is_map_ib[SMC_LINKS_PER_LGR_MAX]; /* mem region mapped to lnk */ u8 is_dma_need_sync; u8 is_reg_err; /* buffer registration err */ u8 is_vm; /* virtually contiguous */ }; struct { /* SMC-D */ unsigned short sba_idx; /* SBA index number */ u64 token; /* DMB token number */ dma_addr_t dma_addr; /* DMA address */ }; }; }; struct smc_rtoken { /* address/key of remote RMB */ u64 dma_addr; u32 rkey; }; #define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ #define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */ /* theoretically, the RFC states that largest size would be 512K, * i.e. compressed 5 and thus 6 sizes (0..5), despite * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15) */ struct smcd_dev; enum smc_lgr_type { /* redundancy state of lgr */ SMC_LGR_NONE, /* no active links, lgr to be deleted */ SMC_LGR_SINGLE, /* 1 active RNIC on each peer */ SMC_LGR_SYMMETRIC, /* 2 active RNICs on each peer */ SMC_LGR_ASYMMETRIC_PEER, /* local has 2, peer 1 active RNICs */ SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */ }; enum smcr_buf_type { /* types of SMC-R sndbufs and RMBs */ SMCR_PHYS_CONT_BUFS = 0, SMCR_VIRT_CONT_BUFS = 1, SMCR_MIXED_BUFS = 2, }; enum smc_llc_flowtype { SMC_LLC_FLOW_NONE = 0, SMC_LLC_FLOW_ADD_LINK = 2, SMC_LLC_FLOW_DEL_LINK = 4, SMC_LLC_FLOW_REQ_ADD_LINK = 5, SMC_LLC_FLOW_RKEY = 6, }; struct smc_llc_qentry; struct smc_llc_flow { enum smc_llc_flowtype type; struct smc_llc_qentry *qentry; }; struct smc_link_group { struct list_head list; struct rb_root conns_all; /* connection tree */ rwlock_t conns_lock; /* protects conns_all */ unsigned int conns_num; /* current # of connections */ unsigned short vlan_id; /* vlan id of link group */ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ struct rw_semaphore sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ struct rw_semaphore rmbs_lock; /* protects rx buffers */ u64 alloc_sndbufs; /* stats of tx buffers */ u64 alloc_rmbs; /* stats of rx buffers */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ struct work_struct terminate_work; /* abnormal lgr termination */ struct workqueue_struct *tx_wq; /* wq for conn. tx workers */ u8 sync_err : 1; /* lgr no longer fits to peer */ u8 terminating : 1;/* lgr is terminating */ u8 freeing : 1; /* lgr is being freed */ refcount_t refcnt; /* lgr reference count */ bool is_smcd; /* SMC-R or SMC-D */ u8 smc_version; u8 negotiated_eid[SMC_MAX_EID_LEN]; u8 peer_os; /* peer operating system */ u8 peer_smc_release; u8 peer_hostname[SMC_MAX_HOSTNAME_LEN]; union { struct { /* SMC-R */ enum smc_lgr_role role; /* client or server */ struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */ struct smc_wr_v2_buf *wr_rx_buf_v2; /* WR v2 recv payload buffer */ struct smc_wr_v2_buf *wr_tx_buf_v2; /* WR v2 send payload buffer */ char peer_systemid[SMC_SYSTEMID_LEN]; /* unique system_id of peer */ struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] [SMC_LINKS_PER_LGR_MAX]; /* remote addr/key pairs */ DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); /* used rtoken elements */ u8 next_link_id; enum smc_lgr_type type; enum smcr_buf_type buf_type; /* redundancy state */ u8 pnet_id[SMC_MAX_PNETID_LEN + 1]; /* pnet id of this lgr */ struct list_head llc_event_q; /* queue for llc events */ spinlock_t llc_event_q_lock; /* protects llc_event_q */ struct rw_semaphore llc_conf_mutex; /* protects lgr reconfig. */ struct work_struct llc_add_link_work; struct work_struct llc_del_link_work; struct work_struct llc_event_work; /* llc event worker */ wait_queue_head_t llc_flow_waiter; /* w4 next llc event */ wait_queue_head_t llc_msg_waiter; /* w4 next llc msg */ struct smc_llc_flow llc_flow_lcl; /* llc local control field */ struct smc_llc_flow llc_flow_rmt; /* llc remote control field */ struct smc_llc_qentry *delayed_event; /* arrived when flow active */ spinlock_t llc_flow_lock; /* protects llc flow */ int llc_testlink_time; /* link keep alive time */ u32 llc_termination_rsn; /* rsn code for termination */ u8 nexthop_mac[ETH_ALEN]; u8 uses_gateway; __be32 saddr; /* net namespace */ struct net *net; u8 max_conns; /* max conn can be assigned to lgr */ u8 max_links; /* max links can be added in lgr */ }; struct { /* SMC-D */ struct smcd_gid peer_gid; /* Peer GID (remote) */ struct smcd_dev *smcd; /* ISM device for VLAN reg. */ u8 peer_shutdown : 1; /* peer triggered shutdownn */ }; }; }; struct smc_clc_msg_local; #define GID_LIST_SIZE 2 struct smc_gidlist { u8 len; u8 list[GID_LIST_SIZE][SMC_GID_SIZE]; }; struct smc_init_info_smcrv2 { /* Input fields */ __be32 saddr; struct sock *clc_sk; __be32 daddr; /* Output fields when saddr is set */ struct smc_ib_device *ib_dev_v2; u8 ib_port_v2; u8 ib_gid_v2[SMC_GID_SIZE]; /* Additional output fields when clc_sk and daddr is set as well */ u8 uses_gateway; u8 nexthop_mac[ETH_ALEN]; struct smc_gidlist gidlist; }; #define SMC_MAX_V2_ISM_DEVS SMCD_CLC_MAX_V2_GID_ENTRIES /* max # of proposed non-native ISM devices, * which can't exceed the max # of CHID-GID * entries in CLC proposal SMC-Dv2 extension. */ struct smc_init_info { u8 is_smcd; u8 smc_type_v1; u8 smc_type_v2; u8 release_nr; u8 max_conns; u8 max_links; u8 first_contact_peer; u8 first_contact_local; u16 feature_mask; unsigned short vlan_id; u32 rc; u8 negotiated_eid[SMC_MAX_EID_LEN]; /* SMC-R */ u8 smcr_version; u8 check_smcrv2; u8 peer_gid[SMC_GID_SIZE]; u8 peer_mac[ETH_ALEN]; u8 peer_systemid[SMC_SYSTEMID_LEN]; struct smc_ib_device *ib_dev; u8 ib_gid[SMC_GID_SIZE]; u8 ib_port; u32 ib_clcqpn; struct smc_init_info_smcrv2 smcrv2; /* SMC-D */ struct smcd_gid ism_peer_gid[SMC_MAX_V2_ISM_DEVS + 1]; struct smcd_dev *ism_dev[SMC_MAX_V2_ISM_DEVS + 1]; u16 ism_chid[SMC_MAX_V2_ISM_DEVS + 1]; u8 ism_offered_cnt; /* # of ISM devices offered */ u8 ism_selected; /* index of selected ISM dev*/ u8 smcd_version; }; /* Find the connection associated with the given alert token in the link group. * To use rbtrees we have to implement our own search core. * Requires @conns_lock * @token alert token to search for * @lgr link group to search in * Returns connection associated with token if found, NULL otherwise. */ static inline struct smc_connection *smc_lgr_find_conn( u32 token, struct smc_link_group *lgr) { struct smc_connection *res = NULL; struct rb_node *node; node = lgr->conns_all.rb_node; while (node) { struct smc_connection *cur = rb_entry(node, struct smc_connection, alert_node); if (cur->alert_token_local > token) { node = node->rb_left; } else { if (cur->alert_token_local < token) { node = node->rb_right; } else { res = cur; break; } } } return res; } static inline bool smc_conn_lgr_valid(struct smc_connection *conn) { return conn->lgr && conn->alert_token_local; } /* * Returns true if the specified link is usable. * * usable means the link is ready to receive RDMA messages, map memory * on the link, etc. This doesn't ensure we are able to send RDMA messages * on this link, if sending RDMA messages is needed, use smc_link_sendable() */ static inline bool smc_link_usable(struct smc_link *lnk) { if (lnk->state == SMC_LNK_UNUSED || lnk->state == SMC_LNK_INACTIVE) return false; return true; } /* * Returns true if the specified link is ready to receive AND send RDMA * messages. * * For the client side in first contact, the underlying QP may still in * RESET or RTR when the link state is ACTIVATING, checks in smc_link_usable() * is not strong enough. For those places that need to send any CDC or LLC * messages, use smc_link_sendable(), otherwise, use smc_link_usable() instead */ static inline bool smc_link_sendable(struct smc_link *lnk) { return smc_link_usable(lnk) && lnk->qp_attr.cur_qp_state == IB_QPS_RTS; } static inline bool smc_link_active(struct smc_link *lnk) { return lnk->state == SMC_LNK_ACTIVE; } static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) { sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", be16_to_cpu(((__be16 *)gid_raw)[0]), be16_to_cpu(((__be16 *)gid_raw)[1]), be16_to_cpu(((__be16 *)gid_raw)[2]), be16_to_cpu(((__be16 *)gid_raw)[3]), be16_to_cpu(((__be16 *)gid_raw)[4]), be16_to_cpu(((__be16 *)gid_raw)[5]), be16_to_cpu(((__be16 *)gid_raw)[6]), be16_to_cpu(((__be16 *)gid_raw)[7])); } struct smc_pci_dev { __u32 pci_fid; __u16 pci_pchid; __u16 pci_vendor; __u16 pci_device; __u8 pci_id[SMC_PCI_ID_STR_LEN]; }; static inline void smc_set_pci_values(struct pci_dev *pci_dev, struct smc_pci_dev *smc_dev) { smc_dev->pci_vendor = pci_dev->vendor; smc_dev->pci_device = pci_dev->device; snprintf(smc_dev->pci_id, sizeof(smc_dev->pci_id), "%s", pci_name(pci_dev)); #if IS_ENABLED(CONFIG_S390) { /* Set s390 specific PCI information */ struct zpci_dev *zdev; zdev = to_zpci(pci_dev); smc_dev->pci_fid = zdev->fid; smc_dev->pci_pchid = zdev->pchid; } #endif } struct smc_sock; struct smc_clc_msg_accept_confirm; void smc_lgr_cleanup_early(struct smc_link_group *lgr); void smc_lgr_terminate_sched(struct smc_link_group *lgr); void smc_lgr_hold(struct smc_link_group *lgr); void smc_lgr_put(struct smc_link_group *lgr); void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, struct smcd_gid *peer_gid, unsigned short vlan); void smc_smcd_terminate_all(struct smcd_dev *dev); void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smcd_buf_attach(struct smc_sock *smc); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, struct smc_clc_msg_accept_confirm *clc); int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey); int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey); void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey); void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, __be64 nw_vaddr, __be32 nw_rkey); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini); void smc_conn_free(struct smc_connection *conn); int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini); int smc_core_init(void); void smc_core_exit(void); int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini); void smcr_link_clear(struct smc_link *lnk, bool log); void smcr_link_hold(struct smc_link *lnk); void smcr_link_put(struct smc_link *lnk); void smc_switch_link_and_count(struct smc_connection *conn, struct smc_link *to_lnk); int smcr_buf_map_lgr(struct smc_link *lnk); int smcr_buf_reg_lgr(struct smc_link *lnk); void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); void smcr_lgr_set_type_asym(struct smc_link_group *lgr, enum smc_lgr_type new_type, int asym_lnk_idx); int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *rmb_desc); struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err); void smcr_link_down_cond(struct smc_link *lnk); void smcr_link_down_cond_sched(struct smc_link *lnk); int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb); int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb); int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { return link->lgr; } #endif
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2017 Netronome Systems, Inc. * Copyright (C) 2019 Mellanox Technologies. All rights reserved */ #include <linux/completion.h> #include <linux/device.h> #include <linux/idr.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/refcount.h> #include <linux/slab.h> #include <linux/sysfs.h> #include "netdevsim.h" static DEFINE_IDA(nsim_bus_dev_ids); static LIST_HEAD(nsim_bus_dev_list); static DEFINE_MUTEX(nsim_bus_dev_list_lock); static bool nsim_bus_enable; static refcount_t nsim_bus_devs; /* Including the bus itself. */ static DECLARE_COMPLETION(nsim_bus_devs_released); static struct nsim_bus_dev *to_nsim_bus_dev(struct device *dev) { return container_of(dev, struct nsim_bus_dev, dev); } static ssize_t nsim_bus_dev_numvfs_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); unsigned int num_vfs; int ret; ret = kstrtouint(buf, 0, &num_vfs); if (ret) return ret; device_lock(dev); ret = -ENOENT; if (dev_get_drvdata(dev)) ret = nsim_drv_configure_vfs(nsim_bus_dev, num_vfs); device_unlock(dev); return ret ? ret : count; } static ssize_t nsim_bus_dev_numvfs_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); return sprintf(buf, "%u\n", nsim_bus_dev->num_vfs); } static struct device_attribute nsim_bus_dev_numvfs_attr = __ATTR(sriov_numvfs, 0664, nsim_bus_dev_numvfs_show, nsim_bus_dev_numvfs_store); static ssize_t new_port_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); unsigned int port_index; int ret; /* Prevent to use nsim_bus_dev before initialization. */ if (!smp_load_acquire(&nsim_bus_dev->init)) return -EBUSY; ret = kstrtouint(buf, 0, &port_index); if (ret) return ret; ret = nsim_drv_port_add(nsim_bus_dev, NSIM_DEV_PORT_TYPE_PF, port_index); return ret ? ret : count; } static struct device_attribute nsim_bus_dev_new_port_attr = __ATTR_WO(new_port); static ssize_t del_port_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); unsigned int port_index; int ret; /* Prevent to use nsim_bus_dev before initialization. */ if (!smp_load_acquire(&nsim_bus_dev->init)) return -EBUSY; ret = kstrtouint(buf, 0, &port_index); if (ret) return ret; ret = nsim_drv_port_del(nsim_bus_dev, NSIM_DEV_PORT_TYPE_PF, port_index); return ret ? ret : count; } static struct device_attribute nsim_bus_dev_del_port_attr = __ATTR_WO(del_port); static struct attribute *nsim_bus_dev_attrs[] = { &nsim_bus_dev_numvfs_attr.attr, &nsim_bus_dev_new_port_attr.attr, &nsim_bus_dev_del_port_attr.attr, NULL, }; static const struct attribute_group nsim_bus_dev_attr_group = { .attrs = nsim_bus_dev_attrs, }; static const struct attribute_group *nsim_bus_dev_attr_groups[] = { &nsim_bus_dev_attr_group, NULL, }; static void nsim_bus_dev_release(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev; nsim_bus_dev = container_of(dev, struct nsim_bus_dev, dev); kfree(nsim_bus_dev); if (refcount_dec_and_test(&nsim_bus_devs)) complete(&nsim_bus_devs_released); } static const struct device_type nsim_bus_dev_type = { .groups = nsim_bus_dev_attr_groups, .release = nsim_bus_dev_release, }; static struct nsim_bus_dev * nsim_bus_dev_new(unsigned int id, unsigned int port_count, unsigned int num_queues); static ssize_t new_device_store(const struct bus_type *bus, const char *buf, size_t count) { unsigned int id, port_count, num_queues; struct nsim_bus_dev *nsim_bus_dev; int err; err = sscanf(buf, "%u %u %u", &id, &port_count, &num_queues); switch (err) { case 1: port_count = 1; fallthrough; case 2: num_queues = 1; fallthrough; case 3: if (id > INT_MAX) { pr_err("Value of \"id\" is too big.\n"); return -EINVAL; } break; default: pr_err("Format for adding new device is \"id port_count num_queues\" (uint uint unit).\n"); return -EINVAL; } mutex_lock(&nsim_bus_dev_list_lock); /* Prevent to use resource before initialization. */ if (!smp_load_acquire(&nsim_bus_enable)) { err = -EBUSY; goto err; } nsim_bus_dev = nsim_bus_dev_new(id, port_count, num_queues); if (IS_ERR(nsim_bus_dev)) { err = PTR_ERR(nsim_bus_dev); goto err; } refcount_inc(&nsim_bus_devs); /* Allow using nsim_bus_dev */ smp_store_release(&nsim_bus_dev->init, true); list_add_tail(&nsim_bus_dev->list, &nsim_bus_dev_list); mutex_unlock(&nsim_bus_dev_list_lock); return count; err: mutex_unlock(&nsim_bus_dev_list_lock); return err; } static BUS_ATTR_WO(new_device); static void nsim_bus_dev_del(struct nsim_bus_dev *nsim_bus_dev); static ssize_t del_device_store(const struct bus_type *bus, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev, *tmp; unsigned int id; int err; err = sscanf(buf, "%u", &id); switch (err) { case 1: if (id > INT_MAX) { pr_err("Value of \"id\" is too big.\n"); return -EINVAL; } break; default: pr_err("Format for deleting device is \"id\" (uint).\n"); return -EINVAL; } err = -ENOENT; mutex_lock(&nsim_bus_dev_list_lock); /* Prevent to use resource before initialization. */ if (!smp_load_acquire(&nsim_bus_enable)) { mutex_unlock(&nsim_bus_dev_list_lock); return -EBUSY; } list_for_each_entry_safe(nsim_bus_dev, tmp, &nsim_bus_dev_list, list) { if (nsim_bus_dev->dev.id != id) continue; list_del(&nsim_bus_dev->list); nsim_bus_dev_del(nsim_bus_dev); err = 0; break; } mutex_unlock(&nsim_bus_dev_list_lock); return !err ? count : err; } static BUS_ATTR_WO(del_device); static ssize_t link_device_store(const struct bus_type *bus, const char *buf, size_t count) { struct netdevsim *nsim_a, *nsim_b, *peer; struct net_device *dev_a, *dev_b; unsigned int ifidx_a, ifidx_b; int netnsfd_a, netnsfd_b, err; struct net *ns_a, *ns_b; err = sscanf(buf, "%d:%u %d:%u", &netnsfd_a, &ifidx_a, &netnsfd_b, &ifidx_b); if (err != 4) { pr_err("Format for linking two devices is \"netnsfd_a:ifidx_a netnsfd_b:ifidx_b\" (int uint int uint).\n"); return -EINVAL; } ns_a = get_net_ns_by_fd(netnsfd_a); if (IS_ERR(ns_a)) { pr_err("Could not find netns with fd: %d\n", netnsfd_a); return -EINVAL; } ns_b = get_net_ns_by_fd(netnsfd_b); if (IS_ERR(ns_b)) { pr_err("Could not find netns with fd: %d\n", netnsfd_b); put_net(ns_a); return -EINVAL; } err = -EINVAL; rtnl_lock(); dev_a = __dev_get_by_index(ns_a, ifidx_a); if (!dev_a) { pr_err("Could not find device with ifindex %u in netnsfd %d\n", ifidx_a, netnsfd_a); goto out_err; } if (!netdev_is_nsim(dev_a)) { pr_err("Device with ifindex %u in netnsfd %d is not a netdevsim\n", ifidx_a, netnsfd_a); goto out_err; } dev_b = __dev_get_by_index(ns_b, ifidx_b); if (!dev_b) { pr_err("Could not find device with ifindex %u in netnsfd %d\n", ifidx_b, netnsfd_b); goto out_err; } if (!netdev_is_nsim(dev_b)) { pr_err("Device with ifindex %u in netnsfd %d is not a netdevsim\n", ifidx_b, netnsfd_b); goto out_err; } if (dev_a == dev_b) { pr_err("Cannot link a netdevsim to itself\n"); goto out_err; } err = -EBUSY; nsim_a = netdev_priv(dev_a); peer = rtnl_dereference(nsim_a->peer); if (peer) { pr_err("Netdevsim %d:%u is already linked\n", netnsfd_a, ifidx_a); goto out_err; } nsim_b = netdev_priv(dev_b); peer = rtnl_dereference(nsim_b->peer); if (peer) { pr_err("Netdevsim %d:%u is already linked\n", netnsfd_b, ifidx_b); goto out_err; } err = 0; rcu_assign_pointer(nsim_a->peer, nsim_b); rcu_assign_pointer(nsim_b->peer, nsim_a); out_err: put_net(ns_b); put_net(ns_a); rtnl_unlock(); return !err ? count : err; } static BUS_ATTR_WO(link_device); static ssize_t unlink_device_store(const struct bus_type *bus, const char *buf, size_t count) { struct netdevsim *nsim, *peer; struct net_device *dev; unsigned int ifidx; int netnsfd, err; struct net *ns; err = sscanf(buf, "%u:%u", &netnsfd, &ifidx); if (err != 2) { pr_err("Format for unlinking a device is \"netnsfd:ifidx\" (int uint).\n"); return -EINVAL; } ns = get_net_ns_by_fd(netnsfd); if (IS_ERR(ns)) { pr_err("Could not find netns with fd: %d\n", netnsfd); return -EINVAL; } err = -EINVAL; rtnl_lock(); dev = __dev_get_by_index(ns, ifidx); if (!dev) { pr_err("Could not find device with ifindex %u in netnsfd %d\n", ifidx, netnsfd); goto out_put_netns; } if (!netdev_is_nsim(dev)) { pr_err("Device with ifindex %u in netnsfd %d is not a netdevsim\n", ifidx, netnsfd); goto out_put_netns; } nsim = netdev_priv(dev); peer = rtnl_dereference(nsim->peer); if (!peer) goto out_put_netns; err = 0; RCU_INIT_POINTER(nsim->peer, NULL); RCU_INIT_POINTER(peer->peer, NULL); out_put_netns: put_net(ns); rtnl_unlock(); return !err ? count : err; } static BUS_ATTR_WO(unlink_device); static struct attribute *nsim_bus_attrs[] = { &bus_attr_new_device.attr, &bus_attr_del_device.attr, &bus_attr_link_device.attr, &bus_attr_unlink_device.attr, NULL }; ATTRIBUTE_GROUPS(nsim_bus); static int nsim_bus_probe(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); return nsim_drv_probe(nsim_bus_dev); } static void nsim_bus_remove(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); nsim_drv_remove(nsim_bus_dev); } static int nsim_num_vf(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); return nsim_bus_dev->num_vfs; } static const struct bus_type nsim_bus = { .name = DRV_NAME, .dev_name = DRV_NAME, .bus_groups = nsim_bus_groups, .probe = nsim_bus_probe, .remove = nsim_bus_remove, .num_vf = nsim_num_vf, }; #define NSIM_BUS_DEV_MAX_VFS 4 static struct nsim_bus_dev * nsim_bus_dev_new(unsigned int id, unsigned int port_count, unsigned int num_queues) { struct nsim_bus_dev *nsim_bus_dev; int err; nsim_bus_dev = kzalloc(sizeof(*nsim_bus_dev), GFP_KERNEL); if (!nsim_bus_dev) return ERR_PTR(-ENOMEM); err = ida_alloc_range(&nsim_bus_dev_ids, id, id, GFP_KERNEL); if (err < 0) goto err_nsim_bus_dev_free; nsim_bus_dev->dev.id = err; nsim_bus_dev->dev.bus = &nsim_bus; nsim_bus_dev->dev.type = &nsim_bus_dev_type; nsim_bus_dev->port_count = port_count; nsim_bus_dev->num_queues = num_queues; nsim_bus_dev->initial_net = current->nsproxy->net_ns; nsim_bus_dev->max_vfs = NSIM_BUS_DEV_MAX_VFS; /* Disallow using nsim_bus_dev */ smp_store_release(&nsim_bus_dev->init, false); err = device_register(&nsim_bus_dev->dev); if (err) goto err_nsim_bus_dev_id_free; return nsim_bus_dev; err_nsim_bus_dev_id_free: ida_free(&nsim_bus_dev_ids, nsim_bus_dev->dev.id); put_device(&nsim_bus_dev->dev); nsim_bus_dev = NULL; err_nsim_bus_dev_free: kfree(nsim_bus_dev); return ERR_PTR(err); } static void nsim_bus_dev_del(struct nsim_bus_dev *nsim_bus_dev) { /* Disallow using nsim_bus_dev */ smp_store_release(&nsim_bus_dev->init, false); ida_free(&nsim_bus_dev_ids, nsim_bus_dev->dev.id); device_unregister(&nsim_bus_dev->dev); } static struct device_driver nsim_driver = { .name = DRV_NAME, .bus = &nsim_bus, .owner = THIS_MODULE, }; int nsim_bus_init(void) { int err; err = bus_register(&nsim_bus); if (err) return err; err = driver_register(&nsim_driver); if (err) goto err_bus_unregister; refcount_set(&nsim_bus_devs, 1); /* Allow using resources */ smp_store_release(&nsim_bus_enable, true); return 0; err_bus_unregister: bus_unregister(&nsim_bus); return err; } void nsim_bus_exit(void) { struct nsim_bus_dev *nsim_bus_dev, *tmp; /* Disallow using resources */ smp_store_release(&nsim_bus_enable, false); if (refcount_dec_and_test(&nsim_bus_devs)) complete(&nsim_bus_devs_released); mutex_lock(&nsim_bus_dev_list_lock); list_for_each_entry_safe(nsim_bus_dev, tmp, &nsim_bus_dev_list, list) { list_del(&nsim_bus_dev->list); nsim_bus_dev_del(nsim_bus_dev); } mutex_unlock(&nsim_bus_dev_list_lock); wait_for_completion(&nsim_bus_devs_released); driver_unregister(&nsim_driver); bus_unregister(&nsim_bus); }
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 // SPDX-License-Identifier: GPL-2.0-only /* iptables module to match on related connections */ /* * (C) 2001 Martin Josefsson <gandalf@wlug.westbo.se> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_helper.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_helper.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>"); MODULE_DESCRIPTION("Xtables: Related connection matching"); MODULE_ALIAS("ipt_helper"); MODULE_ALIAS("ip6t_helper"); static bool helper_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_helper_info *info = par->matchinfo; const struct nf_conn *ct; const struct nf_conn_help *master_help; const struct nf_conntrack_helper *helper; enum ip_conntrack_info ctinfo; bool ret = info->invert; ct = nf_ct_get(skb, &ctinfo); if (!ct || !ct->master) return ret; master_help = nfct_help(ct->master); if (!master_help) return ret; /* rcu_read_lock()ed by nf_hook_thresh */ helper = rcu_dereference(master_help->helper); if (!helper) return ret; if (info->name[0] == '\0') ret = !ret; else ret ^= !strncmp(helper->name, info->name, strlen(helper->name)); return ret; } static int helper_mt_check(const struct xt_mtchk_param *par) { struct xt_helper_info *info = par->matchinfo; int ret; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); return ret; } info->name[sizeof(info->name) - 1] = '\0'; return 0; } static void helper_mt_destroy(const struct xt_mtdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static struct xt_match helper_mt_reg __read_mostly = { .name = "helper", .revision = 0, .family = NFPROTO_UNSPEC, .checkentry = helper_mt_check, .match = helper_mt, .destroy = helper_mt_destroy, .matchsize = sizeof(struct xt_helper_info), .me = THIS_MODULE, }; static int __init helper_mt_init(void) { return xt_register_match(&helper_mt_reg); } static void __exit helper_mt_exit(void) { xt_unregister_match(&helper_mt_reg); } module_init(helper_mt_init); module_exit(helper_mt_exit);
38 8 14 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright 2006, Johannes Berg <johannes@sipsolutions.net> */ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/leds.h> #include "ieee80211_i.h" #define MAC80211_BLINK_DELAY 50 /* ms */ static inline void ieee80211_led_rx(struct ieee80211_local *local) { #ifdef CONFIG_MAC80211_LEDS if (!atomic_read(&local->rx_led_active)) return; led_trigger_blink_oneshot(&local->rx_led, MAC80211_BLINK_DELAY, MAC80211_BLINK_DELAY, 0); #endif } static inline void ieee80211_led_tx(struct ieee80211_local *local) { #ifdef CONFIG_MAC80211_LEDS if (!atomic_read(&local->tx_led_active)) return; led_trigger_blink_oneshot(&local->tx_led, MAC80211_BLINK_DELAY, MAC80211_BLINK_DELAY, 0); #endif } #ifdef CONFIG_MAC80211_LEDS void ieee80211_led_assoc(struct ieee80211_local *local, bool associated); void ieee80211_led_radio(struct ieee80211_local *local, bool enabled); void ieee80211_alloc_led_names(struct ieee80211_local *local); void ieee80211_free_led_names(struct ieee80211_local *local); void ieee80211_led_init(struct ieee80211_local *local); void ieee80211_led_exit(struct ieee80211_local *local); void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, unsigned int types_on, unsigned int types_off); #else static inline void ieee80211_led_assoc(struct ieee80211_local *local, bool associated) { } static inline void ieee80211_led_radio(struct ieee80211_local *local, bool enabled) { } static inline void ieee80211_alloc_led_names(struct ieee80211_local *local) { } static inline void ieee80211_free_led_names(struct ieee80211_local *local) { } static inline void ieee80211_led_init(struct ieee80211_local *local) { } static inline void ieee80211_led_exit(struct ieee80211_local *local) { } static inline void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, unsigned int types_on, unsigned int types_off) { } #endif static inline void ieee80211_tpt_led_trig_tx(struct ieee80211_local *local, int bytes) { #ifdef CONFIG_MAC80211_LEDS if (atomic_read(&local->tpt_led_active)) local->tpt_led_trigger->tx_bytes += bytes; #endif } static inline void ieee80211_tpt_led_trig_rx(struct ieee80211_local *local, int bytes) { #ifdef CONFIG_MAC80211_LEDS if (atomic_read(&local->tpt_led_active)) local->tpt_led_trigger->rx_bytes += bytes; #endif }
123 43 43 128 128 2 40 2 135 19 19 19 15 134 128 149 127 127 126 3 125 2558 2545 149 254 62 17 2554 2536 131 19 2553 20 2566 148 148 255 2569 129 41 19 132 129 3 130 19 132 133 132 132 128 126 1 1 1 1 1 1 127 127 125 127 9 19 127 3 3 4 126 129 126 127 127 125 18 19 122 3 128 45 2547 143 2569 2545 127 124 3 142 2467 328 133 133 133 132 128 128 127 18 124 129 125 3 127 122 122 3 129 126 1 6 6 3 3 3 3 3 143 18 29 128 125 6 19 19 3 19 19 19 18 123 124 122 122 19 19 3 19 19 19 2 2 15 18 2 2 15 15 15 15 3 15 15 15 19 19 16 19 4 19 19 19 19 19 19 19 19 17 19 24 129 19 19 19 19 19 19 19 19 19 17 17 17 18 19 18 19 15 7 7 7 19 8 19 11 7 4 10 1 11 6 5 11 4 15 3 13 3 19 3 16 17 15 147 144 28 147 147 1 1 1 1 9 7 6 9 9 9 6 5 9 19 3 17 19 1 1 9 1 1 1 1 19 16 3 19 19 19 19 19 19 3 19 19 19 19 19 18 18 19 19 19 19 19 19 19 19 19 5 6 9 6 5 9 2 9 9 9 9 2531 2554 2537 2418 2034 2032 2554 2551 2537 2535 9 9 9 1 1 9 9 9 9 9 9 9 9 9 9 9 9 9 9 5 5 9 9 6 15 1 16 17 17 3 15 15 2 17 17 17 4 12 17 17 17 17 17 17 17 16 16 16 12 2 6 8 2 6 8 6 2 14 17 14 2 6 19 19 6 17 40 41 1 41 40 107 7 126 6 3 147 148 148 147 148 33 7 7 7 7 1 1 1 1 99 1 2 2 2 2 2 2 7 7 7 6 1 7 7 126 105 71 18 123 19 127 126 94 1 125 2 12 125 127 16 2 2 2 2 32 1 4 1 34 3 68 129 26 25 26 144 19 126 60 27 26 2 128 7 126 6 146 3 112 41 126 17 6 128 149 128 148 146 144 36 147 147 143 18 144 25 125 146 142 144 129 45 1 19 19 20 20 20 20 19 131 129 13 13 13 13 2 13 11 128 235 235 15 230 231 237 13 234 238 235 14 130 131 130 3 131 132 130 132 130 129 131 132 131 693 692 695 131 130 7 692 134 694 131 696 699 695 128 131 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 19 19 19 19 19 19 3 19 19 3 290 99 398 97 96 19 3 20 20 1 19 19 19 19 19 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 40 37 3 3 40 40 123 125 127 123 125 6 125 124 121 129 129 128 122 124 122 147 146 147 23 23 127 125 47 125 126 39 10 9 235 236 6 6 236 236 42 42 130 132 99 132 131 1 99 130 132 1 1 1 1 143 2 2 2 2 2 20 20 1 128 126 96 40 2316 2318 2317 2305 3 572 575 2311 3 2314 131 131 130 130 131 132 131 131 131 130 131 130 130 128 128 129 128 132 127 130 130 129 132 130 130 131 131 132 130 128 131 132 129 132 132 130 128 130 132 132 132 130 129 130 129 132 128 132 132 131 128 128 131 132 130 132 130 131 132 132 128 131 132 130 132 132 132 132 132 132 132 132 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 // SPDX-License-Identifier: GPL-2.0+ /* * Maple Tree implementation * Copyright (c) 2018-2022 Oracle Corporation * Authors: Liam R. Howlett <Liam.Howlett@oracle.com> * Matthew Wilcox <willy@infradead.org> * Copyright (c) 2023 ByteDance * Author: Peng Zhang <zhangpeng.00@bytedance.com> */ /* * DOC: Interesting implementation details of the Maple Tree * * Each node type has a number of slots for entries and a number of slots for * pivots. In the case of dense nodes, the pivots are implied by the position * and are simply the slot index + the minimum of the node. * * In regular B-Tree terms, pivots are called keys. The term pivot is used to * indicate that the tree is specifying ranges. Pivots may appear in the * subtree with an entry attached to the value whereas keys are unique to a * specific position of a B-tree. Pivot values are inclusive of the slot with * the same index. * * * The following illustrates the layout of a range64 nodes slots and pivots. * * * Slots -> | 0 | 1 | 2 | ... | 12 | 13 | 14 | 15 | * ┬ ┬ ┬ ┬ ┬ ┬ ┬ ┬ ┬ * │ │ │ │ │ │ │ │ └─ Implied maximum * │ │ │ │ │ │ │ └─ Pivot 14 * │ │ │ │ │ │ └─ Pivot 13 * │ │ │ │ │ └─ Pivot 12 * │ │ │ │ └─ Pivot 11 * │ │ │ └─ Pivot 2 * │ │ └─ Pivot 1 * │ └─ Pivot 0 * └─ Implied minimum * * Slot contents: * Internal (non-leaf) nodes contain pointers to other nodes. * Leaf nodes contain entries. * * The location of interest is often referred to as an offset. All offsets have * a slot, but the last offset has an implied pivot from the node above (or * UINT_MAX for the root node. * * Ranges complicate certain write activities. When modifying any of * the B-tree variants, it is known that one entry will either be added or * deleted. When modifying the Maple Tree, one store operation may overwrite * the entire data set, or one half of the tree, or the middle half of the tree. * */ #include <linux/maple_tree.h> #include <linux/xarray.h> #include <linux/types.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/limits.h> #include <asm/barrier.h> #define CREATE_TRACE_POINTS #include <trace/events/maple_tree.h> /* * Kernel pointer hashing renders much of the maple tree dump useless as tagged * pointers get hashed to arbitrary values. * * If CONFIG_DEBUG_VM_MAPLE_TREE is set we are in a debug mode where it is * permissible to bypass this. Otherwise remain cautious and retain the hashing. * * Userland doesn't know about %px so also use %p there. */ #if defined(__KERNEL__) && defined(CONFIG_DEBUG_VM_MAPLE_TREE) #define PTR_FMT "%px" #else #define PTR_FMT "%p" #endif #define MA_ROOT_PARENT 1 /* * Maple state flags * * MA_STATE_BULK - Bulk insert mode * * MA_STATE_REBALANCE - Indicate a rebalance during bulk insert * * MA_STATE_PREALLOC - Preallocated nodes, WARN_ON allocation */ #define MA_STATE_BULK 1 #define MA_STATE_REBALANCE 2 #define MA_STATE_PREALLOC 4 #define ma_parent_ptr(x) ((struct maple_pnode *)(x)) #define mas_tree_parent(x) ((unsigned long)(x->tree) | MA_ROOT_PARENT) #define ma_mnode_ptr(x) ((struct maple_node *)(x)) #define ma_enode_ptr(x) ((struct maple_enode *)(x)) static struct kmem_cache *maple_node_cache; #ifdef CONFIG_DEBUG_MAPLE_TREE static const unsigned long mt_max[] = { [maple_dense] = MAPLE_NODE_SLOTS, [maple_leaf_64] = ULONG_MAX, [maple_range_64] = ULONG_MAX, [maple_arange_64] = ULONG_MAX, }; #define mt_node_max(x) mt_max[mte_node_type(x)] #endif static const unsigned char mt_slots[] = { [maple_dense] = MAPLE_NODE_SLOTS, [maple_leaf_64] = MAPLE_RANGE64_SLOTS, [maple_range_64] = MAPLE_RANGE64_SLOTS, [maple_arange_64] = MAPLE_ARANGE64_SLOTS, }; #define mt_slot_count(x) mt_slots[mte_node_type(x)] static const unsigned char mt_pivots[] = { [maple_dense] = 0, [maple_leaf_64] = MAPLE_RANGE64_SLOTS - 1, [maple_range_64] = MAPLE_RANGE64_SLOTS - 1, [maple_arange_64] = MAPLE_ARANGE64_SLOTS - 1, }; #define mt_pivot_count(x) mt_pivots[mte_node_type(x)] static const unsigned char mt_min_slots[] = { [maple_dense] = MAPLE_NODE_SLOTS / 2, [maple_leaf_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_range_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_arange_64] = (MAPLE_ARANGE64_SLOTS / 2) - 1, }; #define mt_min_slot_count(x) mt_min_slots[mte_node_type(x)] #define MAPLE_BIG_NODE_SLOTS (MAPLE_RANGE64_SLOTS * 2 + 2) #define MAPLE_BIG_NODE_GAPS (MAPLE_ARANGE64_SLOTS * 2 + 1) struct maple_big_node { unsigned long pivot[MAPLE_BIG_NODE_SLOTS - 1]; union { struct maple_enode *slot[MAPLE_BIG_NODE_SLOTS]; struct { unsigned long padding[MAPLE_BIG_NODE_GAPS]; unsigned long gap[MAPLE_BIG_NODE_GAPS]; }; }; unsigned char b_end; enum maple_type type; }; /* * The maple_subtree_state is used to build a tree to replace a segment of an * existing tree in a more atomic way. Any walkers of the older tree will hit a * dead node and restart on updates. */ struct maple_subtree_state { struct ma_state *orig_l; /* Original left side of subtree */ struct ma_state *orig_r; /* Original right side of subtree */ struct ma_state *l; /* New left side of subtree */ struct ma_state *m; /* New middle of subtree (rare) */ struct ma_state *r; /* New right side of subtree */ struct ma_topiary *free; /* nodes to be freed */ struct ma_topiary *destroy; /* Nodes to be destroyed (walked and freed) */ struct maple_big_node *bn; }; #ifdef CONFIG_KASAN_STACK /* Prevent mas_wr_bnode() from exceeding the stack frame limit */ #define noinline_for_kasan noinline_for_stack #else #define noinline_for_kasan inline #endif /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { return kmem_cache_alloc(maple_node_cache, gfp); } static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) { return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); } static inline void mt_free_one(struct maple_node *node) { kmem_cache_free(maple_node_cache, node); } static inline void mt_free_bulk(size_t size, void __rcu **nodes) { kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); } static void mt_free_rcu(struct rcu_head *head) { struct maple_node *node = container_of(head, struct maple_node, rcu); kmem_cache_free(maple_node_cache, node); } /* * ma_free_rcu() - Use rcu callback to free a maple node * @node: The node to free * * The maple tree uses the parent pointer to indicate this node is no longer in * use and will be freed. */ static void ma_free_rcu(struct maple_node *node) { WARN_ON(node->parent != ma_parent_ptr(node)); call_rcu(&node->rcu, mt_free_rcu); } static void mas_set_height(struct ma_state *mas) { unsigned int new_flags = mas->tree->ma_flags; new_flags &= ~MT_FLAGS_HEIGHT_MASK; MAS_BUG_ON(mas, mas->depth > MAPLE_HEIGHT_MAX); new_flags |= mas->depth << MT_FLAGS_HEIGHT_OFFSET; mas->tree->ma_flags = new_flags; } static unsigned int mas_mt_height(struct ma_state *mas) { return mt_height(mas->tree); } static inline unsigned int mt_attr(struct maple_tree *mt) { return mt->ma_flags & ~MT_FLAGS_HEIGHT_MASK; } static __always_inline enum maple_type mte_node_type( const struct maple_enode *entry) { return ((unsigned long)entry >> MAPLE_NODE_TYPE_SHIFT) & MAPLE_NODE_TYPE_MASK; } static __always_inline bool ma_is_dense(const enum maple_type type) { return type < maple_leaf_64; } static __always_inline bool ma_is_leaf(const enum maple_type type) { return type < maple_range_64; } static __always_inline bool mte_is_leaf(const struct maple_enode *entry) { return ma_is_leaf(mte_node_type(entry)); } /* * We also reserve values with the bottom two bits set to '10' which are * below 4096 */ static __always_inline bool mt_is_reserved(const void *entry) { return ((unsigned long)entry < MAPLE_RESERVED_RANGE) && xa_is_internal(entry); } static __always_inline void mas_set_err(struct ma_state *mas, long err) { mas->node = MA_ERROR(err); mas->status = ma_error; } static __always_inline bool mas_is_ptr(const struct ma_state *mas) { return mas->status == ma_root; } static __always_inline bool mas_is_start(const struct ma_state *mas) { return mas->status == ma_start; } static __always_inline bool mas_is_none(const struct ma_state *mas) { return mas->status == ma_none; } static __always_inline bool mas_is_paused(const struct ma_state *mas) { return mas->status == ma_pause; } static __always_inline bool mas_is_overflow(struct ma_state *mas) { return mas->status == ma_overflow; } static inline bool mas_is_underflow(struct ma_state *mas) { return mas->status == ma_underflow; } static __always_inline struct maple_node *mte_to_node( const struct maple_enode *entry) { return (struct maple_node *)((unsigned long)entry & ~MAPLE_NODE_MASK); } /* * mte_to_mat() - Convert a maple encoded node to a maple topiary node. * @entry: The maple encoded node * * Return: a maple topiary pointer */ static inline struct maple_topiary *mte_to_mat(const struct maple_enode *entry) { return (struct maple_topiary *) ((unsigned long)entry & ~MAPLE_NODE_MASK); } /* * mas_mn() - Get the maple state node. * @mas: The maple state * * Return: the maple node (not encoded - bare pointer). */ static inline struct maple_node *mas_mn(const struct ma_state *mas) { return mte_to_node(mas->node); } /* * mte_set_node_dead() - Set a maple encoded node as dead. * @mn: The maple encoded node. */ static inline void mte_set_node_dead(struct maple_enode *mn) { mte_to_node(mn)->parent = ma_parent_ptr(mte_to_node(mn)); smp_wmb(); /* Needed for RCU */ } /* Bit 1 indicates the root is a node */ #define MAPLE_ROOT_NODE 0x02 /* maple_type stored bit 3-6 */ #define MAPLE_ENODE_TYPE_SHIFT 0x03 /* Bit 2 means a NULL somewhere below */ #define MAPLE_ENODE_NULL 0x04 static inline struct maple_enode *mt_mk_node(const struct maple_node *node, enum maple_type type) { return (void *)((unsigned long)node | (type << MAPLE_ENODE_TYPE_SHIFT) | MAPLE_ENODE_NULL); } static inline void *mte_mk_root(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ROOT_NODE); } static inline void *mte_safe_root(const struct maple_enode *node) { return (void *)((unsigned long)node & ~MAPLE_ROOT_NODE); } static inline void __maybe_unused *mte_set_full(const struct maple_enode *node) { return (void *)((unsigned long)node & ~MAPLE_ENODE_NULL); } static inline void __maybe_unused *mte_clear_full(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ENODE_NULL); } static inline bool __maybe_unused mte_has_null(const struct maple_enode *node) { return (unsigned long)node & MAPLE_ENODE_NULL; } static __always_inline bool ma_is_root(struct maple_node *node) { return ((unsigned long)node->parent & MA_ROOT_PARENT); } static __always_inline bool mte_is_root(const struct maple_enode *node) { return ma_is_root(mte_to_node(node)); } static inline bool mas_is_root_limits(const struct ma_state *mas) { return !mas->min && mas->max == ULONG_MAX; } static __always_inline bool mt_is_alloc(struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_ALLOC_RANGE); } /* * The Parent Pointer * Excluding root, the parent pointer is 256B aligned like all other tree nodes. * When storing a 32 or 64 bit values, the offset can fit into 5 bits. The 16 * bit values need an extra bit to store the offset. This extra bit comes from * a reuse of the last bit in the node type. This is possible by using bit 1 to * indicate if bit 2 is part of the type or the slot. * * Note types: * 0x??1 = Root * 0x?00 = 16 bit nodes * 0x010 = 32 bit nodes * 0x110 = 64 bit nodes * * Slot size and alignment * 0b??1 : Root * 0b?00 : 16 bit values, type in 0-1, slot in 2-7 * 0b010 : 32 bit values, type in 0-2, slot in 3-7 * 0b110 : 64 bit values, type in 0-2, slot in 3-7 */ #define MAPLE_PARENT_ROOT 0x01 #define MAPLE_PARENT_SLOT_SHIFT 0x03 #define MAPLE_PARENT_SLOT_MASK 0xF8 #define MAPLE_PARENT_16B_SLOT_SHIFT 0x02 #define MAPLE_PARENT_16B_SLOT_MASK 0xFC #define MAPLE_PARENT_RANGE64 0x06 #define MAPLE_PARENT_RANGE32 0x04 #define MAPLE_PARENT_NOT_RANGE16 0x02 /* * mte_parent_shift() - Get the parent shift for the slot storage. * @parent: The parent pointer cast as an unsigned long * Return: The shift into that pointer to the star to of the slot */ static inline unsigned long mte_parent_shift(unsigned long parent) { /* Note bit 1 == 0 means 16B */ if (likely(parent & MAPLE_PARENT_NOT_RANGE16)) return MAPLE_PARENT_SLOT_SHIFT; return MAPLE_PARENT_16B_SLOT_SHIFT; } /* * mte_parent_slot_mask() - Get the slot mask for the parent. * @parent: The parent pointer cast as an unsigned long. * Return: The slot mask for that parent. */ static inline unsigned long mte_parent_slot_mask(unsigned long parent) { /* Note bit 1 == 0 means 16B */ if (likely(parent & MAPLE_PARENT_NOT_RANGE16)) return MAPLE_PARENT_SLOT_MASK; return MAPLE_PARENT_16B_SLOT_MASK; } /* * mas_parent_type() - Return the maple_type of the parent from the stored * parent type. * @mas: The maple state * @enode: The maple_enode to extract the parent's enum * Return: The node->parent maple_type */ static inline enum maple_type mas_parent_type(struct ma_state *mas, struct maple_enode *enode) { unsigned long p_type; p_type = (unsigned long)mte_to_node(enode)->parent; if (WARN_ON(p_type & MAPLE_PARENT_ROOT)) return 0; p_type &= MAPLE_NODE_MASK; p_type &= ~mte_parent_slot_mask(p_type); switch (p_type) { case MAPLE_PARENT_RANGE64: /* or MAPLE_PARENT_ARANGE64 */ if (mt_is_alloc(mas->tree)) return maple_arange_64; return maple_range_64; } return 0; } /* * mas_set_parent() - Set the parent node and encode the slot * @mas: The maple state * @enode: The encoded maple node. * @parent: The encoded maple node that is the parent of @enode. * @slot: The slot that @enode resides in @parent. * * Slot number is encoded in the enode->parent bit 3-6 or 2-6, depending on the * parent type. */ static inline void mas_set_parent(struct ma_state *mas, struct maple_enode *enode, const struct maple_enode *parent, unsigned char slot) { unsigned long val = (unsigned long)parent; unsigned long shift; unsigned long type; enum maple_type p_type = mte_node_type(parent); MAS_BUG_ON(mas, p_type == maple_dense); MAS_BUG_ON(mas, p_type == maple_leaf_64); switch (p_type) { case maple_range_64: case maple_arange_64: shift = MAPLE_PARENT_SLOT_SHIFT; type = MAPLE_PARENT_RANGE64; break; default: case maple_dense: case maple_leaf_64: shift = type = 0; break; } val &= ~MAPLE_NODE_MASK; /* Clear all node metadata in parent */ val |= (slot << shift) | type; mte_to_node(enode)->parent = ma_parent_ptr(val); } /* * mte_parent_slot() - get the parent slot of @enode. * @enode: The encoded maple node. * * Return: The slot in the parent node where @enode resides. */ static __always_inline unsigned int mte_parent_slot(const struct maple_enode *enode) { unsigned long val = (unsigned long)mte_to_node(enode)->parent; if (unlikely(val & MA_ROOT_PARENT)) return 0; /* * Okay to use MAPLE_PARENT_16B_SLOT_MASK as the last bit will be lost * by shift if the parent shift is MAPLE_PARENT_SLOT_SHIFT */ return (val & MAPLE_PARENT_16B_SLOT_MASK) >> mte_parent_shift(val); } /* * mte_parent() - Get the parent of @node. * @enode: The encoded maple node. * * Return: The parent maple node. */ static __always_inline struct maple_node *mte_parent(const struct maple_enode *enode) { return (void *)((unsigned long) (mte_to_node(enode)->parent) & ~MAPLE_NODE_MASK); } /* * ma_dead_node() - check if the @enode is dead. * @enode: The encoded maple node * * Return: true if dead, false otherwise. */ static __always_inline bool ma_dead_node(const struct maple_node *node) { struct maple_node *parent; /* Do not reorder reads from the node prior to the parent check */ smp_rmb(); parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK); return (parent == node); } /* * mte_dead_node() - check if the @enode is dead. * @enode: The encoded maple node * * Return: true if dead, false otherwise. */ static __always_inline bool mte_dead_node(const struct maple_enode *enode) { struct maple_node *parent, *node; node = mte_to_node(enode); /* Do not reorder reads from the node prior to the parent check */ smp_rmb(); parent = mte_parent(enode); return (parent == node); } /* * mas_allocated() - Get the number of nodes allocated in a maple state. * @mas: The maple state * * The ma_state alloc member is overloaded to hold a pointer to the first * allocated node or to the number of requested nodes to allocate. If bit 0 is * set, then the alloc contains the number of requested nodes. If there is an * allocated node, then the total allocated nodes is in that node. * * Return: The total number of nodes allocated */ static inline unsigned long mas_allocated(const struct ma_state *mas) { if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) return 0; return mas->alloc->total; } /* * mas_set_alloc_req() - Set the requested number of allocations. * @mas: the maple state * @count: the number of allocations. * * The requested number of allocations is either in the first allocated node, * located in @mas->alloc->request_count, or directly in @mas->alloc if there is * no allocated node. Set the request either in the node or do the necessary * encoding to store in @mas->alloc directly. */ static inline void mas_set_alloc_req(struct ma_state *mas, unsigned long count) { if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) { if (!count) mas->alloc = NULL; else mas->alloc = (struct maple_alloc *)(((count) << 1U) | 1U); return; } mas->alloc->request_count = count; } /* * mas_alloc_req() - get the requested number of allocations. * @mas: The maple state * * The alloc count is either stored directly in @mas, or in * @mas->alloc->request_count if there is at least one node allocated. Decode * the request count if it's stored directly in @mas->alloc. * * Return: The allocation request count. */ static inline unsigned int mas_alloc_req(const struct ma_state *mas) { if ((unsigned long)mas->alloc & 0x1) return (unsigned long)(mas->alloc) >> 1; else if (mas->alloc) return mas->alloc->request_count; return 0; } /* * ma_pivots() - Get a pointer to the maple node pivots. * @node: the maple node * @type: the node type * * In the event of a dead node, this array may be %NULL * * Return: A pointer to the maple node pivots */ static inline unsigned long *ma_pivots(struct maple_node *node, enum maple_type type) { switch (type) { case maple_arange_64: return node->ma64.pivot; case maple_range_64: case maple_leaf_64: return node->mr64.pivot; case maple_dense: return NULL; } return NULL; } /* * ma_gaps() - Get a pointer to the maple node gaps. * @node: the maple node * @type: the node type * * Return: A pointer to the maple node gaps */ static inline unsigned long *ma_gaps(struct maple_node *node, enum maple_type type) { switch (type) { case maple_arange_64: return node->ma64.gap; case maple_range_64: case maple_leaf_64: case maple_dense: return NULL; } return NULL; } /* * mas_safe_pivot() - get the pivot at @piv or mas->max. * @mas: The maple state * @pivots: The pointer to the maple node pivots * @piv: The pivot to fetch * @type: The maple node type * * Return: The pivot at @piv within the limit of the @pivots array, @mas->max * otherwise. */ static __always_inline unsigned long mas_safe_pivot(const struct ma_state *mas, unsigned long *pivots, unsigned char piv, enum maple_type type) { if (piv >= mt_pivots[type]) return mas->max; return pivots[piv]; } /* * mas_safe_min() - Return the minimum for a given offset. * @mas: The maple state * @pivots: The pointer to the maple node pivots * @offset: The offset into the pivot array * * Return: The minimum range value that is contained in @offset. */ static inline unsigned long mas_safe_min(struct ma_state *mas, unsigned long *pivots, unsigned char offset) { if (likely(offset)) return pivots[offset - 1] + 1; return mas->min; } /* * mte_set_pivot() - Set a pivot to a value in an encoded maple node. * @mn: The encoded maple node * @piv: The pivot offset * @val: The value of the pivot */ static inline void mte_set_pivot(struct maple_enode *mn, unsigned char piv, unsigned long val) { struct maple_node *node = mte_to_node(mn); enum maple_type type = mte_node_type(mn); BUG_ON(piv >= mt_pivots[type]); switch (type) { case maple_range_64: case maple_leaf_64: node->mr64.pivot[piv] = val; break; case maple_arange_64: node->ma64.pivot[piv] = val; break; case maple_dense: break; } } /* * ma_slots() - Get a pointer to the maple node slots. * @mn: The maple node * @mt: The maple node type * * Return: A pointer to the maple node slots */ static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt) { switch (mt) { case maple_arange_64: return mn->ma64.slot; case maple_range_64: case maple_leaf_64: return mn->mr64.slot; case maple_dense: return mn->slot; } return NULL; } static inline bool mt_write_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? mt_write_lock_is_held(mt) : lockdep_is_held(&mt->ma_lock); } static __always_inline bool mt_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? mt_lock_is_held(mt) : lockdep_is_held(&mt->ma_lock); } static __always_inline void *mt_slot(const struct maple_tree *mt, void __rcu **slots, unsigned char offset) { return rcu_dereference_check(slots[offset], mt_locked(mt)); } static __always_inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots, unsigned char offset) { return rcu_dereference_protected(slots[offset], mt_write_locked(mt)); } /* * mas_slot_locked() - Get the slot value when holding the maple tree lock. * @mas: The maple state * @slots: The pointer to the slots * @offset: The offset into the slots array to fetch * * Return: The entry stored in @slots at the @offset. */ static __always_inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots, unsigned char offset) { return mt_slot_locked(mas->tree, slots, offset); } /* * mas_slot() - Get the slot value when not holding the maple tree lock. * @mas: The maple state * @slots: The pointer to the slots * @offset: The offset into the slots array to fetch * * Return: The entry stored in @slots at the @offset */ static __always_inline void *mas_slot(struct ma_state *mas, void __rcu **slots, unsigned char offset) { return mt_slot(mas->tree, slots, offset); } /* * mas_root() - Get the maple tree root. * @mas: The maple state. * * Return: The pointer to the root of the tree */ static __always_inline void *mas_root(struct ma_state *mas) { return rcu_dereference_check(mas->tree->ma_root, mt_locked(mas->tree)); } static inline void *mt_root_locked(struct maple_tree *mt) { return rcu_dereference_protected(mt->ma_root, mt_write_locked(mt)); } /* * mas_root_locked() - Get the maple tree root when holding the maple tree lock. * @mas: The maple state. * * Return: The pointer to the root of the tree */ static inline void *mas_root_locked(struct ma_state *mas) { return mt_root_locked(mas->tree); } static inline struct maple_metadata *ma_meta(struct maple_node *mn, enum maple_type mt) { switch (mt) { case maple_arange_64: return &mn->ma64.meta; default: return &mn->mr64.meta; } } /* * ma_set_meta() - Set the metadata information of a node. * @mn: The maple node * @mt: The maple node type * @offset: The offset of the highest sub-gap in this node. * @end: The end of the data in this node. */ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt, unsigned char offset, unsigned char end) { struct maple_metadata *meta = ma_meta(mn, mt); meta->gap = offset; meta->end = end; } /* * mt_clear_meta() - clear the metadata information of a node, if it exists * @mt: The maple tree * @mn: The maple node * @type: The maple node type */ static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn, enum maple_type type) { struct maple_metadata *meta; unsigned long *pivots; void __rcu **slots; void *next; switch (type) { case maple_range_64: pivots = mn->mr64.pivot; if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) { slots = mn->mr64.slot; next = mt_slot_locked(mt, slots, MAPLE_RANGE64_SLOTS - 1); if (unlikely((mte_to_node(next) && mte_node_type(next)))) return; /* no metadata, could be node */ } fallthrough; case maple_arange_64: meta = ma_meta(mn, type); break; default: return; } meta->gap = 0; meta->end = 0; } /* * ma_meta_end() - Get the data end of a node from the metadata * @mn: The maple node * @mt: The maple node type */ static inline unsigned char ma_meta_end(struct maple_node *mn, enum maple_type mt) { struct maple_metadata *meta = ma_meta(mn, mt); return meta->end; } /* * ma_meta_gap() - Get the largest gap location of a node from the metadata * @mn: The maple node */ static inline unsigned char ma_meta_gap(struct maple_node *mn) { return mn->ma64.meta.gap; } /* * ma_set_meta_gap() - Set the largest gap location in a nodes metadata * @mn: The maple node * @mt: The maple node type * @offset: The location of the largest gap. */ static inline void ma_set_meta_gap(struct maple_node *mn, enum maple_type mt, unsigned char offset) { struct maple_metadata *meta = ma_meta(mn, mt); meta->gap = offset; } /* * mat_add() - Add a @dead_enode to the ma_topiary of a list of dead nodes. * @mat: the ma_topiary, a linked list of dead nodes. * @dead_enode: the node to be marked as dead and added to the tail of the list * * Add the @dead_enode to the linked list in @mat. */ static inline void mat_add(struct ma_topiary *mat, struct maple_enode *dead_enode) { mte_set_node_dead(dead_enode); mte_to_mat(dead_enode)->next = NULL; if (!mat->tail) { mat->tail = mat->head = dead_enode; return; } mte_to_mat(mat->tail)->next = dead_enode; mat->tail = dead_enode; } static void mt_free_walk(struct rcu_head *head); static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free); /* * mas_mat_destroy() - Free all nodes and subtrees in a dead list. * @mas: the maple state * @mat: the ma_topiary linked list of dead nodes to free. * * Destroy walk a dead list. */ static void mas_mat_destroy(struct ma_state *mas, struct ma_topiary *mat) { struct maple_enode *next; struct maple_node *node; bool in_rcu = mt_in_rcu(mas->tree); while (mat->head) { next = mte_to_mat(mat->head)->next; node = mte_to_node(mat->head); mt_destroy_walk(mat->head, mas->tree, !in_rcu); if (in_rcu) call_rcu(&node->rcu, mt_free_walk); mat->head = next; } } /* * mas_descend() - Descend into the slot stored in the ma_state. * @mas: the maple state. * * Note: Not RCU safe, only use in write side or debug code. */ static inline void mas_descend(struct ma_state *mas) { enum maple_type type; unsigned long *pivots; struct maple_node *node; void __rcu **slots; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); slots = ma_slots(node, type); if (mas->offset) mas->min = pivots[mas->offset - 1] + 1; mas->max = mas_safe_pivot(mas, pivots, mas->offset, type); mas->node = mas_slot(mas, slots, mas->offset); } /* * mte_set_gap() - Set a maple node gap. * @mn: The encoded maple node * @gap: The offset of the gap to set * @val: The gap value */ static inline void mte_set_gap(const struct maple_enode *mn, unsigned char gap, unsigned long val) { switch (mte_node_type(mn)) { default: break; case maple_arange_64: mte_to_node(mn)->ma64.gap[gap] = val; break; } } /* * mas_ascend() - Walk up a level of the tree. * @mas: The maple state * * Sets the @mas->max and @mas->min to the correct values when walking up. This * may cause several levels of walking up to find the correct min and max. * May find a dead node which will cause a premature return. * Return: 1 on dead node, 0 otherwise */ static int mas_ascend(struct ma_state *mas) { struct maple_enode *p_enode; /* parent enode. */ struct maple_enode *a_enode; /* ancestor enode. */ struct maple_node *a_node; /* ancestor node. */ struct maple_node *p_node; /* parent node. */ unsigned char a_slot; enum maple_type a_type; unsigned long min, max; unsigned long *pivots; bool set_max = false, set_min = false; a_node = mas_mn(mas); if (ma_is_root(a_node)) { mas->offset = 0; return 0; } p_node = mte_parent(mas->node); if (unlikely(a_node == p_node)) return 1; a_type = mas_parent_type(mas, mas->node); mas->offset = mte_parent_slot(mas->node); a_enode = mt_mk_node(p_node, a_type); /* Check to make sure all parent information is still accurate */ if (p_node != mte_parent(mas->node)) return 1; mas->node = a_enode; if (mte_is_root(a_enode)) { mas->max = ULONG_MAX; mas->min = 0; return 0; } min = 0; max = ULONG_MAX; if (!mas->offset) { min = mas->min; set_min = true; } if (mas->max == ULONG_MAX) set_max = true; do { p_enode = a_enode; a_type = mas_parent_type(mas, p_enode); a_node = mte_parent(p_enode); a_slot = mte_parent_slot(p_enode); a_enode = mt_mk_node(a_node, a_type); pivots = ma_pivots(a_node, a_type); if (unlikely(ma_dead_node(a_node))) return 1; if (!set_min && a_slot) { set_min = true; min = pivots[a_slot - 1] + 1; } if (!set_max && a_slot < mt_pivots[a_type]) { set_max = true; max = pivots[a_slot]; } if (unlikely(ma_dead_node(a_node))) return 1; if (unlikely(ma_is_root(a_node))) break; } while (!set_min || !set_max); mas->max = max; mas->min = min; return 0; } /* * mas_pop_node() - Get a previously allocated maple node from the maple state. * @mas: The maple state * * Return: A pointer to a maple node. */ static inline struct maple_node *mas_pop_node(struct ma_state *mas) { struct maple_alloc *ret, *node = mas->alloc; unsigned long total = mas_allocated(mas); unsigned int req = mas_alloc_req(mas); /* nothing or a request pending. */ if (WARN_ON(!total)) return NULL; if (total == 1) { /* single allocation in this ma_state */ mas->alloc = NULL; ret = node; goto single_node; } if (node->node_count == 1) { /* Single allocation in this node. */ mas->alloc = node->slot[0]; mas->alloc->total = node->total - 1; ret = node; goto new_head; } node->total--; ret = node->slot[--node->node_count]; node->slot[node->node_count] = NULL; single_node: new_head: if (req) { req++; mas_set_alloc_req(mas, req); } memset(ret, 0, sizeof(*ret)); return (struct maple_node *)ret; } /* * mas_push_node() - Push a node back on the maple state allocation. * @mas: The maple state * @used: The used maple node * * Stores the maple node back into @mas->alloc for reuse. Updates allocated and * requested node count as necessary. */ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) { struct maple_alloc *reuse = (struct maple_alloc *)used; struct maple_alloc *head = mas->alloc; unsigned long count; unsigned int requested = mas_alloc_req(mas); count = mas_allocated(mas); reuse->request_count = 0; reuse->node_count = 0; if (count) { if (head->node_count < MAPLE_ALLOC_SLOTS) { head->slot[head->node_count++] = reuse; head->total++; goto done; } reuse->slot[0] = head; reuse->node_count = 1; } reuse->total = count + 1; mas->alloc = reuse; done: if (requested > 1) mas_set_alloc_req(mas, requested - 1); } /* * mas_alloc_nodes() - Allocate nodes into a maple state * @mas: The maple state * @gfp: The GFP Flags */ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { struct maple_alloc *node; unsigned long allocated = mas_allocated(mas); unsigned int requested = mas_alloc_req(mas); unsigned int count; void **slots = NULL; unsigned int max_req = 0; if (!requested) return; mas_set_alloc_req(mas, 0); if (mas->mas_flags & MA_STATE_PREALLOC) { if (allocated) return; BUG_ON(!allocated); WARN_ON(!allocated); } if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { node = (struct maple_alloc *)mt_alloc_one(gfp); if (!node) goto nomem_one; if (allocated) { node->slot[0] = mas->alloc; node->node_count = 1; } else { node->node_count = 0; } mas->alloc = node; node->total = ++allocated; node->request_count = 0; requested--; } node = mas->alloc; while (requested) { max_req = MAPLE_ALLOC_SLOTS - node->node_count; slots = (void **)&node->slot[node->node_count]; max_req = min(requested, max_req); count = mt_alloc_bulk(gfp, max_req, slots); if (!count) goto nomem_bulk; if (node->node_count == 0) { node->slot[0]->node_count = 0; node->slot[0]->request_count = 0; } node->node_count += count; allocated += count; /* find a non-full node*/ do { node = node->slot[0]; } while (unlikely(node->node_count == MAPLE_ALLOC_SLOTS)); requested -= count; } mas->alloc->total = allocated; return; nomem_bulk: /* Clean up potential freed allocations on bulk failure */ memset(slots, 0, max_req * sizeof(unsigned long)); mas->alloc->total = allocated; nomem_one: mas_set_alloc_req(mas, requested); mas_set_err(mas, -ENOMEM); } /* * mas_free() - Free an encoded maple node * @mas: The maple state * @used: The encoded maple node to free. * * Uses rcu free if necessary, pushes @used back on the maple state allocations * otherwise. */ static inline void mas_free(struct ma_state *mas, struct maple_enode *used) { struct maple_node *tmp = mte_to_node(used); if (mt_in_rcu(mas->tree)) ma_free_rcu(tmp); else mas_push_node(mas, tmp); } /* * mas_node_count_gfp() - Check if enough nodes are allocated and request more * if there is not enough nodes. * @mas: The maple state * @count: The number of nodes needed * @gfp: the gfp flags */ static void mas_node_count_gfp(struct ma_state *mas, int count, gfp_t gfp) { unsigned long allocated = mas_allocated(mas); if (allocated < count) { mas_set_alloc_req(mas, count - allocated); mas_alloc_nodes(mas, gfp); } } /* * mas_node_count() - Check if enough nodes are allocated and request more if * there is not enough nodes. * @mas: The maple state * @count: The number of nodes needed * * Note: Uses GFP_NOWAIT | __GFP_NOWARN for gfp flags. */ static void mas_node_count(struct ma_state *mas, int count) { return mas_node_count_gfp(mas, count, GFP_NOWAIT | __GFP_NOWARN); } /* * mas_start() - Sets up maple state for operations. * @mas: The maple state. * * If mas->status == mas_start, then set the min, max and depth to * defaults. * * Return: * - If mas->node is an error or not mas_start, return NULL. * - If it's an empty tree: NULL & mas->status == ma_none * - If it's a single entry: The entry & mas->status == ma_root * - If it's a tree: NULL & mas->status == ma_active */ static inline struct maple_enode *mas_start(struct ma_state *mas) { if (likely(mas_is_start(mas))) { struct maple_enode *root; mas->min = 0; mas->max = ULONG_MAX; retry: mas->depth = 0; root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 1; mas->status = ma_active; mas->node = mte_safe_root(root); mas->offset = 0; if (mte_dead_node(mas->node)) goto retry; return NULL; } mas->node = NULL; /* empty tree */ if (unlikely(!root)) { mas->status = ma_none; mas->offset = MAPLE_NODE_SLOTS; return NULL; } /* Single entry tree */ mas->status = ma_root; mas->offset = MAPLE_NODE_SLOTS; /* Single entry tree. */ if (mas->index > 0) return NULL; return root; } return NULL; } /* * ma_data_end() - Find the end of the data in a node. * @node: The maple node * @type: The maple node type * @pivots: The array of pivots in the node * @max: The maximum value in the node * * Uses metadata to find the end of the data when possible. * Return: The zero indexed last slot with data (may be null). */ static __always_inline unsigned char ma_data_end(struct maple_node *node, enum maple_type type, unsigned long *pivots, unsigned long max) { unsigned char offset; if (!pivots) return 0; if (type == maple_arange_64) return ma_meta_end(node, type); offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); if (likely(pivots[offset] == max)) return offset; return mt_pivots[type]; } /* * mas_data_end() - Find the end of the data (slot). * @mas: the maple state * * This method is optimized to check the metadata of a node if the node type * supports data end metadata. * * Return: The zero indexed last slot with data (may be null). */ static inline unsigned char mas_data_end(struct ma_state *mas) { enum maple_type type; struct maple_node *node; unsigned char offset; unsigned long *pivots; type = mte_node_type(mas->node); node = mas_mn(mas); if (type == maple_arange_64) return ma_meta_end(node, type); pivots = ma_pivots(node, type); if (unlikely(ma_dead_node(node))) return 0; offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); if (likely(pivots[offset] == mas->max)) return offset; return mt_pivots[type]; } /* * mas_leaf_max_gap() - Returns the largest gap in a leaf node * @mas: the maple state * * Return: The maximum gap in the leaf. */ static unsigned long mas_leaf_max_gap(struct ma_state *mas) { enum maple_type mt; unsigned long pstart, gap, max_gap; struct maple_node *mn; unsigned long *pivots; void __rcu **slots; unsigned char i; unsigned char max_piv; mt = mte_node_type(mas->node); mn = mas_mn(mas); slots = ma_slots(mn, mt); max_gap = 0; if (unlikely(ma_is_dense(mt))) { gap = 0; for (i = 0; i < mt_slots[mt]; i++) { if (slots[i]) { if (gap > max_gap) max_gap = gap; gap = 0; } else { gap++; } } if (gap > max_gap) max_gap = gap; return max_gap; } /* * Check the first implied pivot optimizes the loop below and slot 1 may * be skipped if there is a gap in slot 0. */ pivots = ma_pivots(mn, mt); if (likely(!slots[0])) { max_gap = pivots[0] - mas->min + 1; i = 2; } else { i = 1; } /* reduce max_piv as the special case is checked before the loop */ max_piv = ma_data_end(mn, mt, pivots, mas->max) - 1; /* * Check end implied pivot which can only be a gap on the right most * node. */ if (unlikely(mas->max == ULONG_MAX) && !slots[max_piv + 1]) { gap = ULONG_MAX - pivots[max_piv]; if (gap > max_gap) max_gap = gap; if (max_gap > pivots[max_piv] - mas->min) return max_gap; } for (; i <= max_piv; i++) { /* data == no gap. */ if (likely(slots[i])) continue; pstart = pivots[i - 1]; gap = pivots[i] - pstart; if (gap > max_gap) max_gap = gap; /* There cannot be two gaps in a row. */ i++; } return max_gap; } /* * ma_max_gap() - Get the maximum gap in a maple node (non-leaf) * @node: The maple node * @gaps: The pointer to the gaps * @mt: The maple node type * @off: Pointer to store the offset location of the gap. * * Uses the metadata data end to scan backwards across set gaps. * * Return: The maximum gap value */ static inline unsigned long ma_max_gap(struct maple_node *node, unsigned long *gaps, enum maple_type mt, unsigned char *off) { unsigned char offset, i; unsigned long max_gap = 0; i = offset = ma_meta_end(node, mt); do { if (gaps[i] > max_gap) { max_gap = gaps[i]; offset = i; } } while (i--); *off = offset; return max_gap; } /* * mas_max_gap() - find the largest gap in a non-leaf node and set the slot. * @mas: The maple state. * * Return: The gap value. */ static inline unsigned long mas_max_gap(struct ma_state *mas) { unsigned long *gaps; unsigned char offset; enum maple_type mt; struct maple_node *node; mt = mte_node_type(mas->node); if (ma_is_leaf(mt)) return mas_leaf_max_gap(mas); node = mas_mn(mas); MAS_BUG_ON(mas, mt != maple_arange_64); offset = ma_meta_gap(node); gaps = ma_gaps(node, mt); return gaps[offset]; } /* * mas_parent_gap() - Set the parent gap and any gaps above, as needed * @mas: The maple state * @offset: The gap offset in the parent to set * @new: The new gap value. * * Set the parent gap then continue to set the gap upwards, using the metadata * of the parent to see if it is necessary to check the node above. */ static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset, unsigned long new) { unsigned long meta_gap = 0; struct maple_node *pnode; struct maple_enode *penode; unsigned long *pgaps; unsigned char meta_offset; enum maple_type pmt; pnode = mte_parent(mas->node); pmt = mas_parent_type(mas, mas->node); penode = mt_mk_node(pnode, pmt); pgaps = ma_gaps(pnode, pmt); ascend: MAS_BUG_ON(mas, pmt != maple_arange_64); meta_offset = ma_meta_gap(pnode); meta_gap = pgaps[meta_offset]; pgaps[offset] = new; if (meta_gap == new) return; if (offset != meta_offset) { if (meta_gap > new) return; ma_set_meta_gap(pnode, pmt, offset); } else if (new < meta_gap) { new = ma_max_gap(pnode, pgaps, pmt, &meta_offset); ma_set_meta_gap(pnode, pmt, meta_offset); } if (ma_is_root(pnode)) return; /* Go to the parent node. */ pnode = mte_parent(penode); pmt = mas_parent_type(mas, penode); pgaps = ma_gaps(pnode, pmt); offset = mte_parent_slot(penode); penode = mt_mk_node(pnode, pmt); goto ascend; } /* * mas_update_gap() - Update a nodes gaps and propagate up if necessary. * @mas: the maple state. */ static inline void mas_update_gap(struct ma_state *mas) { unsigned char pslot; unsigned long p_gap; unsigned long max_gap; if (!mt_is_alloc(mas->tree)) return; if (mte_is_root(mas->node)) return; max_gap = mas_max_gap(mas); pslot = mte_parent_slot(mas->node); p_gap = ma_gaps(mte_parent(mas->node), mas_parent_type(mas, mas->node))[pslot]; if (p_gap != max_gap) mas_parent_gap(mas, pslot, max_gap); } /* * mas_adopt_children() - Set the parent pointer of all nodes in @parent to * @parent with the slot encoded. * @mas: the maple state (for the tree) * @parent: the maple encoded node containing the children. */ static inline void mas_adopt_children(struct ma_state *mas, struct maple_enode *parent) { enum maple_type type = mte_node_type(parent); struct maple_node *node = mte_to_node(parent); void __rcu **slots = ma_slots(node, type); unsigned long *pivots = ma_pivots(node, type); struct maple_enode *child; unsigned char offset; offset = ma_data_end(node, type, pivots, mas->max); do { child = mas_slot_locked(mas, slots, offset); mas_set_parent(mas, child, parent, offset); } while (offset--); } /* * mas_put_in_tree() - Put a new node in the tree, smp_wmb(), and mark the old * node as dead. * @mas: the maple state with the new node * @old_enode: The old maple encoded node to replace. */ static inline void mas_put_in_tree(struct ma_state *mas, struct maple_enode *old_enode) __must_hold(mas->tree->ma_lock) { unsigned char offset; void __rcu **slots; if (mte_is_root(mas->node)) { mas_mn(mas)->parent = ma_parent_ptr(mas_tree_parent(mas)); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); mas_set_height(mas); } else { offset = mte_parent_slot(mas->node); slots = ma_slots(mte_parent(mas->node), mas_parent_type(mas, mas->node)); rcu_assign_pointer(slots[offset], mas->node); } mte_set_node_dead(old_enode); } /* * mas_replace_node() - Replace a node by putting it in the tree, marking it * dead, and freeing it. * the parent encoding to locate the maple node in the tree. * @mas: the ma_state with @mas->node pointing to the new node. * @old_enode: The old maple encoded node. */ static inline void mas_replace_node(struct ma_state *mas, struct maple_enode *old_enode) __must_hold(mas->tree->ma_lock) { mas_put_in_tree(mas, old_enode); mas_free(mas, old_enode); } /* * mas_find_child() - Find a child who has the parent @mas->node. * @mas: the maple state with the parent. * @child: the maple state to store the child. */ static inline bool mas_find_child(struct ma_state *mas, struct ma_state *child) __must_hold(mas->tree->ma_lock) { enum maple_type mt; unsigned char offset; unsigned char end; unsigned long *pivots; struct maple_enode *entry; struct maple_node *node; void __rcu **slots; mt = mte_node_type(mas->node); node = mas_mn(mas); slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); end = ma_data_end(node, mt, pivots, mas->max); for (offset = mas->offset; offset <= end; offset++) { entry = mas_slot_locked(mas, slots, offset); if (mte_parent(entry) == node) { *child = *mas; mas->offset = offset + 1; child->offset = offset; mas_descend(child); child->offset = 0; return true; } } return false; } /* * mab_shift_right() - Shift the data in mab right. Note, does not clean out the * old data or set b_node->b_end. * @b_node: the maple_big_node * @shift: the shift count */ static inline void mab_shift_right(struct maple_big_node *b_node, unsigned char shift) { unsigned long size = b_node->b_end * sizeof(unsigned long); memmove(b_node->pivot + shift, b_node->pivot, size); memmove(b_node->slot + shift, b_node->slot, size); if (b_node->type == maple_arange_64) memmove(b_node->gap + shift, b_node->gap, size); } /* * mab_middle_node() - Check if a middle node is needed (unlikely) * @b_node: the maple_big_node that contains the data. * @split: the potential split location * @slot_count: the size that can be stored in a single node being considered. * * Return: true if a middle node is required. */ static inline bool mab_middle_node(struct maple_big_node *b_node, int split, unsigned char slot_count) { unsigned char size = b_node->b_end; if (size >= 2 * slot_count) return true; if (!b_node->slot[split] && (size >= 2 * slot_count - 1)) return true; return false; } /* * mab_no_null_split() - ensure the split doesn't fall on a NULL * @b_node: the maple_big_node with the data * @split: the suggested split location * @slot_count: the number of slots in the node being considered. * * Return: the split location. */ static inline int mab_no_null_split(struct maple_big_node *b_node, unsigned char split, unsigned char slot_count) { if (!b_node->slot[split]) { /* * If the split is less than the max slot && the right side will * still be sufficient, then increment the split on NULL. */ if ((split < slot_count - 1) && (b_node->b_end - split) > (mt_min_slots[b_node->type])) split++; else split--; } return split; } /* * mab_calc_split() - Calculate the split location and if there needs to be two * splits. * @mas: The maple state * @bn: The maple_big_node with the data * @mid_split: The second split, if required. 0 otherwise. * * Return: The first split location. The middle split is set in @mid_split. */ static inline int mab_calc_split(struct ma_state *mas, struct maple_big_node *bn, unsigned char *mid_split, unsigned long min) { unsigned char b_end = bn->b_end; int split = b_end / 2; /* Assume equal split. */ unsigned char slot_min, slot_count = mt_slots[bn->type]; /* * To support gap tracking, all NULL entries are kept together and a node cannot * end on a NULL entry, with the exception of the left-most leaf. The * limitation means that the split of a node must be checked for this condition * and be able to put more data in one direction or the other. */ if (unlikely((mas->mas_flags & MA_STATE_BULK))) { *mid_split = 0; split = b_end - mt_min_slots[bn->type]; if (!ma_is_leaf(bn->type)) return split; mas->mas_flags |= MA_STATE_REBALANCE; if (!bn->slot[split]) split--; return split; } /* * Although extremely rare, it is possible to enter what is known as the 3-way * split scenario. The 3-way split comes about by means of a store of a range * that overwrites the end and beginning of two full nodes. The result is a set * of entries that cannot be stored in 2 nodes. Sometimes, these two nodes can * also be located in different parent nodes which are also full. This can * carry upwards all the way to the root in the worst case. */ if (unlikely(mab_middle_node(bn, split, slot_count))) { split = b_end / 3; *mid_split = split * 2; } else { slot_min = mt_min_slots[bn->type]; *mid_split = 0; /* * Avoid having a range less than the slot count unless it * causes one node to be deficient. * NOTE: mt_min_slots is 1 based, b_end and split are zero. */ while ((split < slot_count - 1) && ((bn->pivot[split] - min) < slot_count - 1) && (b_end - split > slot_min)) split++; } /* Avoid ending a node on a NULL entry */ split = mab_no_null_split(bn, split, slot_count); if (unlikely(*mid_split)) *mid_split = mab_no_null_split(bn, *mid_split, slot_count); return split; } /* * mas_mab_cp() - Copy data from a maple state inclusively to a maple_big_node * and set @b_node->b_end to the next free slot. * @mas: The maple state * @mas_start: The starting slot to copy * @mas_end: The end slot to copy (inclusively) * @b_node: The maple_big_node to place the data * @mab_start: The starting location in maple_big_node to store the data. */ static inline void mas_mab_cp(struct ma_state *mas, unsigned char mas_start, unsigned char mas_end, struct maple_big_node *b_node, unsigned char mab_start) { enum maple_type mt; struct maple_node *node; void __rcu **slots; unsigned long *pivots, *gaps; int i = mas_start, j = mab_start; unsigned char piv_end; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); if (!i) { b_node->pivot[j] = pivots[i++]; if (unlikely(i > mas_end)) goto complete; j++; } piv_end = min(mas_end, mt_pivots[mt]); for (; i < piv_end; i++, j++) { b_node->pivot[j] = pivots[i]; if (unlikely(!b_node->pivot[j])) goto complete; if (unlikely(mas->max == b_node->pivot[j])) goto complete; } b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt); complete: b_node->b_end = ++j; j -= mab_start; slots = ma_slots(node, mt); memcpy(b_node->slot + mab_start, slots + mas_start, sizeof(void *) * j); if (!ma_is_leaf(mt) && mt_is_alloc(mas->tree)) { gaps = ma_gaps(node, mt); memcpy(b_node->gap + mab_start, gaps + mas_start, sizeof(unsigned long) * j); } } /* * mas_leaf_set_meta() - Set the metadata of a leaf if possible. * @node: The maple node * @mt: The maple type * @end: The node end */ static inline void mas_leaf_set_meta(struct maple_node *node, enum maple_type mt, unsigned char end) { if (end < mt_slots[mt] - 1) ma_set_meta(node, mt, 0, end); } /* * mab_mas_cp() - Copy data from maple_big_node to a maple encoded node. * @b_node: the maple_big_node that has the data * @mab_start: the start location in @b_node. * @mab_end: The end location in @b_node (inclusively) * @mas: The maple state with the maple encoded node. */ static inline void mab_mas_cp(struct maple_big_node *b_node, unsigned char mab_start, unsigned char mab_end, struct ma_state *mas, bool new_max) { int i, j = 0; enum maple_type mt = mte_node_type(mas->node); struct maple_node *node = mte_to_node(mas->node); void __rcu **slots = ma_slots(node, mt); unsigned long *pivots = ma_pivots(node, mt); unsigned long *gaps = NULL; unsigned char end; if (mab_end - mab_start > mt_pivots[mt]) mab_end--; if (!pivots[mt_pivots[mt] - 1]) slots[mt_pivots[mt]] = NULL; i = mab_start; do { pivots[j++] = b_node->pivot[i++]; } while (i <= mab_end && likely(b_node->pivot[i])); memcpy(slots, b_node->slot + mab_start, sizeof(void *) * (i - mab_start)); if (new_max) mas->max = b_node->pivot[i - 1]; end = j - 1; if (likely(!ma_is_leaf(mt) && mt_is_alloc(mas->tree))) { unsigned long max_gap = 0; unsigned char offset = 0; gaps = ma_gaps(node, mt); do { gaps[--j] = b_node->gap[--i]; if (gaps[j] > max_gap) { offset = j; max_gap = gaps[j]; } } while (j); ma_set_meta(node, mt, offset, end); } else { mas_leaf_set_meta(node, mt, end); } } /* * mas_bulk_rebalance() - Rebalance the end of a tree after a bulk insert. * @mas: The maple state * @end: The maple node end * @mt: The maple node type */ static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end, enum maple_type mt) { if (!(mas->mas_flags & MA_STATE_BULK)) return; if (mte_is_root(mas->node)) return; if (end > mt_min_slots[mt]) { mas->mas_flags &= ~MA_STATE_REBALANCE; return; } } /* * mas_store_b_node() - Store an @entry into the b_node while also copying the * data from a maple encoded node. * @wr_mas: the maple write state * @b_node: the maple_big_node to fill with data * @offset_end: the offset to end copying * * Return: The actual end of the data stored in @b_node */ static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char offset_end) { unsigned char slot; unsigned char b_end; /* Possible underflow of piv will wrap back to 0 before use. */ unsigned long piv; struct ma_state *mas = wr_mas->mas; b_node->type = wr_mas->type; b_end = 0; slot = mas->offset; if (slot) { /* Copy start data up to insert. */ mas_mab_cp(mas, 0, slot - 1, b_node, 0); b_end = b_node->b_end; piv = b_node->pivot[b_end - 1]; } else piv = mas->min - 1; if (piv + 1 < mas->index) { /* Handle range starting after old range */ b_node->slot[b_end] = wr_mas->content; if (!wr_mas->content) b_node->gap[b_end] = mas->index - 1 - piv; b_node->pivot[b_end++] = mas->index - 1; } /* Store the new entry. */ mas->offset = b_end; b_node->slot[b_end] = wr_mas->entry; b_node->pivot[b_end] = mas->last; /* Appended. */ if (mas->last >= mas->max) goto b_end; /* Handle new range ending before old range ends */ piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type); if (piv > mas->last) { if (piv == ULONG_MAX) mas_bulk_rebalance(mas, b_node->b_end, wr_mas->type); if (offset_end != slot) wr_mas->content = mas_slot_locked(mas, wr_mas->slots, offset_end); b_node->slot[++b_end] = wr_mas->content; if (!wr_mas->content) b_node->gap[b_end] = piv - mas->last + 1; b_node->pivot[b_end] = piv; } slot = offset_end + 1; if (slot > mas->end) goto b_end; /* Copy end data to the end of the node. */ mas_mab_cp(mas, slot, mas->end + 1, b_node, ++b_end); b_node->b_end--; return; b_end: b_node->b_end = b_end; } /* * mas_prev_sibling() - Find the previous node with the same parent. * @mas: the maple state * * Return: True if there is a previous sibling, false otherwise. */ static inline bool mas_prev_sibling(struct ma_state *mas) { unsigned int p_slot = mte_parent_slot(mas->node); /* For root node, p_slot is set to 0 by mte_parent_slot(). */ if (!p_slot) return false; mas_ascend(mas); mas->offset = p_slot - 1; mas_descend(mas); return true; } /* * mas_next_sibling() - Find the next node with the same parent. * @mas: the maple state * * Return: true if there is a next sibling, false otherwise. */ static inline bool mas_next_sibling(struct ma_state *mas) { MA_STATE(parent, mas->tree, mas->index, mas->last); if (mte_is_root(mas->node)) return false; parent = *mas; mas_ascend(&parent); parent.offset = mte_parent_slot(mas->node) + 1; if (parent.offset > mas_data_end(&parent)) return false; *mas = parent; mas_descend(mas); return true; } /* * mas_node_or_none() - Set the enode and state. * @mas: the maple state * @enode: The encoded maple node. * * Set the node to the enode and the status. */ static inline void mas_node_or_none(struct ma_state *mas, struct maple_enode *enode) { if (enode) { mas->node = enode; mas->status = ma_active; } else { mas->node = NULL; mas->status = ma_none; } } /* * mas_wr_node_walk() - Find the correct offset for the index in the @mas. * If @mas->index cannot be found within the containing * node, we traverse to the last entry in the node. * @wr_mas: The maple write state * * Uses mas_slot_locked() and does not need to worry about dead nodes. */ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char count, offset; if (unlikely(ma_is_dense(wr_mas->type))) { wr_mas->r_max = wr_mas->r_min = mas->index; mas->offset = mas->index = mas->min; return; } wr_mas->node = mas_mn(wr_mas->mas); wr_mas->pivots = ma_pivots(wr_mas->node, wr_mas->type); count = mas->end = ma_data_end(wr_mas->node, wr_mas->type, wr_mas->pivots, mas->max); offset = mas->offset; while (offset < count && mas->index > wr_mas->pivots[offset]) offset++; wr_mas->r_max = offset < count ? wr_mas->pivots[offset] : mas->max; wr_mas->r_min = mas_safe_min(mas, wr_mas->pivots, offset); wr_mas->offset_end = mas->offset = offset; } /* * mast_rebalance_next() - Rebalance against the next node * @mast: The maple subtree state */ static inline void mast_rebalance_next(struct maple_subtree_state *mast) { unsigned char b_end = mast->bn->b_end; mas_mab_cp(mast->orig_r, 0, mt_slot_count(mast->orig_r->node), mast->bn, b_end); mast->orig_r->last = mast->orig_r->max; } /* * mast_rebalance_prev() - Rebalance against the previous node * @mast: The maple subtree state */ static inline void mast_rebalance_prev(struct maple_subtree_state *mast) { unsigned char end = mas_data_end(mast->orig_l) + 1; unsigned char b_end = mast->bn->b_end; mab_shift_right(mast->bn, end); mas_mab_cp(mast->orig_l, 0, end - 1, mast->bn, 0); mast->l->min = mast->orig_l->min; mast->orig_l->index = mast->orig_l->min; mast->bn->b_end = end + b_end; mast->l->offset += end; } /* * mast_spanning_rebalance() - Rebalance nodes with nearest neighbour favouring * the node to the right. Checking the nodes to the right then the left at each * level upwards until root is reached. * Data is copied into the @mast->bn. * @mast: The maple_subtree_state. */ static inline bool mast_spanning_rebalance(struct maple_subtree_state *mast) { struct ma_state r_tmp = *mast->orig_r; struct ma_state l_tmp = *mast->orig_l; unsigned char depth = 0; do { mas_ascend(mast->orig_r); mas_ascend(mast->orig_l); depth++; if (mast->orig_r->offset < mas_data_end(mast->orig_r)) { mast->orig_r->offset++; do { mas_descend(mast->orig_r); mast->orig_r->offset = 0; } while (--depth); mast_rebalance_next(mast); *mast->orig_l = l_tmp; return true; } else if (mast->orig_l->offset != 0) { mast->orig_l->offset--; do { mas_descend(mast->orig_l); mast->orig_l->offset = mas_data_end(mast->orig_l); } while (--depth); mast_rebalance_prev(mast); *mast->orig_r = r_tmp; return true; } } while (!mte_is_root(mast->orig_r->node)); *mast->orig_r = r_tmp; *mast->orig_l = l_tmp; return false; } /* * mast_ascend() - Ascend the original left and right maple states. * @mast: the maple subtree state. * * Ascend the original left and right sides. Set the offsets to point to the * data already in the new tree (@mast->l and @mast->r). */ static inline void mast_ascend(struct maple_subtree_state *mast) { MA_WR_STATE(wr_mas, mast->orig_r, NULL); mas_ascend(mast->orig_l); mas_ascend(mast->orig_r); mast->orig_r->offset = 0; mast->orig_r->index = mast->r->max; /* last should be larger than or equal to index */ if (mast->orig_r->last < mast->orig_r->index) mast->orig_r->last = mast->orig_r->index; wr_mas.type = mte_node_type(mast->orig_r->node); mas_wr_node_walk(&wr_mas); /* Set up the left side of things */ mast->orig_l->offset = 0; mast->orig_l->index = mast->l->min; wr_mas.mas = mast->orig_l; wr_mas.type = mte_node_type(mast->orig_l->node); mas_wr_node_walk(&wr_mas); mast->bn->type = wr_mas.type; } /* * mas_new_ma_node() - Create and return a new maple node. Helper function. * @mas: the maple state with the allocations. * @b_node: the maple_big_node with the type encoding. * * Use the node type from the maple_big_node to allocate a new node from the * ma_state. This function exists mainly for code readability. * * Return: A new maple encoded node */ static inline struct maple_enode *mas_new_ma_node(struct ma_state *mas, struct maple_big_node *b_node) { return mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), b_node->type); } /* * mas_mab_to_node() - Set up right and middle nodes * * @mas: the maple state that contains the allocations. * @b_node: the node which contains the data. * @left: The pointer which will have the left node * @right: The pointer which may have the right node * @middle: the pointer which may have the middle node (rare) * @mid_split: the split location for the middle node * * Return: the split of left. */ static inline unsigned char mas_mab_to_node(struct ma_state *mas, struct maple_big_node *b_node, struct maple_enode **left, struct maple_enode **right, struct maple_enode **middle, unsigned char *mid_split, unsigned long min) { unsigned char split = 0; unsigned char slot_count = mt_slots[b_node->type]; *left = mas_new_ma_node(mas, b_node); *right = NULL; *middle = NULL; *mid_split = 0; if (b_node->b_end < slot_count) { split = b_node->b_end; } else { split = mab_calc_split(mas, b_node, mid_split, min); *right = mas_new_ma_node(mas, b_node); } if (*mid_split) *middle = mas_new_ma_node(mas, b_node); return split; } /* * mab_set_b_end() - Add entry to b_node at b_node->b_end and increment the end * pointer. * @b_node: the big node to add the entry * @mas: the maple state to get the pivot (mas->max) * @entry: the entry to add, if NULL nothing happens. */ static inline void mab_set_b_end(struct maple_big_node *b_node, struct ma_state *mas, void *entry) { if (!entry) return; b_node->slot[b_node->b_end] = entry; if (mt_is_alloc(mas->tree)) b_node->gap[b_node->b_end] = mas_max_gap(mas); b_node->pivot[b_node->b_end++] = mas->max; } /* * mas_set_split_parent() - combine_then_separate helper function. Sets the parent * of @mas->node to either @left or @right, depending on @slot and @split * * @mas: the maple state with the node that needs a parent * @left: possible parent 1 * @right: possible parent 2 * @slot: the slot the mas->node was placed * @split: the split location between @left and @right */ static inline void mas_set_split_parent(struct ma_state *mas, struct maple_enode *left, struct maple_enode *right, unsigned char *slot, unsigned char split) { if (mas_is_none(mas)) return; if ((*slot) <= split) mas_set_parent(mas, mas->node, left, *slot); else if (right) mas_set_parent(mas, mas->node, right, (*slot) - split - 1); (*slot)++; } /* * mte_mid_split_check() - Check if the next node passes the mid-split * @l: Pointer to left encoded maple node. * @m: Pointer to middle encoded maple node. * @r: Pointer to right encoded maple node. * @slot: The offset * @split: The split location. * @mid_split: The middle split. */ static inline void mte_mid_split_check(struct maple_enode **l, struct maple_enode **r, struct maple_enode *right, unsigned char slot, unsigned char *split, unsigned char mid_split) { if (*r == right) return; if (slot < mid_split) return; *l = *r; *r = right; *split = mid_split; } /* * mast_set_split_parents() - Helper function to set three nodes parents. Slot * is taken from @mast->l. * @mast: the maple subtree state * @left: the left node * @right: the right node * @split: the split location. */ static inline void mast_set_split_parents(struct maple_subtree_state *mast, struct maple_enode *left, struct maple_enode *middle, struct maple_enode *right, unsigned char split, unsigned char mid_split) { unsigned char slot; struct maple_enode *l = left; struct maple_enode *r = right; if (mas_is_none(mast->l)) return; if (middle) r = middle; slot = mast->l->offset; mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->l, l, r, &slot, split); mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->m, l, r, &slot, split); mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->r, l, r, &slot, split); } /* * mas_topiary_node() - Dispose of a single node * @mas: The maple state for pushing nodes * @in_rcu: If the tree is in rcu mode * * The node will either be RCU freed or pushed back on the maple state. */ static inline void mas_topiary_node(struct ma_state *mas, struct ma_state *tmp_mas, bool in_rcu) { struct maple_node *tmp; struct maple_enode *enode; if (mas_is_none(tmp_mas)) return; enode = tmp_mas->node; tmp = mte_to_node(enode); mte_set_node_dead(enode); if (in_rcu) ma_free_rcu(tmp); else mas_push_node(mas, tmp); } /* * mas_topiary_replace() - Replace the data with new data, then repair the * parent links within the new tree. Iterate over the dead sub-tree and collect * the dead subtrees and topiary the nodes that are no longer of use. * * The new tree will have up to three children with the correct parent. Keep * track of the new entries as they need to be followed to find the next level * of new entries. * * The old tree will have up to three children with the old parent. Keep track * of the old entries as they may have more nodes below replaced. Nodes within * [index, last] are dead subtrees, others need to be freed and followed. * * @mas: The maple state pointing at the new data * @old_enode: The maple encoded node being replaced * */ static inline void mas_topiary_replace(struct ma_state *mas, struct maple_enode *old_enode) { struct ma_state tmp[3], tmp_next[3]; MA_TOPIARY(subtrees, mas->tree); bool in_rcu; int i, n; /* Place data in tree & then mark node as old */ mas_put_in_tree(mas, old_enode); /* Update the parent pointers in the tree */ tmp[0] = *mas; tmp[0].offset = 0; tmp[1].status = ma_none; tmp[2].status = ma_none; while (!mte_is_leaf(tmp[0].node)) { n = 0; for (i = 0; i < 3; i++) { if (mas_is_none(&tmp[i])) continue; while (n < 3) { if (!mas_find_child(&tmp[i], &tmp_next[n])) break; n++; } mas_adopt_children(&tmp[i], tmp[i].node); } if (MAS_WARN_ON(mas, n == 0)) break; while (n < 3) tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) tmp[i] = tmp_next[i]; } /* Collect the old nodes that need to be discarded */ if (mte_is_leaf(old_enode)) return mas_free(mas, old_enode); tmp[0] = *mas; tmp[0].offset = 0; tmp[0].node = old_enode; tmp[1].status = ma_none; tmp[2].status = ma_none; in_rcu = mt_in_rcu(mas->tree); do { n = 0; for (i = 0; i < 3; i++) { if (mas_is_none(&tmp[i])) continue; while (n < 3) { if (!mas_find_child(&tmp[i], &tmp_next[n])) break; if ((tmp_next[n].min >= tmp_next->index) && (tmp_next[n].max <= tmp_next->last)) { mat_add(&subtrees, tmp_next[n].node); tmp_next[n].status = ma_none; } else { n++; } } } if (MAS_WARN_ON(mas, n == 0)) break; while (n < 3) tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) { mas_topiary_node(mas, &tmp[i], in_rcu); tmp[i] = tmp_next[i]; } } while (!mte_is_leaf(tmp[0].node)); for (i = 0; i < 3; i++) mas_topiary_node(mas, &tmp[i], in_rcu); mas_mat_destroy(mas, &subtrees); } /* * mas_wmb_replace() - Write memory barrier and replace * @mas: The maple state * @old_enode: The old maple encoded node that is being replaced. * * Updates gap as necessary. */ static inline void mas_wmb_replace(struct ma_state *mas, struct maple_enode *old_enode) { /* Insert the new data in the tree */ mas_topiary_replace(mas, old_enode); if (mte_is_leaf(mas->node)) return; mas_update_gap(mas); } /* * mast_cp_to_nodes() - Copy data out to nodes. * @mast: The maple subtree state * @left: The left encoded maple node * @middle: The middle encoded maple node * @right: The right encoded maple node * @split: The location to split between left and (middle ? middle : right) * @mid_split: The location to split between middle and right. */ static inline void mast_cp_to_nodes(struct maple_subtree_state *mast, struct maple_enode *left, struct maple_enode *middle, struct maple_enode *right, unsigned char split, unsigned char mid_split) { bool new_lmax = true; mas_node_or_none(mast->l, left); mas_node_or_none(mast->m, middle); mas_node_or_none(mast->r, right); mast->l->min = mast->orig_l->min; if (split == mast->bn->b_end) { mast->l->max = mast->orig_r->max; new_lmax = false; } mab_mas_cp(mast->bn, 0, split, mast->l, new_lmax); if (middle) { mab_mas_cp(mast->bn, 1 + split, mid_split, mast->m, true); mast->m->min = mast->bn->pivot[split] + 1; split = mid_split; } mast->r->max = mast->orig_r->max; if (right) { mab_mas_cp(mast->bn, 1 + split, mast->bn->b_end, mast->r, false); mast->r->min = mast->bn->pivot[split] + 1; } } /* * mast_combine_cp_left - Copy in the original left side of the tree into the * combined data set in the maple subtree state big node. * @mast: The maple subtree state */ static inline void mast_combine_cp_left(struct maple_subtree_state *mast) { unsigned char l_slot = mast->orig_l->offset; if (!l_slot) return; mas_mab_cp(mast->orig_l, 0, l_slot - 1, mast->bn, 0); } /* * mast_combine_cp_right: Copy in the original right side of the tree into the * combined data set in the maple subtree state big node. * @mast: The maple subtree state */ static inline void mast_combine_cp_right(struct maple_subtree_state *mast) { if (mast->bn->pivot[mast->bn->b_end - 1] >= mast->orig_r->max) return; mas_mab_cp(mast->orig_r, mast->orig_r->offset + 1, mt_slot_count(mast->orig_r->node), mast->bn, mast->bn->b_end); mast->orig_r->last = mast->orig_r->max; } /* * mast_sufficient: Check if the maple subtree state has enough data in the big * node to create at least one sufficient node * @mast: the maple subtree state */ static inline bool mast_sufficient(struct maple_subtree_state *mast) { if (mast->bn->b_end > mt_min_slot_count(mast->orig_l->node)) return true; return false; } /* * mast_overflow: Check if there is too much data in the subtree state for a * single node. * @mast: The maple subtree state */ static inline bool mast_overflow(struct maple_subtree_state *mast) { if (mast->bn->b_end >= mt_slot_count(mast->orig_l->node)) return true; return false; } static inline void *mtree_range_walk(struct ma_state *mas) { unsigned long *pivots; unsigned char offset; struct maple_node *node; struct maple_enode *next, *last; enum maple_type type; void __rcu **slots; unsigned char end; unsigned long max, min; unsigned long prev_max, prev_min; next = mas->node; min = mas->min; max = mas->max; do { last = next; node = mte_to_node(next); type = mte_node_type(next); pivots = ma_pivots(node, type); end = ma_data_end(node, type, pivots, max); prev_min = min; prev_max = max; if (pivots[0] >= mas->index) { offset = 0; max = pivots[0]; goto next; } offset = 1; while (offset < end) { if (pivots[offset] >= mas->index) { max = pivots[offset]; break; } offset++; } min = pivots[offset - 1] + 1; next: slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) goto dead_node; } while (!ma_is_leaf(type)); mas->end = end; mas->offset = offset; mas->index = min; mas->last = max; mas->min = prev_min; mas->max = prev_max; mas->node = last; return (void *)next; dead_node: mas_reset(mas); return NULL; } /* * mas_spanning_rebalance() - Rebalance across two nodes which may not be peers. * @mas: The starting maple state * @mast: The maple_subtree_state, keeps track of 4 maple states. * @count: The estimated count of iterations needed. * * Follow the tree upwards from @l_mas and @r_mas for @count, or until the root * is hit. First @b_node is split into two entries which are inserted into the * next iteration of the loop. @b_node is returned populated with the final * iteration. @mas is used to obtain allocations. orig_l_mas keeps track of the * nodes that will remain active by using orig_l_mas->index and orig_l_mas->last * to account of what has been copied into the new sub-tree. The update of * orig_l_mas->last is used in mas_consume to find the slots that will need to * be either freed or destroyed. orig_l_mas->depth keeps track of the height of * the new sub-tree in case the sub-tree becomes the full tree. */ static void mas_spanning_rebalance(struct ma_state *mas, struct maple_subtree_state *mast, unsigned char count) { unsigned char split, mid_split; unsigned char slot = 0; struct maple_enode *left = NULL, *middle = NULL, *right = NULL; struct maple_enode *old_enode; MA_STATE(l_mas, mas->tree, mas->index, mas->index); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(m_mas, mas->tree, mas->index, mas->index); /* * The tree needs to be rebalanced and leaves need to be kept at the same level. * Rebalancing is done by use of the ``struct maple_topiary``. */ mast->l = &l_mas; mast->m = &m_mas; mast->r = &r_mas; l_mas.status = r_mas.status = m_mas.status = ma_none; /* Check if this is not root and has sufficient data. */ if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) && unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) mast_spanning_rebalance(mast); l_mas.depth = 0; /* * Each level of the tree is examined and balanced, pushing data to the left or * right, or rebalancing against left or right nodes is employed to avoid * rippling up the tree to limit the amount of churn. Once a new sub-section of * the tree is created, there may be a mix of new and old nodes. The old nodes * will have the incorrect parent pointers and currently be in two trees: the * original tree and the partially new tree. To remedy the parent pointers in * the old tree, the new data is swapped into the active tree and a walk down * the tree is performed and the parent pointers are updated. * See mas_topiary_replace() for more information. */ while (count--) { mast->bn->b_end--; mast->bn->type = mte_node_type(mast->orig_l->node); split = mas_mab_to_node(mas, mast->bn, &left, &right, &middle, &mid_split, mast->orig_l->min); mast_set_split_parents(mast, left, middle, right, split, mid_split); mast_cp_to_nodes(mast, left, middle, right, split, mid_split); /* * Copy data from next level in the tree to mast->bn from next * iteration */ memset(mast->bn, 0, sizeof(struct maple_big_node)); mast->bn->type = mte_node_type(left); l_mas.depth++; /* Root already stored in l->node. */ if (mas_is_root_limits(mast->l)) goto new_root; mast_ascend(mast); mast_combine_cp_left(mast); l_mas.offset = mast->bn->b_end; mab_set_b_end(mast->bn, &l_mas, left); mab_set_b_end(mast->bn, &m_mas, middle); mab_set_b_end(mast->bn, &r_mas, right); /* Copy anything necessary out of the right node. */ mast_combine_cp_right(mast); mast->orig_l->last = mast->orig_l->max; if (mast_sufficient(mast)) continue; if (mast_overflow(mast)) continue; /* May be a new root stored in mast->bn */ if (mas_is_root_limits(mast->orig_l)) break; mast_spanning_rebalance(mast); /* rebalancing from other nodes may require another loop. */ if (!count) count++; } l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), mte_node_type(mast->orig_l->node)); l_mas.depth++; mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true); mas_set_parent(mas, left, l_mas.node, slot); if (middle) mas_set_parent(mas, middle, l_mas.node, ++slot); if (right) mas_set_parent(mas, right, l_mas.node, ++slot); if (mas_is_root_limits(mast->l)) { new_root: mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas)); while (!mte_is_root(mast->orig_l->node)) mast_ascend(mast); } else { mas_mn(&l_mas)->parent = mas_mn(mast->orig_l)->parent; } old_enode = mast->orig_l->node; mas->depth = l_mas.depth; mas->node = l_mas.node; mas->min = l_mas.min; mas->max = l_mas.max; mas->offset = l_mas.offset; mas_wmb_replace(mas, old_enode); mtree_range_walk(mas); return; } /* * mas_rebalance() - Rebalance a given node. * @mas: The maple state * @b_node: The big maple node. * * Rebalance two nodes into a single node or two new nodes that are sufficient. * Continue upwards until tree is sufficient. */ static inline void mas_rebalance(struct ma_state *mas, struct maple_big_node *b_node) { char empty_count = mas_mt_height(mas); struct maple_subtree_state mast; unsigned char shift, b_end = ++b_node->b_end; MA_STATE(l_mas, mas->tree, mas->index, mas->last); MA_STATE(r_mas, mas->tree, mas->index, mas->last); trace_ma_op(__func__, mas); /* * Rebalancing occurs if a node is insufficient. Data is rebalanced * against the node to the right if it exists, otherwise the node to the * left of this node is rebalanced against this node. If rebalancing * causes just one node to be produced instead of two, then the parent * is also examined and rebalanced if it is insufficient. Every level * tries to combine the data in the same way. If one node contains the * entire range of the tree, then that node is used as a new root node. */ mast.orig_l = &l_mas; mast.orig_r = &r_mas; mast.bn = b_node; mast.bn->type = mte_node_type(mas->node); l_mas = r_mas = *mas; if (mas_next_sibling(&r_mas)) { mas_mab_cp(&r_mas, 0, mt_slot_count(r_mas.node), b_node, b_end); r_mas.last = r_mas.index = r_mas.max; } else { mas_prev_sibling(&l_mas); shift = mas_data_end(&l_mas) + 1; mab_shift_right(b_node, shift); mas->offset += shift; mas_mab_cp(&l_mas, 0, shift - 1, b_node, 0); b_node->b_end = shift + b_end; l_mas.index = l_mas.last = l_mas.min; } return mas_spanning_rebalance(mas, &mast, empty_count); } /* * mas_destroy_rebalance() - Rebalance left-most node while destroying the maple * state. * @mas: The maple state * @end: The end of the left-most node. * * During a mass-insert event (such as forking), it may be necessary to * rebalance the left-most node when it is not sufficient. */ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end) { enum maple_type mt = mte_node_type(mas->node); struct maple_node reuse, *newnode, *parent, *new_left, *left, *node; struct maple_enode *eparent, *old_eparent; unsigned char offset, tmp, split = mt_slots[mt] / 2; void __rcu **l_slots, **slots; unsigned long *l_pivs, *pivs, gap; bool in_rcu = mt_in_rcu(mas->tree); MA_STATE(l_mas, mas->tree, mas->index, mas->last); l_mas = *mas; mas_prev_sibling(&l_mas); /* set up node. */ if (in_rcu) { newnode = mas_pop_node(mas); } else { newnode = &reuse; } node = mas_mn(mas); newnode->parent = node->parent; slots = ma_slots(newnode, mt); pivs = ma_pivots(newnode, mt); left = mas_mn(&l_mas); l_slots = ma_slots(left, mt); l_pivs = ma_pivots(left, mt); if (!l_slots[split]) split++; tmp = mas_data_end(&l_mas) - split; memcpy(slots, l_slots + split + 1, sizeof(void *) * tmp); memcpy(pivs, l_pivs + split + 1, sizeof(unsigned long) * tmp); pivs[tmp] = l_mas.max; memcpy(slots + tmp, ma_slots(node, mt), sizeof(void *) * end); memcpy(pivs + tmp, ma_pivots(node, mt), sizeof(unsigned long) * end); l_mas.max = l_pivs[split]; mas->min = l_mas.max + 1; old_eparent = mt_mk_node(mte_parent(l_mas.node), mas_parent_type(&l_mas, l_mas.node)); tmp += end; if (!in_rcu) { unsigned char max_p = mt_pivots[mt]; unsigned char max_s = mt_slots[mt]; if (tmp < max_p) memset(pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp)); if (tmp < mt_slots[mt]) memset(slots + tmp, 0, sizeof(void *) * (max_s - tmp)); memcpy(node, newnode, sizeof(struct maple_node)); ma_set_meta(node, mt, 0, tmp - 1); mte_set_pivot(old_eparent, mte_parent_slot(l_mas.node), l_pivs[split]); /* Remove data from l_pivs. */ tmp = split + 1; memset(l_pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp)); memset(l_slots + tmp, 0, sizeof(void *) * (max_s - tmp)); ma_set_meta(left, mt, 0, split); eparent = old_eparent; goto done; } /* RCU requires replacing both l_mas, mas, and parent. */ mas->node = mt_mk_node(newnode, mt); ma_set_meta(newnode, mt, 0, tmp); new_left = mas_pop_node(mas); new_left->parent = left->parent; mt = mte_node_type(l_mas.node); slots = ma_slots(new_left, mt); pivs = ma_pivots(new_left, mt); memcpy(slots, l_slots, sizeof(void *) * split); memcpy(pivs, l_pivs, sizeof(unsigned long) * split); ma_set_meta(new_left, mt, 0, split); l_mas.node = mt_mk_node(new_left, mt); /* replace parent. */ offset = mte_parent_slot(mas->node); mt = mas_parent_type(&l_mas, l_mas.node); parent = mas_pop_node(mas); slots = ma_slots(parent, mt); pivs = ma_pivots(parent, mt); memcpy(parent, mte_to_node(old_eparent), sizeof(struct maple_node)); rcu_assign_pointer(slots[offset], mas->node); rcu_assign_pointer(slots[offset - 1], l_mas.node); pivs[offset - 1] = l_mas.max; eparent = mt_mk_node(parent, mt); done: gap = mas_leaf_max_gap(mas); mte_set_gap(eparent, mte_parent_slot(mas->node), gap); gap = mas_leaf_max_gap(&l_mas); mte_set_gap(eparent, mte_parent_slot(l_mas.node), gap); mas_ascend(mas); if (in_rcu) { mas_replace_node(mas, old_eparent); mas_adopt_children(mas, mas->node); } mas_update_gap(mas); } /* * mas_split_final_node() - Split the final node in a subtree operation. * @mast: the maple subtree state * @mas: The maple state * @height: The height of the tree in case it's a new root. */ static inline void mas_split_final_node(struct maple_subtree_state *mast, struct ma_state *mas, int height) { struct maple_enode *ancestor; if (mte_is_root(mas->node)) { if (mt_is_alloc(mas->tree)) mast->bn->type = maple_arange_64; else mast->bn->type = maple_range_64; mas->depth = height; } /* * Only a single node is used here, could be root. * The Big_node data should just fit in a single node. */ ancestor = mas_new_ma_node(mas, mast->bn); mas_set_parent(mas, mast->l->node, ancestor, mast->l->offset); mas_set_parent(mas, mast->r->node, ancestor, mast->r->offset); mte_to_node(ancestor)->parent = mas_mn(mas)->parent; mast->l->node = ancestor; mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, mast->l, true); mas->offset = mast->bn->b_end - 1; } /* * mast_fill_bnode() - Copy data into the big node in the subtree state * @mast: The maple subtree state * @mas: the maple state * @skip: The number of entries to skip for new nodes insertion. */ static inline void mast_fill_bnode(struct maple_subtree_state *mast, struct ma_state *mas, unsigned char skip) { bool cp = true; unsigned char split; memset(mast->bn, 0, sizeof(struct maple_big_node)); if (mte_is_root(mas->node)) { cp = false; } else { mas_ascend(mas); mas->offset = mte_parent_slot(mas->node); } if (cp && mast->l->offset) mas_mab_cp(mas, 0, mast->l->offset - 1, mast->bn, 0); split = mast->bn->b_end; mab_set_b_end(mast->bn, mast->l, mast->l->node); mast->r->offset = mast->bn->b_end; mab_set_b_end(mast->bn, mast->r, mast->r->node); if (mast->bn->pivot[mast->bn->b_end - 1] == mas->max) cp = false; if (cp) mas_mab_cp(mas, split + skip, mt_slot_count(mas->node) - 1, mast->bn, mast->bn->b_end); mast->bn->b_end--; mast->bn->type = mte_node_type(mas->node); } /* * mast_split_data() - Split the data in the subtree state big node into regular * nodes. * @mast: The maple subtree state * @mas: The maple state * @split: The location to split the big node */ static inline void mast_split_data(struct maple_subtree_state *mast, struct ma_state *mas, unsigned char split) { unsigned char p_slot; mab_mas_cp(mast->bn, 0, split, mast->l, true); mte_set_pivot(mast->r->node, 0, mast->r->max); mab_mas_cp(mast->bn, split + 1, mast->bn->b_end, mast->r, false); mast->l->offset = mte_parent_slot(mas->node); mast->l->max = mast->bn->pivot[split]; mast->r->min = mast->l->max + 1; if (mte_is_leaf(mas->node)) return; p_slot = mast->orig_l->offset; mas_set_split_parent(mast->orig_l, mast->l->node, mast->r->node, &p_slot, split); mas_set_split_parent(mast->orig_r, mast->l->node, mast->r->node, &p_slot, split); } /* * mas_push_data() - Instead of splitting a node, it is beneficial to push the * data to the right or left node if there is room. * @mas: The maple state * @height: The current height of the maple state * @mast: The maple subtree state * @left: Push left or not. * * Keeping the height of the tree low means faster lookups. * * Return: True if pushed, false otherwise. */ static inline bool mas_push_data(struct ma_state *mas, int height, struct maple_subtree_state *mast, bool left) { unsigned char slot_total = mast->bn->b_end; unsigned char end, space, split; MA_STATE(tmp_mas, mas->tree, mas->index, mas->last); tmp_mas = *mas; tmp_mas.depth = mast->l->depth; if (left && !mas_prev_sibling(&tmp_mas)) return false; else if (!left && !mas_next_sibling(&tmp_mas)) return false; end = mas_data_end(&tmp_mas); slot_total += end; space = 2 * mt_slot_count(mas->node) - 2; /* -2 instead of -1 to ensure there isn't a triple split */ if (ma_is_leaf(mast->bn->type)) space--; if (mas->max == ULONG_MAX) space--; if (slot_total >= space) return false; /* Get the data; Fill mast->bn */ mast->bn->b_end++; if (left) { mab_shift_right(mast->bn, end + 1); mas_mab_cp(&tmp_mas, 0, end, mast->bn, 0); mast->bn->b_end = slot_total + 1; } else { mas_mab_cp(&tmp_mas, 0, end, mast->bn, mast->bn->b_end); } /* Configure mast for splitting of mast->bn */ split = mt_slots[mast->bn->type] - 2; if (left) { /* Switch mas to prev node */ *mas = tmp_mas; /* Start using mast->l for the left side. */ tmp_mas.node = mast->l->node; *mast->l = tmp_mas; } else { tmp_mas.node = mast->r->node; *mast->r = tmp_mas; split = slot_total - split; } split = mab_no_null_split(mast->bn, split, mt_slots[mast->bn->type]); /* Update parent slot for split calculation. */ if (left) mast->orig_l->offset += end + 1; mast_split_data(mast, mas, split); mast_fill_bnode(mast, mas, 2); mas_split_final_node(mast, mas, height + 1); return true; } /* * mas_split() - Split data that is too big for one node into two. * @mas: The maple state * @b_node: The maple big node */ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) { struct maple_subtree_state mast; int height = 0; unsigned char mid_split, split = 0; struct maple_enode *old; /* * Splitting is handled differently from any other B-tree; the Maple * Tree splits upwards. Splitting up means that the split operation * occurs when the walk of the tree hits the leaves and not on the way * down. The reason for splitting up is that it is impossible to know * how much space will be needed until the leaf is (or leaves are) * reached. Since overwriting data is allowed and a range could * overwrite more than one range or result in changing one entry into 3 * entries, it is impossible to know if a split is required until the * data is examined. * * Splitting is a balancing act between keeping allocations to a minimum * and avoiding a 'jitter' event where a tree is expanded to make room * for an entry followed by a contraction when the entry is removed. To * accomplish the balance, there are empty slots remaining in both left * and right nodes after a split. */ MA_STATE(l_mas, mas->tree, mas->index, mas->last); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last); trace_ma_op(__func__, mas); mas->depth = mas_mt_height(mas); mast.l = &l_mas; mast.r = &r_mas; mast.orig_l = &prev_l_mas; mast.orig_r = &prev_r_mas; mast.bn = b_node; while (height++ <= mas->depth) { if (mt_slots[b_node->type] > b_node->b_end) { mas_split_final_node(&mast, mas, height); break; } l_mas = r_mas = *mas; l_mas.node = mas_new_ma_node(mas, b_node); r_mas.node = mas_new_ma_node(mas, b_node); /* * Another way that 'jitter' is avoided is to terminate a split up early if the * left or right node has space to spare. This is referred to as "pushing left" * or "pushing right" and is similar to the B* tree, except the nodes left or * right can rarely be reused due to RCU, but the ripple upwards is halted which * is a significant savings. */ /* Try to push left. */ if (mas_push_data(mas, height, &mast, true)) break; /* Try to push right. */ if (mas_push_data(mas, height, &mast, false)) break; split = mab_calc_split(mas, b_node, &mid_split, prev_l_mas.min); mast_split_data(&mast, mas, split); /* * Usually correct, mab_mas_cp in the above call overwrites * r->max. */ mast.r->max = mas->max; mast_fill_bnode(&mast, mas, 1); prev_l_mas = *mast.l; prev_r_mas = *mast.r; } /* Set the original node as dead */ old = mas->node; mas->node = l_mas.node; mas_wmb_replace(mas, old); mtree_range_walk(mas); return; } /* * mas_commit_b_node() - Commit the big node into the tree. * @wr_mas: The maple write state * @b_node: The maple big node */ static noinline_for_kasan void mas_commit_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node) { enum store_type type = wr_mas->mas->store_type; WARN_ON_ONCE(type != wr_rebalance && type != wr_split_store); if (type == wr_rebalance) return mas_rebalance(wr_mas->mas, b_node); return mas_split(wr_mas->mas, b_node); } /* * mas_root_expand() - Expand a root to a node * @mas: The maple state * @entry: The entry to store into the tree */ static inline void mas_root_expand(struct ma_state *mas, void *entry) { void *contents = mas_root_locked(mas); enum maple_type type = maple_leaf_64; struct maple_node *node; void __rcu **slots; unsigned long *pivots; int slot = 0; node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); mas->status = ma_active; if (mas->index) { if (contents) { rcu_assign_pointer(slots[slot], contents); if (likely(mas->index > 1)) slot++; } pivots[slot++] = mas->index - 1; } rcu_assign_pointer(slots[slot], entry); mas->offset = slot; pivots[slot] = mas->last; if (mas->last != ULONG_MAX) pivots[++slot] = ULONG_MAX; mas->depth = 1; mas_set_height(mas); ma_set_meta(node, maple_leaf_64, 0, slot); /* swap the new root into the tree */ rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); return; } /* * mas_store_root() - Storing value into root. * @mas: The maple state * @entry: The entry to store. * * There is no root node now and we are storing a value into the root - this * function either assigns the pointer or expands into a node. */ static inline void mas_store_root(struct ma_state *mas, void *entry) { if (!entry) { if (!mas->index) rcu_assign_pointer(mas->tree->ma_root, NULL); } else if (likely((mas->last != 0) || (mas->index != 0))) mas_root_expand(mas, entry); else if (((unsigned long) (entry) & 3) == 2) mas_root_expand(mas, entry); else { rcu_assign_pointer(mas->tree->ma_root, entry); mas->status = ma_start; } } /* * mas_is_span_wr() - Check if the write needs to be treated as a write that * spans the node. * @wr_mas: The maple write state * * Spanning writes are writes that start in one node and end in another OR if * the write of a %NULL will cause the node to end with a %NULL. * * Return: True if this is a spanning write, false otherwise. */ static bool mas_is_span_wr(struct ma_wr_state *wr_mas) { unsigned long max = wr_mas->r_max; unsigned long last = wr_mas->mas->last; enum maple_type type = wr_mas->type; void *entry = wr_mas->entry; /* Contained in this pivot, fast path */ if (last < max) return false; if (ma_is_leaf(type)) { max = wr_mas->mas->max; if (last < max) return false; } if (last == max) { /* * The last entry of leaf node cannot be NULL unless it is the * rightmost node (writing ULONG_MAX), otherwise it spans slots. */ if (entry || last == ULONG_MAX) return false; } trace_ma_write(__func__, wr_mas->mas, wr_mas->r_max, entry); return true; } static inline void mas_wr_walk_descend(struct ma_wr_state *wr_mas) { wr_mas->type = mte_node_type(wr_mas->mas->node); mas_wr_node_walk(wr_mas); wr_mas->slots = ma_slots(wr_mas->node, wr_mas->type); } static inline void mas_wr_walk_traverse(struct ma_wr_state *wr_mas) { wr_mas->mas->max = wr_mas->r_max; wr_mas->mas->min = wr_mas->r_min; wr_mas->mas->node = wr_mas->content; wr_mas->mas->offset = 0; wr_mas->mas->depth++; } /* * mas_wr_walk() - Walk the tree for a write. * @wr_mas: The maple write state * * Uses mas_slot_locked() and does not need to worry about dead nodes. * * Return: True if it's contained in a node, false on spanning write. */ static bool mas_wr_walk(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; while (true) { mas_wr_walk_descend(wr_mas); if (unlikely(mas_is_span_wr(wr_mas))) return false; wr_mas->content = mas_slot_locked(mas, wr_mas->slots, mas->offset); if (ma_is_leaf(wr_mas->type)) return true; mas_wr_walk_traverse(wr_mas); } return true; } static void mas_wr_walk_index(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; while (true) { mas_wr_walk_descend(wr_mas); wr_mas->content = mas_slot_locked(mas, wr_mas->slots, mas->offset); if (ma_is_leaf(wr_mas->type)) return; mas_wr_walk_traverse(wr_mas); } } /* * mas_extend_spanning_null() - Extend a store of a %NULL to include surrounding %NULLs. * @l_wr_mas: The left maple write state * @r_wr_mas: The right maple write state */ static inline void mas_extend_spanning_null(struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas) { struct ma_state *r_mas = r_wr_mas->mas; struct ma_state *l_mas = l_wr_mas->mas; unsigned char l_slot; l_slot = l_mas->offset; if (!l_wr_mas->content) l_mas->index = l_wr_mas->r_min; if ((l_mas->index == l_wr_mas->r_min) && (l_slot && !mas_slot_locked(l_mas, l_wr_mas->slots, l_slot - 1))) { if (l_slot > 1) l_mas->index = l_wr_mas->pivots[l_slot - 2] + 1; else l_mas->index = l_mas->min; l_mas->offset = l_slot - 1; } if (!r_wr_mas->content) { if (r_mas->last < r_wr_mas->r_max) r_mas->last = r_wr_mas->r_max; r_mas->offset++; } else if ((r_mas->last == r_wr_mas->r_max) && (r_mas->last < r_mas->max) && !mas_slot_locked(r_mas, r_wr_mas->slots, r_mas->offset + 1)) { r_mas->last = mas_safe_pivot(r_mas, r_wr_mas->pivots, r_wr_mas->type, r_mas->offset + 1); r_mas->offset++; } } static inline void *mas_state_walk(struct ma_state *mas) { void *entry; entry = mas_start(mas); if (mas_is_none(mas)) return NULL; if (mas_is_ptr(mas)) return entry; return mtree_range_walk(mas); } /* * mtree_lookup_walk() - Internal quick lookup that does not keep maple state up * to date. * * @mas: The maple state. * * Note: Leaves mas in undesirable state. * Return: The entry for @mas->index or %NULL on dead node. */ static inline void *mtree_lookup_walk(struct ma_state *mas) { unsigned long *pivots; unsigned char offset; struct maple_node *node; struct maple_enode *next; enum maple_type type; void __rcu **slots; unsigned char end; next = mas->node; do { node = mte_to_node(next); type = mte_node_type(next); pivots = ma_pivots(node, type); end = mt_pivots[type]; offset = 0; do { if (pivots[offset] >= mas->index) break; } while (++offset < end); slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) goto dead_node; } while (!ma_is_leaf(type)); return (void *)next; dead_node: mas_reset(mas); return NULL; } static void mte_destroy_walk(struct maple_enode *, struct maple_tree *); /* * mas_new_root() - Create a new root node that only contains the entry passed * in. * @mas: The maple state * @entry: The entry to store. * * Only valid when the index == 0 and the last == ULONG_MAX */ static inline void mas_new_root(struct ma_state *mas, void *entry) { struct maple_enode *root = mas_root_locked(mas); enum maple_type type = maple_leaf_64; struct maple_node *node; void __rcu **slots; unsigned long *pivots; WARN_ON_ONCE(mas->index || mas->last != ULONG_MAX); if (!entry) { mas->depth = 0; mas_set_height(mas); rcu_assign_pointer(mas->tree->ma_root, entry); mas->status = ma_start; goto done; } node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); mas->status = ma_active; rcu_assign_pointer(slots[0], entry); pivots[0] = mas->last; mas->depth = 1; mas_set_height(mas); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); done: if (xa_is_node(root)) mte_destroy_walk(root, mas->tree); return; } /* * mas_wr_spanning_store() - Create a subtree with the store operation completed * and new nodes where necessary, then place the sub-tree in the actual tree. * Note that mas is expected to point to the node which caused the store to * span. * @wr_mas: The maple write state */ static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas) { struct maple_subtree_state mast; struct maple_big_node b_node; struct ma_state *mas; unsigned char height; /* Left and Right side of spanning store */ MA_STATE(l_mas, NULL, 0, 0); MA_STATE(r_mas, NULL, 0, 0); MA_WR_STATE(r_wr_mas, &r_mas, wr_mas->entry); MA_WR_STATE(l_wr_mas, &l_mas, wr_mas->entry); /* * A store operation that spans multiple nodes is called a spanning * store and is handled early in the store call stack by the function * mas_is_span_wr(). When a spanning store is identified, the maple * state is duplicated. The first maple state walks the left tree path * to ``index``, the duplicate walks the right tree path to ``last``. * The data in the two nodes are combined into a single node, two nodes, * or possibly three nodes (see the 3-way split above). A ``NULL`` * written to the last entry of a node is considered a spanning store as * a rebalance is required for the operation to complete and an overflow * of data may happen. */ mas = wr_mas->mas; trace_ma_op(__func__, mas); if (unlikely(!mas->index && mas->last == ULONG_MAX)) return mas_new_root(mas, wr_mas->entry); /* * Node rebalancing may occur due to this store, so there may be three new * entries per level plus a new root. */ height = mas_mt_height(mas); /* * Set up right side. Need to get to the next offset after the spanning * store to ensure it's not NULL and to combine both the next node and * the node with the start together. */ r_mas = *mas; /* Avoid overflow, walk to next slot in the tree. */ if (r_mas.last + 1) r_mas.last++; r_mas.index = r_mas.last; mas_wr_walk_index(&r_wr_mas); r_mas.last = r_mas.index = mas->last; /* Set up left side. */ l_mas = *mas; mas_wr_walk_index(&l_wr_mas); if (!wr_mas->entry) { mas_extend_spanning_null(&l_wr_mas, &r_wr_mas); mas->offset = l_mas.offset; mas->index = l_mas.index; mas->last = l_mas.last = r_mas.last; } /* expanding NULLs may make this cover the entire range */ if (!l_mas.index && r_mas.last == ULONG_MAX) { mas_set_range(mas, 0, ULONG_MAX); return mas_new_root(mas, wr_mas->entry); } memset(&b_node, 0, sizeof(struct maple_big_node)); /* Copy l_mas and store the value in b_node. */ mas_store_b_node(&l_wr_mas, &b_node, l_mas.end); /* Copy r_mas into b_node if there is anything to copy. */ if (r_mas.max > r_mas.last) mas_mab_cp(&r_mas, r_mas.offset, r_mas.end, &b_node, b_node.b_end + 1); else b_node.b_end++; /* Stop spanning searches by searching for just index. */ l_mas.index = l_mas.last = mas->index; mast.bn = &b_node; mast.orig_l = &l_mas; mast.orig_r = &r_mas; /* Combine l_mas and r_mas and split them up evenly again. */ return mas_spanning_rebalance(mas, &mast, height + 1); } /* * mas_wr_node_store() - Attempt to store the value in a node * @wr_mas: The maple write state * * Attempts to reuse the node, but may allocate. */ static inline void mas_wr_node_store(struct ma_wr_state *wr_mas, unsigned char new_end) { struct ma_state *mas = wr_mas->mas; void __rcu **dst_slots; unsigned long *dst_pivots; unsigned char dst_offset, offset_end = wr_mas->offset_end; struct maple_node reuse, *newnode; unsigned char copy_size, node_pivots = mt_pivots[wr_mas->type]; bool in_rcu = mt_in_rcu(mas->tree); if (mas->last == wr_mas->end_piv) offset_end++; /* don't copy this offset */ else if (unlikely(wr_mas->r_max == ULONG_MAX)) mas_bulk_rebalance(mas, mas->end, wr_mas->type); /* set up node. */ if (in_rcu) { newnode = mas_pop_node(mas); } else { memset(&reuse, 0, sizeof(struct maple_node)); newnode = &reuse; } newnode->parent = mas_mn(mas)->parent; dst_pivots = ma_pivots(newnode, wr_mas->type); dst_slots = ma_slots(newnode, wr_mas->type); /* Copy from start to insert point */ memcpy(dst_pivots, wr_mas->pivots, sizeof(unsigned long) * mas->offset); memcpy(dst_slots, wr_mas->slots, sizeof(void *) * mas->offset); /* Handle insert of new range starting after old range */ if (wr_mas->r_min < mas->index) { rcu_assign_pointer(dst_slots[mas->offset], wr_mas->content); dst_pivots[mas->offset++] = mas->index - 1; } /* Store the new entry and range end. */ if (mas->offset < node_pivots) dst_pivots[mas->offset] = mas->last; rcu_assign_pointer(dst_slots[mas->offset], wr_mas->entry); /* * this range wrote to the end of the node or it overwrote the rest of * the data */ if (offset_end > mas->end) goto done; dst_offset = mas->offset + 1; /* Copy to the end of node if necessary. */ copy_size = mas->end - offset_end + 1; memcpy(dst_slots + dst_offset, wr_mas->slots + offset_end, sizeof(void *) * copy_size); memcpy(dst_pivots + dst_offset, wr_mas->pivots + offset_end, sizeof(unsigned long) * (copy_size - 1)); if (new_end < node_pivots) dst_pivots[new_end] = mas->max; done: mas_leaf_set_meta(newnode, maple_leaf_64, new_end); if (in_rcu) { struct maple_enode *old_enode = mas->node; mas->node = mt_mk_node(newnode, wr_mas->type); mas_replace_node(mas, old_enode); } else { memcpy(wr_mas->node, newnode, sizeof(struct maple_node)); } trace_ma_write(__func__, mas, 0, wr_mas->entry); mas_update_gap(mas); mas->end = new_end; return; } /* * mas_wr_slot_store: Attempt to store a value in a slot. * @wr_mas: the maple write state */ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char offset = mas->offset; void __rcu **slots = wr_mas->slots; bool gap = false; gap |= !mt_slot_locked(mas->tree, slots, offset); gap |= !mt_slot_locked(mas->tree, slots, offset + 1); if (wr_mas->offset_end - offset == 1) { if (mas->index == wr_mas->r_min) { /* Overwriting the range and a part of the next one */ rcu_assign_pointer(slots[offset], wr_mas->entry); wr_mas->pivots[offset] = mas->last; } else { /* Overwriting a part of the range and the next one */ rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; mas->offset++; /* Keep mas accurate. */ } } else { WARN_ON_ONCE(mt_in_rcu(mas->tree)); /* * Expand the range, only partially overwriting the previous and * next ranges */ gap |= !mt_slot_locked(mas->tree, slots, offset + 2); rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; wr_mas->pivots[offset + 1] = mas->last; mas->offset++; /* Keep mas accurate. */ } trace_ma_write(__func__, mas, 0, wr_mas->entry); /* * Only update gap when the new entry is empty or there is an empty * entry in the original two ranges. */ if (!wr_mas->entry || gap) mas_update_gap(mas); return; } static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; if (!wr_mas->slots[wr_mas->offset_end]) { /* If this one is null, the next and prev are not */ mas->last = wr_mas->end_piv; } else { /* Check next slot(s) if we are overwriting the end */ if ((mas->last == wr_mas->end_piv) && (mas->end != wr_mas->offset_end) && !wr_mas->slots[wr_mas->offset_end + 1]) { wr_mas->offset_end++; if (wr_mas->offset_end == mas->end) mas->last = mas->max; else mas->last = wr_mas->pivots[wr_mas->offset_end]; wr_mas->end_piv = mas->last; } } if (!wr_mas->content) { /* If this one is null, the next and prev are not */ mas->index = wr_mas->r_min; } else { /* Check prev slot if we are overwriting the start */ if (mas->index == wr_mas->r_min && mas->offset && !wr_mas->slots[mas->offset - 1]) { mas->offset--; wr_mas->r_min = mas->index = mas_safe_min(mas, wr_mas->pivots, mas->offset); wr_mas->r_max = wr_mas->pivots[mas->offset]; } } } static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) { while ((wr_mas->offset_end < wr_mas->mas->end) && (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end])) wr_mas->offset_end++; if (wr_mas->offset_end < wr_mas->mas->end) wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; else wr_mas->end_piv = wr_mas->mas->max; } static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end = mas->end + 2; new_end -= wr_mas->offset_end - mas->offset; if (wr_mas->r_min == mas->index) new_end--; if (wr_mas->end_piv == mas->last) new_end--; return new_end; } /* * mas_wr_append: Attempt to append * @wr_mas: the maple write state * @new_end: The end of the node after the modification * * This is currently unsafe in rcu mode since the end of the node may be cached * by readers while the node contents may be updated which could result in * inaccurate information. */ static inline void mas_wr_append(struct ma_wr_state *wr_mas, unsigned char new_end) { struct ma_state *mas = wr_mas->mas; void __rcu **slots; unsigned char end = mas->end; if (new_end < mt_pivots[wr_mas->type]) { wr_mas->pivots[new_end] = wr_mas->pivots[end]; ma_set_meta(wr_mas->node, wr_mas->type, 0, new_end); } slots = wr_mas->slots; if (new_end == end + 1) { if (mas->last == wr_mas->r_max) { /* Append to end of range */ rcu_assign_pointer(slots[new_end], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = new_end; } else { /* Append to start of range */ rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end] = mas->last; rcu_assign_pointer(slots[end], wr_mas->entry); } } else { /* Append to the range without touching any boundaries. */ rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end + 1] = mas->last; rcu_assign_pointer(slots[end + 1], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = end + 1; } if (!wr_mas->content || !wr_mas->entry) mas_update_gap(mas); mas->end = new_end; trace_ma_write(__func__, mas, new_end, wr_mas->entry); return; } /* * mas_wr_bnode() - Slow path for a modification. * @wr_mas: The write maple state * * This is where split, rebalance end up. */ static void mas_wr_bnode(struct ma_wr_state *wr_mas) { struct maple_big_node b_node; trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry); memset(&b_node, 0, sizeof(struct maple_big_node)); mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); mas_commit_b_node(wr_mas, &b_node); } /* * mas_wr_store_entry() - Internal call to store a value * @wr_mas: The maple write state */ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end = mas_wr_new_end(wr_mas); switch (mas->store_type) { case wr_invalid: MT_BUG_ON(mas->tree, 1); return; case wr_new_root: mas_new_root(mas, wr_mas->entry); break; case wr_store_root: mas_store_root(mas, wr_mas->entry); break; case wr_exact_fit: rcu_assign_pointer(wr_mas->slots[mas->offset], wr_mas->entry); if (!!wr_mas->entry ^ !!wr_mas->content) mas_update_gap(mas); break; case wr_append: mas_wr_append(wr_mas, new_end); break; case wr_slot_store: mas_wr_slot_store(wr_mas); break; case wr_node_store: mas_wr_node_store(wr_mas, new_end); break; case wr_spanning_store: mas_wr_spanning_store(wr_mas); break; case wr_split_store: case wr_rebalance: mas_wr_bnode(wr_mas); break; } return; } static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; if (!mas_is_active(mas)) { if (mas_is_start(mas)) goto set_content; if (unlikely(mas_is_paused(mas))) goto reset; if (unlikely(mas_is_none(mas))) goto reset; if (unlikely(mas_is_overflow(mas))) goto reset; if (unlikely(mas_is_underflow(mas))) goto reset; } /* * A less strict version of mas_is_span_wr() where we allow spanning * writes within this node. This is to stop partial walks in * mas_prealloc() from being reset. */ if (mas->last > mas->max) goto reset; if (wr_mas->entry) goto set_content; if (mte_is_leaf(mas->node) && mas->last == mas->max) goto reset; goto set_content; reset: mas_reset(mas); set_content: wr_mas->content = mas_start(mas); } /** * mas_prealloc_calc() - Calculate number of nodes needed for a * given store oepration * @mas: The maple state * @entry: The entry to store into the tree * * Return: Number of nodes required for preallocation. */ static inline int mas_prealloc_calc(struct ma_state *mas, void *entry) { int ret = mas_mt_height(mas) * 3 + 1; switch (mas->store_type) { case wr_invalid: WARN_ON_ONCE(1); break; case wr_new_root: ret = 1; break; case wr_store_root: if (likely((mas->last != 0) || (mas->index != 0))) ret = 1; else if (((unsigned long) (entry) & 3) == 2) ret = 1; else ret = 0; break; case wr_spanning_store: ret = mas_mt_height(mas) * 3 + 1; break; case wr_split_store: ret = mas_mt_height(mas) * 2 + 1; break; case wr_rebalance: ret = mas_mt_height(mas) * 2 - 1; break; case wr_node_store: ret = mt_in_rcu(mas->tree) ? 1 : 0; break; case wr_append: case wr_exact_fit: case wr_slot_store: ret = 0; } return ret; } /* * mas_wr_store_type() - Determine the store type for a given * store operation. * @wr_mas: The maple write state * * Return: the type of store needed for the operation */ static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end; if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) return wr_store_root; if (unlikely(!mas_wr_walk(wr_mas))) return wr_spanning_store; /* At this point, we are at the leaf node that needs to be altered. */ mas_wr_end_piv(wr_mas); if (!wr_mas->entry) mas_wr_extend_null(wr_mas); if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) return wr_exact_fit; if (unlikely(!mas->index && mas->last == ULONG_MAX)) return wr_new_root; new_end = mas_wr_new_end(wr_mas); /* Potential spanning rebalance collapsing a node */ if (new_end < mt_min_slots[wr_mas->type]) { if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK)) return wr_rebalance; return wr_node_store; } if (new_end >= mt_slots[wr_mas->type]) return wr_split_store; if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end)) return wr_append; if ((new_end == mas->end) && (!mt_in_rcu(mas->tree) || (wr_mas->offset_end - mas->offset == 1))) return wr_slot_store; return wr_node_store; } /** * mas_wr_preallocate() - Preallocate enough nodes for a store operation * @wr_mas: The maple write state * @entry: The entry that will be stored * */ static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) { struct ma_state *mas = wr_mas->mas; int request; mas_wr_prealloc_setup(wr_mas); mas->store_type = mas_wr_store_type(wr_mas); request = mas_prealloc_calc(mas, entry); if (!request) return; mas_node_count(mas, request); } /** * mas_insert() - Internal call to insert a value * @mas: The maple state * @entry: The entry to store * * Return: %NULL or the contents that already exists at the requested index * otherwise. The maple state needs to be checked for error conditions. */ static inline void *mas_insert(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); /* * Inserting a new range inserts either 0, 1, or 2 pivots within the * tree. If the insert fits exactly into an existing gap with a value * of NULL, then the slot only needs to be written with the new value. * If the range being inserted is adjacent to another range, then only a * single pivot needs to be inserted (as well as writing the entry). If * the new range is within a gap but does not touch any other ranges, * then two pivots need to be inserted: the start - 1, and the end. As * usual, the entry must be written. Most operations require a new node * to be allocated and replace an existing node to ensure RCU safety, * when in RCU mode. The exception to requiring a newly allocated node * is when inserting at the end of a node (appending). When done * carefully, appending can reuse the node in place. */ wr_mas.content = mas_start(mas); if (wr_mas.content) goto exists; mas_wr_preallocate(&wr_mas, entry); if (mas_is_err(mas)) return NULL; /* spanning writes always overwrite something */ if (mas->store_type == wr_spanning_store) goto exists; /* At this point, we are at the leaf node that needs to be altered. */ if (mas->store_type != wr_new_root && mas->store_type != wr_store_root) { wr_mas.offset_end = mas->offset; wr_mas.end_piv = wr_mas.r_max; if (wr_mas.content || (mas->last > wr_mas.r_max)) goto exists; } mas_wr_store_entry(&wr_mas); return wr_mas.content; exists: mas_set_err(mas, -EEXIST); return wr_mas.content; } /** * mas_alloc_cyclic() - Internal call to find somewhere to store an entry * @mas: The maple state. * @startp: Pointer to ID. * @range_lo: Lower bound of range to search. * @range_hi: Upper bound of range to search. * @entry: The entry to store. * @next: Pointer to next ID to allocate. * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 if the allocation succeeded without wrapping, 1 if the * allocation succeeded after wrapping, or -EBUSY if there are no * free entries. */ int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp) { unsigned long min = range_lo; int ret = 0; range_lo = max(min, *next); ret = mas_empty_area(mas, range_lo, range_hi, 1); if ((mas->tree->ma_flags & MT_FLAGS_ALLOC_WRAPPED) && ret == 0) { mas->tree->ma_flags &= ~MT_FLAGS_ALLOC_WRAPPED; ret = 1; } if (ret < 0 && range_lo > min) { ret = mas_empty_area(mas, min, range_hi, 1); if (ret == 0) ret = 1; } if (ret < 0) return ret; do { mas_insert(mas, entry); } while (mas_nomem(mas, gfp)); if (mas_is_err(mas)) return xa_err(mas->node); *startp = mas->index; *next = *startp + 1; if (*next == 0) mas->tree->ma_flags |= MT_FLAGS_ALLOC_WRAPPED; mas_destroy(mas); return ret; } EXPORT_SYMBOL(mas_alloc_cyclic); static __always_inline void mas_rewalk(struct ma_state *mas, unsigned long index) { retry: mas_set(mas, index); mas_state_walk(mas); if (mas_is_start(mas)) goto retry; } static __always_inline bool mas_rewalk_if_dead(struct ma_state *mas, struct maple_node *node, const unsigned long index) { if (unlikely(ma_dead_node(node))) { mas_rewalk(mas, index); return true; } return false; } /* * mas_prev_node() - Find the prev non-null entry at the same level in the * tree. The prev value will be mas->node[mas->offset] or the status will be * ma_none. * @mas: The maple state * @min: The lower limit to search * * The prev node value will be mas->node[mas->offset] or the status will be * ma_none. * Return: 1 if the node is dead, 0 otherwise. */ static int mas_prev_node(struct ma_state *mas, unsigned long min) { enum maple_type mt; int offset, level; void __rcu **slots; struct maple_node *node; unsigned long *pivots; unsigned long max; node = mas_mn(mas); if (!mas->min) goto no_entry; max = mas->min - 1; if (max < min) goto no_entry; level = 0; do { if (ma_is_root(node)) goto no_entry; /* Walk up. */ if (unlikely(mas_ascend(mas))) return 1; offset = mas->offset; level++; node = mas_mn(mas); } while (!offset); offset--; mt = mte_node_type(mas->node); while (level > 1) { level--; slots = ma_slots(node, mt); mas->node = mas_slot(mas, slots, offset); if (unlikely(ma_dead_node(node))) return 1; mt = mte_node_type(mas->node); node = mas_mn(mas); pivots = ma_pivots(node, mt); offset = ma_data_end(node, mt, pivots, max); if (unlikely(ma_dead_node(node))) return 1; } slots = ma_slots(node, mt); mas->node = mas_slot(mas, slots, offset); pivots = ma_pivots(node, mt); if (unlikely(ma_dead_node(node))) return 1; if (likely(offset)) mas->min = pivots[offset - 1] + 1; mas->max = max; mas->offset = mas_data_end(mas); if (unlikely(mte_dead_node(mas->node))) return 1; mas->end = mas->offset; return 0; no_entry: if (unlikely(ma_dead_node(node))) return 1; mas->status = ma_underflow; return 0; } /* * mas_prev_slot() - Get the entry in the previous slot * * @mas: The maple state * @min: The minimum starting range * @empty: Can be empty * * Return: The entry in the previous slot which is possibly NULL */ static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty) { void *entry; void __rcu **slots; unsigned long pivot; enum maple_type type; unsigned long *pivots; struct maple_node *node; unsigned long save_point = mas->index; retry: node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (mas->min <= min) { pivot = mas_safe_min(mas, pivots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (pivot <= min) goto underflow; } again: if (likely(mas->offset)) { mas->offset--; mas->last = mas->index - 1; mas->index = mas_safe_min(mas, pivots, mas->offset); } else { if (mas->index <= min) goto underflow; if (mas_prev_node(mas, min)) { mas_rewalk(mas, save_point); goto retry; } if (WARN_ON_ONCE(mas_is_underflow(mas))) return NULL; mas->last = mas->max; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); mas->index = pivots[mas->offset - 1] + 1; } slots = ma_slots(node, type); entry = mas_slot(mas, slots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (likely(entry)) return entry; if (!empty) { if (mas->index <= min) { mas->status = ma_underflow; return NULL; } goto again; } return entry; underflow: mas->status = ma_underflow; return NULL; } /* * mas_next_node() - Get the next node at the same level in the tree. * @mas: The maple state * @node: The maple node * @max: The maximum pivot value to check. * * The next value will be mas->node[mas->offset] or the status will have * overflowed. * Return: 1 on dead node, 0 otherwise. */ static int mas_next_node(struct ma_state *mas, struct maple_node *node, unsigned long max) { unsigned long min; unsigned long *pivots; struct maple_enode *enode; struct maple_node *tmp; int level = 0; unsigned char node_end; enum maple_type mt; void __rcu **slots; if (mas->max >= max) goto overflow; min = mas->max + 1; level = 0; do { if (ma_is_root(node)) goto overflow; /* Walk up. */ if (unlikely(mas_ascend(mas))) return 1; level++; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); node_end = ma_data_end(node, mt, pivots, mas->max); if (unlikely(ma_dead_node(node))) return 1; } while (unlikely(mas->offset == node_end)); slots = ma_slots(node, mt); mas->offset++; enode = mas_slot(mas, slots, mas->offset); if (unlikely(ma_dead_node(node))) return 1; if (level > 1) mas->offset = 0; while (unlikely(level > 1)) { level--; mas->node = enode; node = mas_mn(mas); mt = mte_node_type(mas->node); slots = ma_slots(node, mt); enode = mas_slot(mas, slots, 0); if (unlikely(ma_dead_node(node))) return 1; } if (!mas->offset) pivots = ma_pivots(node, mt); mas->max = mas_safe_pivot(mas, pivots, mas->offset, mt); tmp = mte_to_node(enode); mt = mte_node_type(enode); pivots = ma_pivots(tmp, mt); mas->end = ma_data_end(tmp, mt, pivots, mas->max); if (unlikely(ma_dead_node(node))) return 1; mas->node = enode; mas->min = min; return 0; overflow: if (unlikely(ma_dead_node(node))) return 1; mas->status = ma_overflow; return 0; } /* * mas_next_slot() - Get the entry in the next slot * * @mas: The maple state * @max: The maximum starting range * @empty: Can be empty * * Return: The entry in the next slot which is possibly NULL */ static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty) { void __rcu **slots; unsigned long *pivots; unsigned long pivot; enum maple_type type; struct maple_node *node; unsigned long save_point = mas->last; void *entry; retry: node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (mas->max >= max) { if (likely(mas->offset < mas->end)) pivot = pivots[mas->offset]; else pivot = mas->max; if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (pivot >= max) { /* Was at the limit, next will extend beyond */ mas->status = ma_overflow; return NULL; } } if (likely(mas->offset < mas->end)) { mas->index = pivots[mas->offset] + 1; again: mas->offset++; if (likely(mas->offset < mas->end)) mas->last = pivots[mas->offset]; else mas->last = mas->max; } else { if (mas->last >= max) { mas->status = ma_overflow; return NULL; } if (mas_next_node(mas, node, max)) { mas_rewalk(mas, save_point); goto retry; } if (WARN_ON_ONCE(mas_is_overflow(mas))) return NULL; mas->offset = 0; mas->index = mas->min; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); mas->last = pivots[0]; } slots = ma_slots(node, type); entry = mt_slot(mas->tree, slots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (entry) return entry; if (!empty) { if (mas->last >= max) { mas->status = ma_overflow; return NULL; } mas->index = mas->last + 1; goto again; } return entry; } /* * mas_next_entry() - Internal function to get the next entry. * @mas: The maple state * @limit: The maximum range start. * * Set the @mas->node to the next entry and the range_start to * the beginning value for the entry. Does not check beyond @limit. * Sets @mas->index and @mas->last to the range, Does not update @mas->index and * @mas->last on overflow. * Restarts on dead nodes. * * Return: the next entry or %NULL. */ static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit) { if (mas->last >= limit) { mas->status = ma_overflow; return NULL; } return mas_next_slot(mas, limit, false); } /* * mas_rev_awalk() - Internal function. Reverse allocation walk. Find the * highest gap address of a given size in a given node and descend. * @mas: The maple state * @size: The needed size. * * Return: True if found in a leaf, false otherwise. * */ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size, unsigned long *gap_min, unsigned long *gap_max) { enum maple_type type = mte_node_type(mas->node); struct maple_node *node = mas_mn(mas); unsigned long *pivots, *gaps; void __rcu **slots; unsigned long gap = 0; unsigned long max, min; unsigned char offset; if (unlikely(mas_is_err(mas))) return true; if (ma_is_dense(type)) { /* dense nodes. */ mas->offset = (unsigned char)(mas->index - mas->min); return true; } pivots = ma_pivots(node, type); slots = ma_slots(node, type); gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); /* Skip out of bounds. */ while (mas->last < min) min = mas_safe_min(mas, pivots, --offset); max = mas_safe_pivot(mas, pivots, offset, type); while (mas->index <= max) { gap = 0; if (gaps) gap = gaps[offset]; else if (!mas_slot(mas, slots, offset)) gap = max - min + 1; if (gap) { if ((size <= gap) && (size <= mas->last - min + 1)) break; if (!gaps) { /* Skip the next slot, it cannot be a gap. */ if (offset < 2) goto ascend; offset -= 2; max = pivots[offset]; min = mas_safe_min(mas, pivots, offset); continue; } } if (!offset) goto ascend; offset--; max = min - 1; min = mas_safe_min(mas, pivots, offset); } if (unlikely((mas->index > max) || (size - 1 > max - mas->index))) goto no_space; if (unlikely(ma_is_leaf(type))) { mas->offset = offset; *gap_min = min; *gap_max = min + gap - 1; return true; } /* descend, only happens under lock. */ mas->node = mas_slot(mas, slots, offset); mas->min = min; mas->max = max; mas->offset = mas_data_end(mas); return false; ascend: if (!mte_is_root(mas->node)) return false; no_space: mas_set_err(mas, -EBUSY); return false; } static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) { enum maple_type type = mte_node_type(mas->node); unsigned long pivot, min, gap = 0; unsigned char offset, data_end; unsigned long *gaps, *pivots; void __rcu **slots; struct maple_node *node; bool found = false; if (ma_is_dense(type)) { mas->offset = (unsigned char)(mas->index - mas->min); return true; } node = mas_mn(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); data_end = ma_data_end(node, type, pivots, mas->max); for (; offset <= data_end; offset++) { pivot = mas_safe_pivot(mas, pivots, offset, type); /* Not within lower bounds */ if (mas->index > pivot) goto next_slot; if (gaps) gap = gaps[offset]; else if (!mas_slot(mas, slots, offset)) gap = min(pivot, mas->last) - max(mas->index, min) + 1; else goto next_slot; if (gap >= size) { if (ma_is_leaf(type)) { found = true; goto done; } if (mas->index <= pivot) { mas->node = mas_slot(mas, slots, offset); mas->min = min; mas->max = pivot; offset = 0; break; } } next_slot: min = pivot + 1; if (mas->last <= pivot) { mas_set_err(mas, -EBUSY); return true; } } if (mte_is_root(mas->node)) found = true; done: mas->offset = offset; return found; } /** * mas_walk() - Search for @mas->index in the tree. * @mas: The maple state. * * mas->index and mas->last will be set to the range if there is a value. If * mas->status is ma_none, reset to ma_start * * Return: the entry at the location or %NULL. */ void *mas_walk(struct ma_state *mas) { void *entry; if (!mas_is_active(mas) || !mas_is_start(mas)) mas->status = ma_start; retry: entry = mas_state_walk(mas); if (mas_is_start(mas)) { goto retry; } else if (mas_is_none(mas)) { mas->index = 0; mas->last = ULONG_MAX; } else if (mas_is_ptr(mas)) { if (!mas->index) { mas->last = 0; return entry; } mas->index = 1; mas->last = ULONG_MAX; mas->status = ma_none; return NULL; } return entry; } EXPORT_SYMBOL_GPL(mas_walk); static inline bool mas_rewind_node(struct ma_state *mas) { unsigned char slot; do { if (mte_is_root(mas->node)) { slot = mas->offset; if (!slot) return false; } else { mas_ascend(mas); slot = mas->offset; } } while (!slot); mas->offset = --slot; return true; } /* * mas_skip_node() - Internal function. Skip over a node. * @mas: The maple state. * * Return: true if there is another node, false otherwise. */ static inline bool mas_skip_node(struct ma_state *mas) { if (mas_is_err(mas)) return false; do { if (mte_is_root(mas->node)) { if (mas->offset >= mas_data_end(mas)) { mas_set_err(mas, -EBUSY); return false; } } else { mas_ascend(mas); } } while (mas->offset >= mas_data_end(mas)); mas->offset++; return true; } /* * mas_awalk() - Allocation walk. Search from low address to high, for a gap of * @size * @mas: The maple state * @size: The size of the gap required * * Search between @mas->index and @mas->last for a gap of @size. */ static inline void mas_awalk(struct ma_state *mas, unsigned long size) { struct maple_enode *last = NULL; /* * There are 4 options: * go to child (descend) * go back to parent (ascend) * no gap found. (return, slot == MAPLE_NODE_SLOTS) * found the gap. (return, slot != MAPLE_NODE_SLOTS) */ while (!mas_is_err(mas) && !mas_anode_descend(mas, size)) { if (last == mas->node) mas_skip_node(mas); else last = mas->node; } } /* * mas_sparse_area() - Internal function. Return upper or lower limit when * searching for a gap in an empty tree. * @mas: The maple state * @min: the minimum range * @max: The maximum range * @size: The size of the gap * @fwd: Searching forward or back */ static inline int mas_sparse_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size, bool fwd) { if (!unlikely(mas_is_none(mas)) && min == 0) { min++; /* * At this time, min is increased, we need to recheck whether * the size is satisfied. */ if (min > max || max - min + 1 < size) return -EBUSY; } /* mas_is_ptr */ if (fwd) { mas->index = min; mas->last = min + size - 1; } else { mas->last = max; mas->index = max - size + 1; } return 0; } /* * mas_empty_area() - Get the lowest address within the range that is * sufficient for the size requested. * @mas: The maple state * @min: The lowest value of the range * @max: The highest value of the range * @size: The size needed */ int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size) { unsigned char offset; unsigned long *pivots; enum maple_type mt; struct maple_node *node; if (min > max) return -EINVAL; if (size == 0 || max - min < size - 1) return -EINVAL; if (mas_is_start(mas)) mas_start(mas); else if (mas->offset >= 2) mas->offset -= 2; else if (!mas_skip_node(mas)) return -EBUSY; /* Empty set */ if (mas_is_none(mas) || mas_is_ptr(mas)) return mas_sparse_area(mas, min, max, size, true); /* The start of the window can only be within these values */ mas->index = min; mas->last = max; mas_awalk(mas, size); if (unlikely(mas_is_err(mas))) return xa_err(mas->node); offset = mas->offset; if (unlikely(offset == MAPLE_NODE_SLOTS)) return -EBUSY; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); min = mas_safe_min(mas, pivots, offset); if (mas->index < min) mas->index = min; mas->last = mas->index + size - 1; mas->end = ma_data_end(node, mt, pivots, mas->max); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area); /* * mas_empty_area_rev() - Get the highest address within the range that is * sufficient for the size requested. * @mas: The maple state * @min: The lowest value of the range * @max: The highest value of the range * @size: The size needed */ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size) { struct maple_enode *last = mas->node; if (min > max) return -EINVAL; if (size == 0 || max - min < size - 1) return -EINVAL; if (mas_is_start(mas)) mas_start(mas); else if ((mas->offset < 2) && (!mas_rewind_node(mas))) return -EBUSY; if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) return mas_sparse_area(mas, min, max, size, false); else if (mas->offset >= 2) mas->offset -= 2; else mas->offset = mas_data_end(mas); /* The start of the window can only be within these values. */ mas->index = min; mas->last = max; while (!mas_rev_awalk(mas, size, &min, &max)) { if (last == mas->node) { if (!mas_rewind_node(mas)) return -EBUSY; } else { last = mas->node; } } if (mas_is_err(mas)) return xa_err(mas->node); if (unlikely(mas->offset == MAPLE_NODE_SLOTS)) return -EBUSY; /* Trim the upper limit to the max. */ if (max < mas->last) mas->last = max; mas->index = mas->last - size + 1; mas->end = mas_data_end(mas); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area_rev); /* * mte_dead_leaves() - Mark all leaves of a node as dead. * @enode: the encoded node * @mt: the maple tree * @slots: Pointer to the slot array * * Must hold the write lock. * * Return: The number of leaves marked as dead. */ static inline unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt, void __rcu **slots) { struct maple_node *node; enum maple_type type; void *entry; int offset; for (offset = 0; offset < mt_slot_count(enode); offset++) { entry = mt_slot(mt, slots, offset); type = mte_node_type(entry); node = mte_to_node(entry); /* Use both node and type to catch LE & BE metadata */ if (!node || !type) break; mte_set_node_dead(entry); node->type = type; rcu_assign_pointer(slots[offset], node); } return offset; } /** * mte_dead_walk() - Walk down a dead tree to just before the leaves * @enode: The maple encoded node * @offset: The starting offset * * Note: This can only be used from the RCU callback context. */ static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset) { struct maple_node *node, *next; void __rcu **slots = NULL; next = mte_to_node(*enode); do { *enode = ma_enode_ptr(next); node = mte_to_node(*enode); slots = ma_slots(node, node->type); next = rcu_dereference_protected(slots[offset], lock_is_held(&rcu_callback_map)); offset = 0; } while (!ma_is_leaf(next->type)); return slots; } /** * mt_free_walk() - Walk & free a tree in the RCU callback context * @head: The RCU head that's within the node. * * Note: This can only be used from the RCU callback context. */ static void mt_free_walk(struct rcu_head *head) { void __rcu **slots; struct maple_node *node, *start; struct maple_enode *enode; unsigned char offset; enum maple_type type; node = container_of(head, struct maple_node, rcu); if (ma_is_leaf(node->type)) goto free_leaf; start = node; enode = mt_mk_node(node, node->type); slots = mte_dead_walk(&enode, 0); node = mte_to_node(enode); do { mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; enode = node->piv_parent; if (mte_to_node(enode) == node) goto free_leaf; type = mte_node_type(enode); slots = ma_slots(mte_to_node(enode), type); if ((offset < mt_slots[type]) && rcu_dereference_protected(slots[offset], lock_is_held(&rcu_callback_map))) slots = mte_dead_walk(&enode, offset); node = mte_to_node(enode); } while ((node != start) || (node->slot_len < offset)); slots = ma_slots(node, node->type); mt_free_bulk(node->slot_len, slots); free_leaf: mt_free_rcu(&node->rcu); } static inline void __rcu **mte_destroy_descend(struct maple_enode **enode, struct maple_tree *mt, struct maple_enode *prev, unsigned char offset) { struct maple_node *node; struct maple_enode *next = *enode; void __rcu **slots = NULL; enum maple_type type; unsigned char next_offset = 0; do { *enode = next; node = mte_to_node(*enode); type = mte_node_type(*enode); slots = ma_slots(node, type); next = mt_slot_locked(mt, slots, next_offset); if ((mte_dead_node(next))) next = mt_slot_locked(mt, slots, ++next_offset); mte_set_node_dead(*enode); node->type = type; node->piv_parent = prev; node->parent_slot = offset; offset = next_offset; next_offset = 0; prev = *enode; } while (!mte_is_leaf(next)); return slots; } static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free) { void __rcu **slots; struct maple_node *node = mte_to_node(enode); struct maple_enode *start; if (mte_is_leaf(enode)) { node->type = mte_node_type(enode); goto free_leaf; } start = enode; slots = mte_destroy_descend(&enode, mt, start, 0); node = mte_to_node(enode); // Updated in the above call. do { enum maple_type type; unsigned char offset; struct maple_enode *parent, *tmp; node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; enode = node->piv_parent; if (mte_to_node(enode) == node) goto free_leaf; type = mte_node_type(enode); slots = ma_slots(mte_to_node(enode), type); if (offset >= mt_slots[type]) goto next; tmp = mt_slot_locked(mt, slots, offset); if (mte_node_type(tmp) && mte_to_node(tmp)) { parent = enode; enode = tmp; slots = mte_destroy_descend(&enode, mt, parent, offset); } next: node = mte_to_node(enode); } while (start != enode); node = mte_to_node(enode); node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); free_leaf: if (free) mt_free_rcu(&node->rcu); else mt_clear_meta(mt, node, node->type); } /* * mte_destroy_walk() - Free a tree or sub-tree. * @enode: the encoded maple node (maple_enode) to start * @mt: the tree to free - needed for node types. * * Must hold the write lock. */ static inline void mte_destroy_walk(struct maple_enode *enode, struct maple_tree *mt) { struct maple_node *node = mte_to_node(enode); if (mt_in_rcu(mt)) { mt_destroy_walk(enode, mt, false); call_rcu(&node->rcu, mt_free_walk); } else { mt_destroy_walk(enode, mt, true); } } /* Interface */ /** * mas_store() - Store an @entry. * @mas: The maple state. * @entry: The entry to store. * * The @mas->index and @mas->last is used to set the range for the @entry. * * Return: the first entry between mas->index and mas->last or %NULL. */ void *mas_store(struct ma_state *mas, void *entry) { int request; MA_WR_STATE(wr_mas, mas, entry); trace_ma_write(__func__, mas, 0, entry); #ifdef CONFIG_DEBUG_MAPLE_TREE if (MAS_WARN_ON(mas, mas->index > mas->last)) pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last, entry); if (mas->index > mas->last) { mas_set_err(mas, -EINVAL); return NULL; } #endif /* * Storing is the same operation as insert with the added caveat that it * can overwrite entries. Although this seems simple enough, one may * want to examine what happens if a single store operation was to * overwrite multiple entries within a self-balancing B-Tree. */ mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); if (mas->mas_flags & MA_STATE_PREALLOC) { mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); return wr_mas.content; } request = mas_prealloc_calc(mas, entry); if (!request) goto store; mas_node_count(mas, request); if (mas_is_err(mas)) return NULL; store: mas_wr_store_entry(&wr_mas); mas_destroy(mas); return wr_mas.content; } EXPORT_SYMBOL_GPL(mas_store); /** * mas_store_gfp() - Store a value into the tree. * @mas: The maple state * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations if necessary. * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp) { unsigned long index = mas->index; unsigned long last = mas->last; MA_WR_STATE(wr_mas, mas, entry); int ret = 0; retry: mas_wr_preallocate(&wr_mas, entry); if (unlikely(mas_nomem(mas, gfp))) { if (!entry) __mas_set_range(mas, index, last); goto retry; } if (mas_is_err(mas)) { ret = xa_err(mas->node); goto out; } mas_wr_store_entry(&wr_mas); out: mas_destroy(mas); return ret; } EXPORT_SYMBOL_GPL(mas_store_gfp); /** * mas_store_prealloc() - Store a value into the tree using memory * preallocated in the maple state. * @mas: The maple state * @entry: The entry to store. */ void mas_store_prealloc(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); if (mas->store_type == wr_store_root) { mas_wr_prealloc_setup(&wr_mas); goto store; } mas_wr_walk_descend(&wr_mas); if (mas->store_type != wr_spanning_store) { /* set wr_mas->content to current slot */ wr_mas.content = mas_slot_locked(mas, wr_mas.slots, mas->offset); mas_wr_end_piv(&wr_mas); } store: trace_ma_write(__func__, mas, 0, entry); mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); mas_destroy(mas); } EXPORT_SYMBOL_GPL(mas_store_prealloc); /** * mas_preallocate() - Preallocate enough nodes for a store operation * @mas: The maple state * @entry: The entry that will be stored * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { MA_WR_STATE(wr_mas, mas, entry); int ret = 0; int request; mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); request = mas_prealloc_calc(mas, entry); if (!request) return ret; mas_node_count_gfp(mas, request, gfp); if (mas_is_err(mas)) { mas_set_alloc_req(mas, 0); ret = xa_err(mas->node); mas_destroy(mas); mas_reset(mas); return ret; } mas->mas_flags |= MA_STATE_PREALLOC; return ret; } EXPORT_SYMBOL_GPL(mas_preallocate); /* * mas_destroy() - destroy a maple state. * @mas: The maple state * * Upon completion, check the left-most node and rebalance against the node to * the right if necessary. Frees any allocated nodes associated with this maple * state. */ void mas_destroy(struct ma_state *mas) { struct maple_alloc *node; unsigned long total; /* * When using mas_for_each() to insert an expected number of elements, * it is possible that the number inserted is less than the expected * number. To fix an invalid final node, a check is performed here to * rebalance the previous node with the final node. */ if (mas->mas_flags & MA_STATE_REBALANCE) { unsigned char end; if (mas_is_err(mas)) mas_reset(mas); mas_start(mas); mtree_range_walk(mas); end = mas->end + 1; if (end < mt_min_slot_count(mas->node) - 1) mas_destroy_rebalance(mas, end); mas->mas_flags &= ~MA_STATE_REBALANCE; } mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC); total = mas_allocated(mas); while (total) { node = mas->alloc; mas->alloc = node->slot[0]; if (node->node_count > 1) { size_t count = node->node_count - 1; mt_free_bulk(count, (void __rcu **)&node->slot[1]); total -= count; } mt_free_one(ma_mnode_ptr(node)); total--; } mas->alloc = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); /* * mas_expected_entries() - Set the expected number of entries that will be inserted. * @mas: The maple state * @nr_entries: The number of expected entries. * * This will attempt to pre-allocate enough nodes to store the expected number * of entries. The allocations will occur using the bulk allocator interface * for speed. Please call mas_destroy() on the @mas after inserting the entries * to ensure any unused nodes are freed. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries) { int nonleaf_cap = MAPLE_ARANGE64_SLOTS - 2; struct maple_enode *enode = mas->node; int nr_nodes; int ret; /* * Sometimes it is necessary to duplicate a tree to a new tree, such as * forking a process and duplicating the VMAs from one tree to a new * tree. When such a situation arises, it is known that the new tree is * not going to be used until the entire tree is populated. For * performance reasons, it is best to use a bulk load with RCU disabled. * This allows for optimistic splitting that favours the left and reuse * of nodes during the operation. */ /* Optimize splitting for bulk insert in-order */ mas->mas_flags |= MA_STATE_BULK; /* * Avoid overflow, assume a gap between each entry and a trailing null. * If this is wrong, it just means allocation can happen during * insertion of entries. */ nr_nodes = max(nr_entries, nr_entries * 2 + 1); if (!mt_is_alloc(mas->tree)) nonleaf_cap = MAPLE_RANGE64_SLOTS - 2; /* Leaves; reduce slots to keep space for expansion */ nr_nodes = DIV_ROUND_UP(nr_nodes, MAPLE_RANGE64_SLOTS - 2); /* Internal nodes */ nr_nodes += DIV_ROUND_UP(nr_nodes, nonleaf_cap); /* Add working room for split (2 nodes) + new parents */ mas_node_count_gfp(mas, nr_nodes + 3, GFP_KERNEL); /* Detect if allocations run out */ mas->mas_flags |= MA_STATE_PREALLOC; if (!mas_is_err(mas)) return 0; ret = xa_err(mas->node); mas->node = enode; mas_destroy(mas); return ret; } EXPORT_SYMBOL_GPL(mas_expected_entries); static bool mas_next_setup(struct ma_state *mas, unsigned long max, void **entry) { bool was_none = mas_is_none(mas); if (unlikely(mas->last >= max)) { mas->status = ma_overflow; return true; } switch (mas->status) { case ma_active: return false; case ma_none: fallthrough; case ma_pause: mas->status = ma_start; fallthrough; case ma_start: mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ break; case ma_overflow: /* Overflowed before, but the max changed */ mas->status = ma_active; break; case ma_underflow: /* The user expects the mas to be one before where it is */ mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (likely(mas_is_active(mas))) /* Fast path */ return false; if (mas_is_ptr(mas)) { *entry = NULL; if (was_none && mas->index == 0) { mas->index = mas->last = 0; return true; } mas->index = 1; mas->last = ULONG_MAX; mas->status = ma_none; return true; } if (mas_is_none(mas)) return true; return false; } /** * mas_next() - Get the next entry. * @mas: The maple state * @max: The maximum index to check. * * Returns the next entry after @mas->index. * Must hold rcu_read_lock or the write lock. * Can return the zero entry. * * Return: The next entry or %NULL */ void *mas_next(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_next_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, false); } EXPORT_SYMBOL_GPL(mas_next); /** * mas_next_range() - Advance the maple state to the next range * @mas: The maple state * @max: The maximum index to check. * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. * Can return the zero entry. * * Return: The next entry or %NULL */ void *mas_next_range(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_next_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_next_range); /** * mt_next() - get the next value in the maple tree * @mt: The maple tree * @index: The start index * @max: The maximum index to check * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * Return: The entry higher than @index or %NULL if nothing is found. */ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max) { void *entry = NULL; MA_STATE(mas, mt, index, index); rcu_read_lock(); entry = mas_next(&mas, max); rcu_read_unlock(); return entry; } EXPORT_SYMBOL_GPL(mt_next); static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry) { if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } switch (mas->status) { case ma_active: return false; case ma_start: break; case ma_none: fallthrough; case ma_pause: mas->status = ma_start; break; case ma_underflow: /* underflowed before but the min changed */ mas->status = ma_active; break; case ma_overflow: /* User expects mas to be one after where it is */ mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) mas_walk(mas); if (unlikely(mas_is_ptr(mas))) { if (!mas->index) { mas->status = ma_none; return true; } mas->index = mas->last = 0; *entry = mas_root(mas); return true; } if (mas_is_none(mas)) { if (mas->index) { /* Walked to out-of-range pointer? */ mas->index = mas->last = 0; mas->status = ma_root; *entry = mas_root(mas); return true; } return true; } return false; } /** * mas_prev() - Get the previous entry * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * Will reset mas to ma_start if the status is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. */ void *mas_prev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_prev_setup(mas, min, &entry)) return entry; return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_prev); /** * mas_prev_range() - Advance to the previous range * @mas: The maple state * @min: The minimum value to check. * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. * Will reset mas to ma_start if the node is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. */ void *mas_prev_range(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_prev_setup(mas, min, &entry)) return entry; return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_prev_range); /** * mt_prev() - get the previous value in the maple tree * @mt: The maple tree * @index: The start index * @min: The minimum index to check * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * Return: The entry before @index or %NULL if nothing is found. */ void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min) { void *entry = NULL; MA_STATE(mas, mt, index, index); rcu_read_lock(); entry = mas_prev(&mas, min); rcu_read_unlock(); return entry; } EXPORT_SYMBOL_GPL(mt_prev); /** * mas_pause() - Pause a mas_find/mas_for_each to drop the lock. * @mas: The maple state to pause * * Some users need to pause a walk and drop the lock they're holding in * order to yield to a higher priority thread or carry out an operation * on an entry. Those users should call this function before they drop * the lock. It resets the @mas to be suitable for the next iteration * of the loop after the user has reacquired the lock. If most entries * found during a walk require you to call mas_pause(), the mt_for_each() * iterator may be more appropriate. * */ void mas_pause(struct ma_state *mas) { mas->status = ma_pause; mas->node = NULL; } EXPORT_SYMBOL_GPL(mas_pause); /** * mas_find_setup() - Internal function to set up mas_find*(). * @mas: The maple state * @max: The maximum index * @entry: Pointer to the entry * * Returns: True if entry is the answer, false otherwise. */ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long max, void **entry) { switch (mas->status) { case ma_active: if (mas->last < max) return false; return true; case ma_start: break; case ma_pause: if (unlikely(mas->last >= max)) return true; mas->index = ++mas->last; mas->status = ma_start; break; case ma_none: if (unlikely(mas->last >= max)) return true; mas->index = mas->last; mas->status = ma_start; break; case ma_underflow: /* mas is pointing at entry before unable to go lower */ if (unlikely(mas->index >= max)) { mas->status = ma_overflow; return true; } mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_overflow: if (unlikely(mas->last >= max)) return true; mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) { /* First run or continue */ if (mas->index > max) return true; *entry = mas_walk(mas); if (*entry) return true; } if (unlikely(mas_is_ptr(mas))) goto ptr_out_of_range; if (unlikely(mas_is_none(mas))) return true; if (mas->index == max) return true; return false; ptr_out_of_range: mas->status = ma_none; mas->index = 1; mas->last = ULONG_MAX; return true; } /** * mas_find() - On the first call, find the entry at or after mas->index up to * %max. Otherwise, find the entry after mas->index. * @mas: The maple state * @max: The maximum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ void *mas_find(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_find_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ entry = mas_next_slot(mas, max, false); /* Ignore overflow */ mas->status = ma_active; return entry; } EXPORT_SYMBOL_GPL(mas_find); /** * mas_find_range() - On the first call, find the entry at or after * mas->index up to %max. Otherwise, advance to the next slot mas->index. * @mas: The maple state * @max: The maximum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ void *mas_find_range(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_find_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_find_range); /** * mas_find_rev_setup() - Internal function to set up mas_find_*_rev() * @mas: The maple state * @min: The minimum index * @entry: Pointer to the entry * * Returns: True if entry is the answer, false otherwise. */ static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, void **entry) { switch (mas->status) { case ma_active: goto active; case ma_start: break; case ma_pause: if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } mas->last = --mas->index; mas->status = ma_start; break; case ma_none: if (mas->index <= min) goto none; mas->last = mas->index; mas->status = ma_start; break; case ma_overflow: /* user expects the mas to be one after where it is */ if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } mas->status = ma_active; break; case ma_underflow: /* user expects the mas to be one before where it is */ if (unlikely(mas->index <= min)) return true; mas->status = ma_active; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) { /* First run or continue */ if (mas->index < min) return true; *entry = mas_walk(mas); if (*entry) return true; } if (unlikely(mas_is_ptr(mas))) goto none; if (unlikely(mas_is_none(mas))) { /* * Walked to the location, and there was nothing so the previous * location is 0. */ mas->last = mas->index = 0; mas->status = ma_root; *entry = mas_root(mas); return true; } active: if (mas->index < min) return true; return false; none: mas->status = ma_none; return true; } /** * mas_find_rev: On the first call, find the first non-null entry at or below * mas->index down to %min. Otherwise find the first non-null entry below * mas->index down to %min. * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ void *mas_find_rev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_find_rev_setup(mas, min, &entry)) return entry; /* Retries on dead nodes handled by mas_prev_slot */ return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_find_rev); /** * mas_find_range_rev: On the first call, find the first non-null entry at or * below mas->index down to %min. Otherwise advance to the previous slot after * mas->index down to %min. * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ void *mas_find_range_rev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_find_rev_setup(mas, min, &entry)) return entry; /* Retries on dead nodes handled by mas_prev_slot */ return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_find_range_rev); /** * mas_erase() - Find the range in which index resides and erase the entire * range. * @mas: The maple state * * Must hold the write lock. * Searches for @mas->index, sets @mas->index and @mas->last to the range and * erases that range. * * Return: the entry that was erased or %NULL, @mas->index and @mas->last are updated. */ void *mas_erase(struct ma_state *mas) { void *entry; unsigned long index = mas->index; MA_WR_STATE(wr_mas, mas, NULL); if (!mas_is_active(mas) || !mas_is_start(mas)) mas->status = ma_start; write_retry: entry = mas_state_walk(mas); if (!entry) return NULL; /* Must reset to ensure spanning writes of last slot are detected */ mas_reset(mas); mas_wr_preallocate(&wr_mas, NULL); if (mas_nomem(mas, GFP_KERNEL)) { /* in case the range of entry changed when unlocked */ mas->index = mas->last = index; goto write_retry; } if (mas_is_err(mas)) goto out; mas_wr_store_entry(&wr_mas); out: mas_destroy(mas); return entry; } EXPORT_SYMBOL_GPL(mas_erase); /** * mas_nomem() - Check if there was an error allocating and do the allocation * if necessary If there are allocations, then free them. * @mas: The maple state * @gfp: The GFP_FLAGS to use for allocations * Return: true on allocation, false otherwise. */ bool mas_nomem(struct ma_state *mas, gfp_t gfp) __must_hold(mas->tree->ma_lock) { if (likely(mas->node != MA_ERROR(-ENOMEM))) return false; if (gfpflags_allow_blocking(gfp) && !mt_external_lock(mas->tree)) { mtree_unlock(mas->tree); mas_alloc_nodes(mas, gfp); mtree_lock(mas->tree); } else { mas_alloc_nodes(mas, gfp); } if (!mas_allocated(mas)) return false; mas->status = ma_start; return true; } void __init maple_tree_init(void) { maple_node_cache = kmem_cache_create("maple_node", sizeof(struct maple_node), sizeof(struct maple_node), SLAB_PANIC, NULL); } /** * mtree_load() - Load a value stored in a maple tree * @mt: The maple tree * @index: The index to load * * Return: the entry or %NULL */ void *mtree_load(struct maple_tree *mt, unsigned long index) { MA_STATE(mas, mt, index, index); void *entry; trace_ma_read(__func__, &mas); rcu_read_lock(); retry: entry = mas_start(&mas); if (unlikely(mas_is_none(&mas))) goto unlock; if (unlikely(mas_is_ptr(&mas))) { if (index) entry = NULL; goto unlock; } entry = mtree_lookup_walk(&mas); if (!entry && unlikely(mas_is_start(&mas))) goto retry; unlock: rcu_read_unlock(); if (xa_is_zero(entry)) return NULL; return entry; } EXPORT_SYMBOL(mtree_load); /** * mtree_store_range() - Store an entry at a given range. * @mt: The maple tree * @index: The start of the range * @last: The end of the range * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mtree_store_range(struct maple_tree *mt, unsigned long index, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(mas, mt, index, last); int ret = 0; trace_ma_write(__func__, &mas, 0, entry); if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (index > last) return -EINVAL; mtree_lock(mt); ret = mas_store_gfp(&mas, entry, gfp); mtree_unlock(mt); return ret; } EXPORT_SYMBOL(mtree_store_range); /** * mtree_store() - Store an entry at a given index. * @mt: The maple tree * @index: The index to store the value * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp) { return mtree_store_range(mt, index, index, entry, gfp); } EXPORT_SYMBOL(mtree_store); /** * mtree_insert_range() - Insert an entry at a given range if there is no value. * @mt: The maple tree * @first: The start of the range * @last: The end of the range * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. */ int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(ms, mt, first, last); int ret = 0; if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (first > last) return -EINVAL; mtree_lock(mt); retry: mas_insert(&ms, entry); if (mas_nomem(&ms, gfp)) goto retry; mtree_unlock(mt); if (mas_is_err(&ms)) ret = xa_err(ms.node); mas_destroy(&ms); return ret; } EXPORT_SYMBOL(mtree_insert_range); /** * mtree_insert() - Insert an entry at a given index if there is no value. * @mt: The maple tree * @index : The index to store the value * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. */ int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp) { return mtree_insert_range(mt, index, index, entry, gfp); } EXPORT_SYMBOL(mtree_insert); int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); retry: ret = mas_empty_area(&mas, min, max, size); if (ret) goto unlock; mas_insert(&mas, entry); /* * mas_nomem() may release the lock, causing the allocated area * to be unavailable, so try to allocate a free area again. */ if (mas_nomem(&mas, gfp)) goto retry; if (mas_is_err(&mas)) ret = xa_err(mas.node); else *startp = mas.index; unlock: mtree_unlock(mt); mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_range); /** * mtree_alloc_cyclic() - Find somewhere to store this entry in the tree. * @mt: The maple tree. * @startp: Pointer to ID. * @range_lo: Lower bound of range to search. * @range_hi: Upper bound of range to search. * @entry: The entry to store. * @next: Pointer to next ID to allocate. * @gfp: The GFP_FLAGS to use for allocations. * * Finds an empty entry in @mt after @next, stores the new index into * the @id pointer, stores the entry at that index, then updates @next. * * @mt must be initialized with the MT_FLAGS_ALLOC_RANGE flag. * * Context: Any context. Takes and releases the mt.lock. May sleep if * the @gfp flags permit. * * Return: 0 if the allocation succeeded without wrapping, 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated, -EINVAL if @mt cannot be used, or -EBUSY if there are no * free entries. */ int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp) { int ret; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); ret = mas_alloc_cyclic(&mas, startp, entry, range_lo, range_hi, next, gfp); mtree_unlock(mt); return ret; } EXPORT_SYMBOL(mtree_alloc_cyclic); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); retry: ret = mas_empty_area_rev(&mas, min, max, size); if (ret) goto unlock; mas_insert(&mas, entry); /* * mas_nomem() may release the lock, causing the allocated area * to be unavailable, so try to allocate a free area again. */ if (mas_nomem(&mas, gfp)) goto retry; if (mas_is_err(&mas)) ret = xa_err(mas.node); else *startp = mas.index; unlock: mtree_unlock(mt); mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_rrange); /** * mtree_erase() - Find an index and erase the entire range. * @mt: The maple tree * @index: The index to erase * * Erasing is the same as a walk to an entry then a store of a NULL to that * ENTIRE range. In fact, it is implemented as such using the advanced API. * * Return: The entry stored at the @index or %NULL */ void *mtree_erase(struct maple_tree *mt, unsigned long index) { void *entry = NULL; MA_STATE(mas, mt, index, index); trace_ma_op(__func__, &mas); mtree_lock(mt); entry = mas_erase(&mas); mtree_unlock(mt); return entry; } EXPORT_SYMBOL(mtree_erase); /* * mas_dup_free() - Free an incomplete duplication of a tree. * @mas: The maple state of a incomplete tree. * * The parameter @mas->node passed in indicates that the allocation failed on * this node. This function frees all nodes starting from @mas->node in the * reverse order of mas_dup_build(). There is no need to hold the source tree * lock at this time. */ static void mas_dup_free(struct ma_state *mas) { struct maple_node *node; enum maple_type type; void __rcu **slots; unsigned char count, i; /* Maybe the first node allocation failed. */ if (mas_is_none(mas)) return; while (!mte_is_root(mas->node)) { mas_ascend(mas); if (mas->offset) { mas->offset--; do { mas_descend(mas); mas->offset = mas_data_end(mas); } while (!mte_is_leaf(mas->node)); mas_ascend(mas); } node = mte_to_node(mas->node); type = mte_node_type(mas->node); slots = ma_slots(node, type); count = mas_data_end(mas) + 1; for (i = 0; i < count; i++) ((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK; mt_free_bulk(count, slots); } node = mte_to_node(mas->node); mt_free_one(node); } /* * mas_copy_node() - Copy a maple node and replace the parent. * @mas: The maple state of source tree. * @new_mas: The maple state of new tree. * @parent: The parent of the new node. * * Copy @mas->node to @new_mas->node, set @parent to be the parent of * @new_mas->node. If memory allocation fails, @mas is set to -ENOMEM. */ static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas, struct maple_pnode *parent) { struct maple_node *node = mte_to_node(mas->node); struct maple_node *new_node = mte_to_node(new_mas->node); unsigned long val; /* Copy the node completely. */ memcpy(new_node, node, sizeof(struct maple_node)); /* Update the parent node pointer. */ val = (unsigned long)node->parent & MAPLE_NODE_MASK; new_node->parent = ma_parent_ptr(val | (unsigned long)parent); } /* * mas_dup_alloc() - Allocate child nodes for a maple node. * @mas: The maple state of source tree. * @new_mas: The maple state of new tree. * @gfp: The GFP_FLAGS to use for allocations. * * This function allocates child nodes for @new_mas->node during the duplication * process. If memory allocation fails, @mas is set to -ENOMEM. */ static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas, gfp_t gfp) { struct maple_node *node = mte_to_node(mas->node); struct maple_node *new_node = mte_to_node(new_mas->node); enum maple_type type; unsigned char request, count, i; void __rcu **slots; void __rcu **new_slots; unsigned long val; /* Allocate memory for child nodes. */ type = mte_node_type(mas->node); new_slots = ma_slots(new_node, type); request = mas_data_end(mas) + 1; count = mt_alloc_bulk(gfp, request, (void **)new_slots); if (unlikely(count < request)) { memset(new_slots, 0, request * sizeof(void *)); mas_set_err(mas, -ENOMEM); return; } /* Restore node type information in slots. */ slots = ma_slots(node, type); for (i = 0; i < count; i++) { val = (unsigned long)mt_slot_locked(mas->tree, slots, i); val &= MAPLE_NODE_MASK; ((unsigned long *)new_slots)[i] |= val; } } /* * mas_dup_build() - Build a new maple tree from a source tree * @mas: The maple state of source tree, need to be in MAS_START state. * @new_mas: The maple state of new tree, need to be in MAS_START state. * @gfp: The GFP_FLAGS to use for allocations. * * This function builds a new tree in DFS preorder. If the memory allocation * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the * last node. mas_dup_free() will free the incomplete duplication of a tree. * * Note that the attributes of the two trees need to be exactly the same, and the * new tree needs to be empty, otherwise -EINVAL will be set in @mas. */ static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas, gfp_t gfp) { struct maple_node *node; struct maple_pnode *parent = NULL; struct maple_enode *root; enum maple_type type; if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) || unlikely(!mtree_empty(new_mas->tree))) { mas_set_err(mas, -EINVAL); return; } root = mas_start(mas); if (mas_is_ptr(mas) || mas_is_none(mas)) goto set_new_tree; node = mt_alloc_one(gfp); if (!node) { new_mas->status = ma_none; mas_set_err(mas, -ENOMEM); return; } type = mte_node_type(mas->node); root = mt_mk_node(node, type); new_mas->node = root; new_mas->min = 0; new_mas->max = ULONG_MAX; root = mte_mk_root(root); while (1) { mas_copy_node(mas, new_mas, parent); if (!mte_is_leaf(mas->node)) { /* Only allocate child nodes for non-leaf nodes. */ mas_dup_alloc(mas, new_mas, gfp); if (unlikely(mas_is_err(mas))) return; } else { /* * This is the last leaf node and duplication is * completed. */ if (mas->max == ULONG_MAX) goto done; /* This is not the last leaf node and needs to go up. */ do { mas_ascend(mas); mas_ascend(new_mas); } while (mas->offset == mas_data_end(mas)); /* Move to the next subtree. */ mas->offset++; new_mas->offset++; } mas_descend(mas); parent = ma_parent_ptr(mte_to_node(new_mas->node)); mas_descend(new_mas); mas->offset = 0; new_mas->offset = 0; } done: /* Specially handle the parent of the root node. */ mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas)); set_new_tree: /* Make them the same height */ new_mas->tree->ma_flags = mas->tree->ma_flags; rcu_assign_pointer(new_mas->tree->ma_root, root); } /** * __mt_dup(): Duplicate an entire maple tree * @mt: The source maple tree * @new: The new maple tree * @gfp: The GFP_FLAGS to use for allocations * * This function duplicates a maple tree in Depth-First Search (DFS) pre-order * traversal. It uses memcpy() to copy nodes in the source tree and allocate * new child nodes in non-leaf nodes. The new node is exactly the same as the * source node except for all the addresses stored in it. It will be faster than * traversing all elements in the source tree and inserting them one by one into * the new tree. * The user needs to ensure that the attributes of the source tree and the new * tree are the same, and the new tree needs to be an empty tree, otherwise * -EINVAL will be returned. * Note that the user needs to manually lock the source tree and the new tree. * * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If * the attributes of the two trees are different or the new tree is not an empty * tree. */ int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); MA_STATE(new_mas, new, 0, 0); mas_dup_build(&mas, &new_mas, gfp); if (unlikely(mas_is_err(&mas))) { ret = xa_err(mas.node); if (ret == -ENOMEM) mas_dup_free(&new_mas); } return ret; } EXPORT_SYMBOL(__mt_dup); /** * mtree_dup(): Duplicate an entire maple tree * @mt: The source maple tree * @new: The new maple tree * @gfp: The GFP_FLAGS to use for allocations * * This function duplicates a maple tree in Depth-First Search (DFS) pre-order * traversal. It uses memcpy() to copy nodes in the source tree and allocate * new child nodes in non-leaf nodes. The new node is exactly the same as the * source node except for all the addresses stored in it. It will be faster than * traversing all elements in the source tree and inserting them one by one into * the new tree. * The user needs to ensure that the attributes of the source tree and the new * tree are the same, and the new tree needs to be an empty tree, otherwise * -EINVAL will be returned. * * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If * the attributes of the two trees are different or the new tree is not an empty * tree. */ int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); MA_STATE(new_mas, new, 0, 0); mas_lock(&new_mas); mas_lock_nested(&mas, SINGLE_DEPTH_NESTING); mas_dup_build(&mas, &new_mas, gfp); mas_unlock(&mas); if (unlikely(mas_is_err(&mas))) { ret = xa_err(mas.node); if (ret == -ENOMEM) mas_dup_free(&new_mas); } mas_unlock(&new_mas); return ret; } EXPORT_SYMBOL(mtree_dup); /** * __mt_destroy() - Walk and free all nodes of a locked maple tree. * @mt: The maple tree * * Note: Does not handle locking. */ void __mt_destroy(struct maple_tree *mt) { void *root = mt_root_locked(mt); rcu_assign_pointer(mt->ma_root, NULL); if (xa_is_node(root)) mte_destroy_walk(root, mt); mt->ma_flags = mt_attr(mt); } EXPORT_SYMBOL_GPL(__mt_destroy); /** * mtree_destroy() - Destroy a maple tree * @mt: The maple tree * * Frees all resources used by the tree. Handles locking. */ void mtree_destroy(struct maple_tree *mt) { mtree_lock(mt); __mt_destroy(mt); mtree_unlock(mt); } EXPORT_SYMBOL(mtree_destroy); /** * mt_find() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search * @max: The maximum value of the search range * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * In case that an entry is found @index is updated to point to the next * possible entry independent whether the found entry is occupying a * single index or a range if indices. * * Return: The entry at or after the @index or %NULL */ void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max) { MA_STATE(mas, mt, *index, *index); void *entry; #ifdef CONFIG_DEBUG_MAPLE_TREE unsigned long copy = *index; #endif trace_ma_read(__func__, &mas); if ((*index) > max) return NULL; rcu_read_lock(); retry: entry = mas_state_walk(&mas); if (mas_is_start(&mas)) goto retry; if (unlikely(xa_is_zero(entry))) entry = NULL; if (entry) goto unlock; while (mas_is_active(&mas) && (mas.last < max)) { entry = mas_next_entry(&mas, max); if (likely(entry && !xa_is_zero(entry))) break; } if (unlikely(xa_is_zero(entry))) entry = NULL; unlock: rcu_read_unlock(); if (likely(entry)) { *index = mas.last + 1; #ifdef CONFIG_DEBUG_MAPLE_TREE if (MT_WARN_ON(mt, (*index) && ((*index) <= copy))) pr_err("index not increased! %lx <= %lx\n", *index, copy); #endif } return entry; } EXPORT_SYMBOL(mt_find); /** * mt_find_after() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search * @max: The maximum value to check * * Same as mt_find() except that it checks @index for 0 before * searching. If @index == 0, the search is aborted. This covers a wrap * around of @index to 0 in an iterator loop. * * Return: The entry at or after the @index or %NULL */ void *mt_find_after(struct maple_tree *mt, unsigned long *index, unsigned long max) { if (!(*index)) return NULL; return mt_find(mt, index, max); } EXPORT_SYMBOL(mt_find_after); #ifdef CONFIG_DEBUG_MAPLE_TREE atomic_t maple_tree_tests_run; EXPORT_SYMBOL_GPL(maple_tree_tests_run); atomic_t maple_tree_tests_passed; EXPORT_SYMBOL_GPL(maple_tree_tests_passed); #ifndef __KERNEL__ extern void kmem_cache_set_non_kernel(struct kmem_cache *, unsigned int); void mt_set_non_kernel(unsigned int val) { kmem_cache_set_non_kernel(maple_node_cache, val); } extern void kmem_cache_set_callback(struct kmem_cache *cachep, void (*callback)(void *)); void mt_set_callback(void (*callback)(void *)) { kmem_cache_set_callback(maple_node_cache, callback); } extern void kmem_cache_set_private(struct kmem_cache *cachep, void *private); void mt_set_private(void *private) { kmem_cache_set_private(maple_node_cache, private); } extern unsigned long kmem_cache_get_alloc(struct kmem_cache *); unsigned long mt_get_alloc_size(void) { return kmem_cache_get_alloc(maple_node_cache); } extern void kmem_cache_zero_nr_tallocated(struct kmem_cache *); void mt_zero_nr_tallocated(void) { kmem_cache_zero_nr_tallocated(maple_node_cache); } extern unsigned int kmem_cache_nr_tallocated(struct kmem_cache *); unsigned int mt_nr_tallocated(void) { return kmem_cache_nr_tallocated(maple_node_cache); } extern unsigned int kmem_cache_nr_allocated(struct kmem_cache *); unsigned int mt_nr_allocated(void) { return kmem_cache_nr_allocated(maple_node_cache); } void mt_cache_shrink(void) { } #else /* * mt_cache_shrink() - For testing, don't use this. * * Certain testcases can trigger an OOM when combined with other memory * debugging configuration options. This function is used to reduce the * possibility of an out of memory even due to kmem_cache objects remaining * around for longer than usual. */ void mt_cache_shrink(void) { kmem_cache_shrink(maple_node_cache); } EXPORT_SYMBOL_GPL(mt_cache_shrink); #endif /* not defined __KERNEL__ */ /* * mas_get_slot() - Get the entry in the maple state node stored at @offset. * @mas: The maple state * @offset: The offset into the slot array to fetch. * * Return: The entry stored at @offset. */ static inline struct maple_enode *mas_get_slot(struct ma_state *mas, unsigned char offset) { return mas_slot(mas, ma_slots(mas_mn(mas), mte_node_type(mas->node)), offset); } /* Depth first search, post-order */ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) { struct maple_enode *p, *mn = mas->node; unsigned long p_min, p_max; mas_next_node(mas, mas_mn(mas), max); if (!mas_is_overflow(mas)) return; if (mte_is_root(mn)) return; mas->node = mn; mas_ascend(mas); do { p = mas->node; p_min = mas->min; p_max = mas->max; mas_prev_node(mas, 0); } while (!mas_is_underflow(mas)); mas->node = p; mas->max = p_max; mas->min = p_min; } /* Tree validations */ static void mt_dump_node(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format); static void mt_dump_range(unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { static const char spaces[] = " "; switch(format) { case mt_dump_hex: if (min == max) pr_info("%.*s%lx: ", depth * 2, spaces, min); else pr_info("%.*s%lx-%lx: ", depth * 2, spaces, min, max); break; case mt_dump_dec: if (min == max) pr_info("%.*s%lu: ", depth * 2, spaces, min); else pr_info("%.*s%lu-%lu: ", depth * 2, spaces, min, max); } } static void mt_dump_entry(void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { mt_dump_range(min, max, depth, format); if (xa_is_value(entry)) pr_cont("value %ld (0x%lx) [" PTR_FMT "]\n", xa_to_value(entry), xa_to_value(entry), entry); else if (xa_is_zero(entry)) pr_cont("zero (%ld)\n", xa_to_internal(entry)); else if (mt_is_reserved(entry)) pr_cont("UNKNOWN ENTRY (" PTR_FMT ")\n", entry); else pr_cont(PTR_FMT "\n", entry); } static void mt_dump_range64(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_range_64 *node = &mte_to_node(entry)->mr64; bool leaf = mte_is_leaf(entry); unsigned long first = min; int i; pr_cont(" contents: "); for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) { switch(format) { case mt_dump_hex: pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]); } } pr_cont(PTR_FMT "\n", node->slot[i]); for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) { unsigned long last = max; if (i < (MAPLE_RANGE64_SLOTS - 1)) last = node->pivot[i]; else if (!node->slot[i] && max != mt_node_max(entry)) break; if (last == 0 && i > 0) break; if (leaf) mt_dump_entry(mt_slot(mt, node->slot, i), first, last, depth + 1, format); else if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), first, last, depth + 1, format); if (last == max) break; if (last > max) { switch(format) { case mt_dump_hex: pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } first = last + 1; } } static void mt_dump_arange64(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_arange_64 *node = &mte_to_node(entry)->ma64; unsigned long first = min; int i; pr_cont(" contents: "); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { switch (format) { case mt_dump_hex: pr_cont("%lx ", node->gap[i]); break; case mt_dump_dec: pr_cont("%lu ", node->gap[i]); } } pr_cont("| %02X %02X| ", node->meta.end, node->meta.gap); for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) { switch (format) { case mt_dump_hex: pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]); } } pr_cont(PTR_FMT "\n", node->slot[i]); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { unsigned long last = max; if (i < (MAPLE_ARANGE64_SLOTS - 1)) last = node->pivot[i]; else if (!node->slot[i]) break; if (last == 0 && i > 0) break; if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), first, last, depth + 1, format); if (last == max) break; if (last > max) { switch(format) { case mt_dump_hex: pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } first = last + 1; } } static void mt_dump_node(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_node *node = mte_to_node(entry); unsigned int type = mte_node_type(entry); unsigned int i; mt_dump_range(min, max, depth, format); pr_cont("node " PTR_FMT " depth %d type %d parent " PTR_FMT, node, depth, type, node ? node->parent : NULL); switch (type) { case maple_dense: pr_cont("\n"); for (i = 0; i < MAPLE_NODE_SLOTS; i++) { if (min + i > max) pr_cont("OUT OF RANGE: "); mt_dump_entry(mt_slot(mt, node->slot, i), min + i, min + i, depth, format); } break; case maple_leaf_64: case maple_range_64: mt_dump_range64(mt, entry, min, max, depth, format); break; case maple_arange_64: mt_dump_arange64(mt, entry, min, max, depth, format); break; default: pr_cont(" UNKNOWN TYPE\n"); } } void mt_dump(const struct maple_tree *mt, enum mt_dump_format format) { void *entry = rcu_dereference_check(mt->ma_root, mt_locked(mt)); pr_info("maple_tree(" PTR_FMT ") flags %X, height %u root " PTR_FMT "\n", mt, mt->ma_flags, mt_height(mt), entry); if (xa_is_node(entry)) mt_dump_node(mt, entry, 0, mt_node_max(entry), 0, format); else if (entry) mt_dump_entry(entry, 0, 0, 0, format); else pr_info("(empty)\n"); } EXPORT_SYMBOL_GPL(mt_dump); /* * Calculate the maximum gap in a node and check if that's what is reported in * the parent (unless root). */ static void mas_validate_gaps(struct ma_state *mas) { struct maple_enode *mte = mas->node; struct maple_node *p_mn, *node = mte_to_node(mte); enum maple_type mt = mte_node_type(mas->node); unsigned long gap = 0, max_gap = 0; unsigned long p_end, p_start = mas->min; unsigned char p_slot, offset; unsigned long *gaps = NULL; unsigned long *pivots = ma_pivots(node, mt); unsigned int i; if (ma_is_dense(mt)) { for (i = 0; i < mt_slot_count(mte); i++) { if (mas_get_slot(mas, i)) { if (gap > max_gap) max_gap = gap; gap = 0; continue; } gap++; } goto counted; } gaps = ma_gaps(node, mt); for (i = 0; i < mt_slot_count(mte); i++) { p_end = mas_safe_pivot(mas, pivots, i, mt); if (!gaps) { if (!mas_get_slot(mas, i)) gap = p_end - p_start + 1; } else { void *entry = mas_get_slot(mas, i); gap = gaps[i]; MT_BUG_ON(mas->tree, !entry); if (gap > p_end - p_start + 1) { pr_err(PTR_FMT "[%u] %lu >= %lu - %lu + 1 (%lu)\n", mas_mn(mas), i, gap, p_end, p_start, p_end - p_start + 1); MT_BUG_ON(mas->tree, gap > p_end - p_start + 1); } } if (gap > max_gap) max_gap = gap; p_start = p_end + 1; if (p_end >= mas->max) break; } counted: if (mt == maple_arange_64) { MT_BUG_ON(mas->tree, !gaps); offset = ma_meta_gap(node); if (offset > i) { pr_err("gap offset " PTR_FMT "[%u] is invalid\n", node, offset); MT_BUG_ON(mas->tree, 1); } if (gaps[offset] != max_gap) { pr_err("gap " PTR_FMT "[%u] is not the largest gap %lu\n", node, offset, max_gap); MT_BUG_ON(mas->tree, 1); } for (i++ ; i < mt_slot_count(mte); i++) { if (gaps[i] != 0) { pr_err("gap " PTR_FMT "[%u] beyond node limit != 0\n", node, i); MT_BUG_ON(mas->tree, 1); } } } if (mte_is_root(mte)) return; p_slot = mte_parent_slot(mas->node); p_mn = mte_parent(mte); MT_BUG_ON(mas->tree, max_gap > mas->max); if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) { pr_err("gap " PTR_FMT "[%u] != %lu\n", p_mn, p_slot, max_gap); mt_dump(mas->tree, mt_dump_hex); MT_BUG_ON(mas->tree, 1); } } static void mas_validate_parent_slot(struct ma_state *mas) { struct maple_node *parent; struct maple_enode *node; enum maple_type p_type; unsigned char p_slot; void __rcu **slots; int i; if (mte_is_root(mas->node)) return; p_slot = mte_parent_slot(mas->node); p_type = mas_parent_type(mas, mas->node); parent = mte_parent(mas->node); slots = ma_slots(parent, p_type); MT_BUG_ON(mas->tree, mas_mn(mas) == parent); /* Check prev/next parent slot for duplicate node entry */ for (i = 0; i < mt_slots[p_type]; i++) { node = mas_slot(mas, slots, i); if (i == p_slot) { if (node != mas->node) pr_err("parent " PTR_FMT "[%u] does not have " PTR_FMT "\n", parent, i, mas_mn(mas)); MT_BUG_ON(mas->tree, node != mas->node); } else if (node == mas->node) { pr_err("Invalid child " PTR_FMT " at parent " PTR_FMT "[%u] p_slot %u\n", mas_mn(mas), parent, i, p_slot); MT_BUG_ON(mas->tree, node == mas->node); } } } static void mas_validate_child_slot(struct ma_state *mas) { enum maple_type type = mte_node_type(mas->node); void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mte_to_node(mas->node), type); struct maple_enode *child; unsigned char i; if (mte_is_leaf(mas->node)) return; for (i = 0; i < mt_slots[type]; i++) { child = mas_slot(mas, slots, i); if (!child) { pr_err("Non-leaf node lacks child at " PTR_FMT "[%u]\n", mas_mn(mas), i); MT_BUG_ON(mas->tree, 1); } if (mte_parent_slot(child) != i) { pr_err("Slot error at " PTR_FMT "[%u]: child " PTR_FMT " has pslot %u\n", mas_mn(mas), i, mte_to_node(child), mte_parent_slot(child)); MT_BUG_ON(mas->tree, 1); } if (mte_parent(child) != mte_to_node(mas->node)) { pr_err("child " PTR_FMT " has parent " PTR_FMT " not " PTR_FMT "\n", mte_to_node(child), mte_parent(child), mte_to_node(mas->node)); MT_BUG_ON(mas->tree, 1); } if (i < mt_pivots[type] && pivots[i] == mas->max) break; } } /* * Validate all pivots are within mas->min and mas->max, check metadata ends * where the maximum ends and ensure there is no slots or pivots set outside of * the end of the data. */ static void mas_validate_limits(struct ma_state *mas) { int i; unsigned long prev_piv = 0; enum maple_type type = mte_node_type(mas->node); void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mas_mn(mas), type); for (i = 0; i < mt_slots[type]; i++) { unsigned long piv; piv = mas_safe_pivot(mas, pivots, i, type); if (!piv && (i != 0)) { pr_err("Missing node limit pivot at " PTR_FMT "[%u]", mas_mn(mas), i); MAS_WARN_ON(mas, 1); } if (prev_piv > piv) { pr_err(PTR_FMT "[%u] piv %lu < prev_piv %lu\n", mas_mn(mas), i, piv, prev_piv); MAS_WARN_ON(mas, piv < prev_piv); } if (piv < mas->min) { pr_err(PTR_FMT "[%u] %lu < %lu\n", mas_mn(mas), i, piv, mas->min); MAS_WARN_ON(mas, piv < mas->min); } if (piv > mas->max) { pr_err(PTR_FMT "[%u] %lu > %lu\n", mas_mn(mas), i, piv, mas->max); MAS_WARN_ON(mas, piv > mas->max); } prev_piv = piv; if (piv == mas->max) break; } if (mas_data_end(mas) != i) { pr_err("node" PTR_FMT ": data_end %u != the last slot offset %u\n", mas_mn(mas), mas_data_end(mas), i); MT_BUG_ON(mas->tree, 1); } for (i += 1; i < mt_slots[type]; i++) { void *entry = mas_slot(mas, slots, i); if (entry && (i != mt_slots[type] - 1)) { pr_err(PTR_FMT "[%u] should not have entry " PTR_FMT "\n", mas_mn(mas), i, entry); MT_BUG_ON(mas->tree, entry != NULL); } if (i < mt_pivots[type]) { unsigned long piv = pivots[i]; if (!piv) continue; pr_err(PTR_FMT "[%u] should not have piv %lu\n", mas_mn(mas), i, piv); MAS_WARN_ON(mas, i < mt_pivots[type] - 1); } } } static void mt_validate_nulls(struct maple_tree *mt) { void *entry, *last = (void *)1; unsigned char offset = 0; void __rcu **slots; MA_STATE(mas, mt, 0, 0); mas_start(&mas); if (mas_is_none(&mas) || (mas_is_ptr(&mas))) return; while (!mte_is_leaf(mas.node)) mas_descend(&mas); slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node)); do { entry = mas_slot(&mas, slots, offset); if (!last && !entry) { pr_err("Sequential nulls end at " PTR_FMT "[%u]\n", mas_mn(&mas), offset); } MT_BUG_ON(mt, !last && !entry); last = entry; if (offset == mas_data_end(&mas)) { mas_next_node(&mas, mas_mn(&mas), ULONG_MAX); if (mas_is_overflow(&mas)) return; offset = 0; slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node)); } else { offset++; } } while (!mas_is_overflow(&mas)); } /* * validate a maple tree by checking: * 1. The limits (pivots are within mas->min to mas->max) * 2. The gap is correctly set in the parents */ void mt_validate(struct maple_tree *mt) __must_hold(mas->tree->ma_lock) { unsigned char end; MA_STATE(mas, mt, 0, 0); mas_start(&mas); if (!mas_is_active(&mas)) return; while (!mte_is_leaf(mas.node)) mas_descend(&mas); while (!mas_is_overflow(&mas)) { MAS_WARN_ON(&mas, mte_dead_node(mas.node)); end = mas_data_end(&mas); if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) && (mas.max != ULONG_MAX))) { pr_err("Invalid size %u of " PTR_FMT "\n", end, mas_mn(&mas)); } mas_validate_parent_slot(&mas); mas_validate_limits(&mas); mas_validate_child_slot(&mas); if (mt_is_alloc(mt)) mas_validate_gaps(&mas); mas_dfs_postorder(&mas, ULONG_MAX); } mt_validate_nulls(mt); } EXPORT_SYMBOL_GPL(mt_validate); void mas_dump(const struct ma_state *mas) { pr_err("MAS: tree=" PTR_FMT " enode=" PTR_FMT " ", mas->tree, mas->node); switch (mas->status) { case ma_active: pr_err("(ma_active)"); break; case ma_none: pr_err("(ma_none)"); break; case ma_root: pr_err("(ma_root)"); break; case ma_start: pr_err("(ma_start) "); break; case ma_pause: pr_err("(ma_pause) "); break; case ma_overflow: pr_err("(ma_overflow) "); break; case ma_underflow: pr_err("(ma_underflow) "); break; case ma_error: pr_err("(ma_error) "); break; } pr_err("Store Type: "); switch (mas->store_type) { case wr_invalid: pr_err("invalid store type\n"); break; case wr_new_root: pr_err("new_root\n"); break; case wr_store_root: pr_err("store_root\n"); break; case wr_exact_fit: pr_err("exact_fit\n"); break; case wr_split_store: pr_err("split_store\n"); break; case wr_slot_store: pr_err("slot_store\n"); break; case wr_append: pr_err("append\n"); break; case wr_node_store: pr_err("node_store\n"); break; case wr_spanning_store: pr_err("spanning_store\n"); break; case wr_rebalance: pr_err("rebalance\n"); break; } pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, mas->index, mas->last); pr_err(" min=%lx max=%lx alloc=" PTR_FMT ", depth=%u, flags=%x\n", mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags); if (mas->index > mas->last) pr_err("Check index & last\n"); } EXPORT_SYMBOL_GPL(mas_dump); void mas_wr_dump(const struct ma_wr_state *wr_mas) { pr_err("WR_MAS: node=" PTR_FMT " r_min=%lx r_max=%lx\n", wr_mas->node, wr_mas->r_min, wr_mas->r_max); pr_err(" type=%u off_end=%u, node_end=%u, end_piv=%lx\n", wr_mas->type, wr_mas->offset_end, wr_mas->mas->end, wr_mas->end_piv); } EXPORT_SYMBOL_GPL(mas_wr_dump); #endif /* CONFIG_DEBUG_MAPLE_TREE */
11 1 11 7 5 10 24 1 8 8 8 7 5 5 2 3 3 5 4 1 5 5 5 2 2 2 3 18 14 14 1 2 1 1 1 1 5 6 1 6 7 9 9 1 1 2 1 2 1 1 3 3 2 5 1 2 3 3 4 2 6 19 1 1 1 1 3 2 3 5 1 1 1 1 1 2 1 8 1 1 1 1 1 2 1 1 1 6 1 1 1 3 1 3 1 2 2 1 2 2 1 5 18 1 17 11 9 1 1 1 1 1 1 1 1 11 11 5 2 5 25 1 10 23 3 14 10 7 4 2 1 10 33 7 32 1 1 1 20 10 11 7 25 7 7 7 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2010-2011 EIA Electronics, // Pieter Beyens <pieter.beyens@eia.be> // Copyright (c) 2010-2011 EIA Electronics, // Kurt Van Dijck <kurt.van.dijck@eia.be> // Copyright (c) 2018 Protonic, // Robin van der Gracht <robin@protonic.nl> // Copyright (c) 2017-2019 Pengutronix, // Marc Kleine-Budde <kernel@pengutronix.de> // Copyright (c) 2017-2019 Pengutronix, // Oleksij Rempel <kernel@pengutronix.de> #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/can/can-ml.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/errqueue.h> #include <linux/if_arp.h> #include "j1939-priv.h" #define J1939_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.j1939) /* conversion function between struct sock::sk_priority from linux and * j1939 priority field */ static inline priority_t j1939_prio(u32 sk_priority) { sk_priority = min(sk_priority, 7U); return 7 - sk_priority; } static inline u32 j1939_to_sk_priority(priority_t prio) { return 7 - prio; } /* function to see if pgn is to be evaluated */ static inline bool j1939_pgn_is_valid(pgn_t pgn) { return pgn <= J1939_PGN_MAX; } /* test function to avoid non-zero DA placeholder for pdu1 pgn's */ static inline bool j1939_pgn_is_clean_pdu(pgn_t pgn) { if (j1939_pgn_is_pdu1(pgn)) return !(pgn & 0xff); else return true; } static inline void j1939_sock_pending_add(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); atomic_inc(&jsk->skb_pending); } static int j1939_sock_pending_get(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); return atomic_read(&jsk->skb_pending); } void j1939_sock_pending_del(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* atomic_dec_return returns the new value */ if (!atomic_dec_return(&jsk->skb_pending)) wake_up(&jsk->waitq); /* no pending SKB's */ } static void j1939_jsk_add(struct j1939_priv *priv, struct j1939_sock *jsk) { jsk->state |= J1939_SOCK_BOUND; j1939_priv_get(priv); write_lock_bh(&priv->j1939_socks_lock); list_add_tail(&jsk->list, &priv->j1939_socks); write_unlock_bh(&priv->j1939_socks_lock); } static void j1939_jsk_del(struct j1939_priv *priv, struct j1939_sock *jsk) { write_lock_bh(&priv->j1939_socks_lock); list_del_init(&jsk->list); write_unlock_bh(&priv->j1939_socks_lock); j1939_priv_put(priv); jsk->state &= ~J1939_SOCK_BOUND; } static bool j1939_sk_queue_session(struct j1939_session *session) { struct j1939_sock *jsk = j1939_sk(session->sk); bool empty; spin_lock_bh(&jsk->sk_session_queue_lock); empty = list_empty(&jsk->sk_session_queue); j1939_session_get(session); list_add_tail(&session->sk_session_queue_entry, &jsk->sk_session_queue); spin_unlock_bh(&jsk->sk_session_queue_lock); j1939_sock_pending_add(&jsk->sk); return empty; } static struct j1939_session *j1939_sk_get_incomplete_session(struct j1939_sock *jsk) { struct j1939_session *session = NULL; spin_lock_bh(&jsk->sk_session_queue_lock); if (!list_empty(&jsk->sk_session_queue)) { session = list_last_entry(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); if (session->total_queued_size == session->total_message_size) session = NULL; else j1939_session_get(session); } spin_unlock_bh(&jsk->sk_session_queue_lock); return session; } static void j1939_sk_queue_drop_all(struct j1939_priv *priv, struct j1939_sock *jsk, int err) { struct j1939_session *session, *tmp; netdev_dbg(priv->ndev, "%s: err: %i\n", __func__, err); spin_lock_bh(&jsk->sk_session_queue_lock); list_for_each_entry_safe(session, tmp, &jsk->sk_session_queue, sk_session_queue_entry) { list_del_init(&session->sk_session_queue_entry); session->err = err; j1939_session_put(session); } spin_unlock_bh(&jsk->sk_session_queue_lock); } static void j1939_sk_queue_activate_next_locked(struct j1939_session *session) { struct j1939_sock *jsk; struct j1939_session *first; int err; /* RX-Session don't have a socket (yet) */ if (!session->sk) return; jsk = j1939_sk(session->sk); lockdep_assert_held(&jsk->sk_session_queue_lock); err = session->err; first = list_first_entry_or_null(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); /* Some else has already activated the next session */ if (first != session) return; activate_next: list_del_init(&first->sk_session_queue_entry); j1939_session_put(first); first = list_first_entry_or_null(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); if (!first) return; if (j1939_session_activate(first)) { netdev_warn_once(first->priv->ndev, "%s: 0x%p: Identical session is already activated.\n", __func__, first); first->err = -EBUSY; goto activate_next; } else { /* Give receiver some time (arbitrary chosen) to recover */ int time_ms = 0; if (err) time_ms = 10 + get_random_u32_below(16); j1939_tp_schedule_txtimer(first, time_ms); } } void j1939_sk_queue_activate_next(struct j1939_session *session) { struct j1939_sock *jsk; if (!session->sk) return; jsk = j1939_sk(session->sk); spin_lock_bh(&jsk->sk_session_queue_lock); j1939_sk_queue_activate_next_locked(session); spin_unlock_bh(&jsk->sk_session_queue_lock); } static bool j1939_sk_match_dst(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { if ((jsk->state & J1939_SOCK_PROMISC)) return true; /* Destination address filter */ if (jsk->addr.src_name && skcb->addr.dst_name) { if (jsk->addr.src_name != skcb->addr.dst_name) return false; } else { /* receive (all sockets) if * - all packages that match our bind() address * - all broadcast on a socket if SO_BROADCAST * is set */ if (j1939_address_is_unicast(skcb->addr.da)) { if (jsk->addr.sa != skcb->addr.da) return false; } else if (!sock_flag(&jsk->sk, SOCK_BROADCAST)) { /* receiving broadcast without SO_BROADCAST * flag is not allowed */ return false; } } /* Source address filter */ if (jsk->state & J1939_SOCK_CONNECTED) { /* receive (all sockets) if * - all packages that match our connect() name or address */ if (jsk->addr.dst_name && skcb->addr.src_name) { if (jsk->addr.dst_name != skcb->addr.src_name) return false; } else { if (jsk->addr.da != skcb->addr.sa) return false; } } /* PGN filter */ if (j1939_pgn_is_valid(jsk->pgn_rx_filter) && jsk->pgn_rx_filter != skcb->addr.pgn) return false; return true; } /* matches skb control buffer (addr) with a j1939 filter */ static bool j1939_sk_match_filter(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { const struct j1939_filter *f; int nfilter; spin_lock_bh(&jsk->filters_lock); f = jsk->filters; nfilter = jsk->nfilters; if (!nfilter) /* receive all when no filters are assigned */ goto filter_match_found; for (; nfilter; ++f, --nfilter) { if ((skcb->addr.pgn & f->pgn_mask) != f->pgn) continue; if ((skcb->addr.sa & f->addr_mask) != f->addr) continue; if ((skcb->addr.src_name & f->name_mask) != f->name) continue; goto filter_match_found; } spin_unlock_bh(&jsk->filters_lock); return false; filter_match_found: spin_unlock_bh(&jsk->filters_lock); return true; } static bool j1939_sk_recv_match_one(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { if (!(jsk->state & J1939_SOCK_BOUND)) return false; if (!j1939_sk_match_dst(jsk, skcb)) return false; if (!j1939_sk_match_filter(jsk, skcb)) return false; return true; } static void j1939_sk_recv_one(struct j1939_sock *jsk, struct sk_buff *oskb) { const struct j1939_sk_buff_cb *oskcb = j1939_skb_to_cb(oskb); struct j1939_sk_buff_cb *skcb; struct sk_buff *skb; if (oskb->sk == &jsk->sk) return; if (!j1939_sk_recv_match_one(jsk, oskcb)) return; skb = skb_clone(oskb, GFP_ATOMIC); if (!skb) { pr_warn("skb clone failed\n"); return; } can_skb_set_owner(skb, oskb->sk); skcb = j1939_skb_to_cb(skb); skcb->msg_flags &= ~(MSG_DONTROUTE); if (skb->sk) skcb->msg_flags |= MSG_DONTROUTE; if (sock_queue_rcv_skb(&jsk->sk, skb) < 0) kfree_skb(skb); } bool j1939_sk_recv_match(struct j1939_priv *priv, struct j1939_sk_buff_cb *skcb) { struct j1939_sock *jsk; bool match = false; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { match = j1939_sk_recv_match_one(jsk, skcb); if (match) break; } read_unlock_bh(&priv->j1939_socks_lock); return match; } void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sock *jsk; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { j1939_sk_recv_one(jsk, skb); } read_unlock_bh(&priv->j1939_socks_lock); } static void j1939_sk_sock_destruct(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* This function will be called by the generic networking code, when * the socket is ultimately closed (sk->sk_destruct). * * The race between * - processing a received CAN frame * (can_receive -> j1939_can_recv) * and accessing j1939_priv * ... and ... * - closing a socket * (j1939_can_rx_unregister -> can_rx_unregister) * and calling the final j1939_priv_put() * * is avoided by calling the final j1939_priv_put() from this * RCU deferred cleanup call. */ if (jsk->priv) { j1939_priv_put(jsk->priv); jsk->priv = NULL; } /* call generic CAN sock destruct */ can_sock_destruct(sk); } static int j1939_sk_init(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* Ensure that "sk" is first member in "struct j1939_sock", so that we * can skip it during memset(). */ BUILD_BUG_ON(offsetof(struct j1939_sock, sk) != 0); memset((void *)jsk + sizeof(jsk->sk), 0x0, sizeof(*jsk) - sizeof(jsk->sk)); INIT_LIST_HEAD(&jsk->list); init_waitqueue_head(&jsk->waitq); jsk->sk.sk_priority = j1939_to_sk_priority(6); jsk->sk.sk_reuse = 1; /* per default */ jsk->addr.sa = J1939_NO_ADDR; jsk->addr.da = J1939_NO_ADDR; jsk->addr.pgn = J1939_NO_PGN; jsk->pgn_rx_filter = J1939_NO_PGN; atomic_set(&jsk->skb_pending, 0); spin_lock_init(&jsk->sk_session_queue_lock); INIT_LIST_HEAD(&jsk->sk_session_queue); spin_lock_init(&jsk->filters_lock); /* j1939_sk_sock_destruct() depends on SOCK_RCU_FREE flag */ sock_set_flag(sk, SOCK_RCU_FREE); sk->sk_destruct = j1939_sk_sock_destruct; sk->sk_protocol = CAN_J1939; return 0; } static int j1939_sk_sanity_check(struct sockaddr_can *addr, int len) { if (!addr) return -EDESTADDRREQ; if (len < J1939_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; if (!addr->can_ifindex) return -ENODEV; if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) return -EINVAL; return 0; } static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); struct j1939_priv *priv; struct sock *sk; struct net *net; int ret = 0; ret = j1939_sk_sanity_check(addr, len); if (ret) return ret; lock_sock(sock->sk); priv = jsk->priv; sk = sock->sk; net = sock_net(sk); /* Already bound to an interface? */ if (jsk->state & J1939_SOCK_BOUND) { /* A re-bind() to a different interface is not * supported. */ if (jsk->ifindex != addr->can_ifindex) { ret = -EINVAL; goto out_release_sock; } /* drop old references */ j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); } else { struct can_ml_priv *can_ml; struct net_device *ndev; ndev = dev_get_by_index(net, addr->can_ifindex); if (!ndev) { ret = -ENODEV; goto out_release_sock; } can_ml = can_get_ml_priv(ndev); if (!can_ml) { dev_put(ndev); ret = -ENODEV; goto out_release_sock; } if (!(ndev->flags & IFF_UP)) { dev_put(ndev); ret = -ENETDOWN; goto out_release_sock; } priv = j1939_netdev_start(ndev); dev_put(ndev); if (IS_ERR(priv)) { ret = PTR_ERR(priv); goto out_release_sock; } jsk->ifindex = addr->can_ifindex; /* the corresponding j1939_priv_put() is called via * sk->sk_destruct, which points to j1939_sk_sock_destruct() */ j1939_priv_get(priv); jsk->priv = priv; } /* set default transmit pgn */ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) jsk->pgn_rx_filter = addr->can_addr.j1939.pgn; jsk->addr.src_name = addr->can_addr.j1939.name; jsk->addr.sa = addr->can_addr.j1939.addr; /* get new references */ ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa); if (ret) { j1939_netdev_stop(priv); goto out_release_sock; } j1939_jsk_add(priv, jsk); out_release_sock: /* fall through */ release_sock(sock->sk); return ret; } static int j1939_sk_connect(struct socket *sock, struct sockaddr *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); int ret = 0; ret = j1939_sk_sanity_check(addr, len); if (ret) return ret; lock_sock(sock->sk); /* bind() before connect() is mandatory */ if (!(jsk->state & J1939_SOCK_BOUND)) { ret = -EINVAL; goto out_release_sock; } /* A connect() to a different interface is not supported. */ if (jsk->ifindex != addr->can_ifindex) { ret = -EINVAL; goto out_release_sock; } if (!addr->can_addr.j1939.name && addr->can_addr.j1939.addr == J1939_NO_ADDR && !sock_flag(&jsk->sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto out_release_sock; } jsk->addr.dst_name = addr->can_addr.j1939.name; jsk->addr.da = addr->can_addr.j1939.addr; if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) jsk->addr.pgn = addr->can_addr.j1939.pgn; jsk->state |= J1939_SOCK_CONNECTED; out_release_sock: /* fall through */ release_sock(sock->sk); return ret; } static void j1939_sk_sock2sockaddr_can(struct sockaddr_can *addr, const struct j1939_sock *jsk, int peer) { /* There are two holes (2 bytes and 3 bytes) to clear to avoid * leaking kernel information to user space. */ memset(addr, 0, J1939_MIN_NAMELEN); addr->can_family = AF_CAN; addr->can_ifindex = jsk->ifindex; addr->can_addr.j1939.pgn = jsk->addr.pgn; if (peer) { addr->can_addr.j1939.name = jsk->addr.dst_name; addr->can_addr.j1939.addr = jsk->addr.da; } else { addr->can_addr.j1939.name = jsk->addr.src_name; addr->can_addr.j1939.addr = jsk->addr.sa; } } static int j1939_sk_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int ret = 0; lock_sock(sk); if (peer && !(jsk->state & J1939_SOCK_CONNECTED)) { ret = -EADDRNOTAVAIL; goto failure; } j1939_sk_sock2sockaddr_can(addr, jsk, peer); ret = J1939_MIN_NAMELEN; failure: release_sock(sk); return ret; } static int j1939_sk_release(struct socket *sock) { struct sock *sk = sock->sk; struct j1939_sock *jsk; if (!sk) return 0; lock_sock(sk); jsk = j1939_sk(sk); if (jsk->state & J1939_SOCK_BOUND) { struct j1939_priv *priv = jsk->priv; if (wait_event_interruptible(jsk->waitq, !j1939_sock_pending_get(&jsk->sk))) { j1939_cancel_active_session(priv, sk); j1939_sk_queue_drop_all(priv, jsk, ESHUTDOWN); } j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); j1939_netdev_stop(priv); } kfree(jsk->filters); sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_put(sk); return 0; } static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, sockptr_t optval, unsigned int optlen, int flag) { int tmp; if (optlen != sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; lock_sock(&jsk->sk); if (tmp) jsk->state |= flag; else jsk->state &= ~flag; release_sock(&jsk->sk); return tmp; } static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int tmp, count = 0, ret = 0; struct j1939_filter *filters = NULL, *ofilters; if (level != SOL_CAN_J1939) return -EINVAL; switch (optname) { case SO_J1939_FILTER: if (!sockptr_is_null(optval) && optlen != 0) { struct j1939_filter *f; int c; if (optlen % sizeof(*filters) != 0) return -EINVAL; if (optlen > J1939_FILTER_MAX * sizeof(struct j1939_filter)) return -EINVAL; count = optlen / sizeof(*filters); filters = memdup_sockptr(optval, optlen); if (IS_ERR(filters)) return PTR_ERR(filters); for (f = filters, c = count; c; f++, c--) { f->name &= f->name_mask; f->pgn &= f->pgn_mask; f->addr &= f->addr_mask; } } lock_sock(&jsk->sk); spin_lock_bh(&jsk->filters_lock); ofilters = jsk->filters; jsk->filters = filters; jsk->nfilters = count; spin_unlock_bh(&jsk->filters_lock); release_sock(&jsk->sk); kfree(ofilters); return 0; case SO_J1939_PROMISC: return j1939_sk_setsockopt_flag(jsk, optval, optlen, J1939_SOCK_PROMISC); case SO_J1939_ERRQUEUE: ret = j1939_sk_setsockopt_flag(jsk, optval, optlen, J1939_SOCK_ERRQUEUE); if (ret < 0) return ret; if (!(jsk->state & J1939_SOCK_ERRQUEUE)) skb_queue_purge(&sk->sk_error_queue); return ret; case SO_J1939_SEND_PRIO: if (optlen != sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; if (tmp < 0 || tmp > 7) return -EDOM; if (tmp < 2 && !capable(CAP_NET_ADMIN)) return -EPERM; lock_sock(&jsk->sk); jsk->sk.sk_priority = j1939_to_sk_priority(tmp); release_sock(&jsk->sk); return 0; default: return -ENOPROTOOPT; } } static int j1939_sk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int ret, ulen; /* set defaults for using 'int' properties */ int tmp = 0; int len = sizeof(tmp); void *val = &tmp; if (level != SOL_CAN_J1939) return -EINVAL; if (get_user(ulen, optlen)) return -EFAULT; if (ulen < 0) return -EINVAL; lock_sock(&jsk->sk); switch (optname) { case SO_J1939_PROMISC: tmp = (jsk->state & J1939_SOCK_PROMISC) ? 1 : 0; break; case SO_J1939_ERRQUEUE: tmp = (jsk->state & J1939_SOCK_ERRQUEUE) ? 1 : 0; break; case SO_J1939_SEND_PRIO: tmp = j1939_prio(jsk->sk.sk_priority); break; default: ret = -ENOPROTOOPT; goto no_copy; } /* copy to user, based on 'len' & 'val' * but most sockopt's are 'int' properties, and have 'len' & 'val' * left unchanged, but instead modified 'tmp' */ if (len > ulen) ret = -EFAULT; else if (put_user(len, optlen)) ret = -EFAULT; else if (copy_to_user(optval, val, len)) ret = -EFAULT; else ret = 0; no_copy: release_sock(&jsk->sk); return ret; } static int j1939_sk_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; struct j1939_sk_buff_cb *skcb; int ret = 0; if (flags & ~(MSG_DONTWAIT | MSG_ERRQUEUE | MSG_CMSG_COMPAT)) return -EINVAL; if (flags & MSG_ERRQUEUE) return sock_recv_errqueue(sock->sk, msg, size, SOL_CAN_J1939, SCM_J1939_ERRQUEUE); skb = skb_recv_datagram(sk, flags, &ret); if (!skb) return ret; if (size < skb->len) msg->msg_flags |= MSG_TRUNC; else size = skb->len; ret = memcpy_to_msg(msg, skb->data, size); if (ret < 0) { skb_free_datagram(sk, skb); return ret; } skcb = j1939_skb_to_cb(skb); if (j1939_address_is_valid(skcb->addr.da)) put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_ADDR, sizeof(skcb->addr.da), &skcb->addr.da); if (skcb->addr.dst_name) put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_NAME, sizeof(skcb->addr.dst_name), &skcb->addr.dst_name); put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_PRIO, sizeof(skcb->priority), &skcb->priority); if (msg->msg_name) { struct sockaddr_can *paddr = msg->msg_name; msg->msg_namelen = J1939_MIN_NAMELEN; memset(msg->msg_name, 0, msg->msg_namelen); paddr->can_family = AF_CAN; paddr->can_ifindex = skb->skb_iif; paddr->can_addr.j1939.name = skcb->addr.src_name; paddr->can_addr.j1939.addr = skcb->addr.sa; paddr->can_addr.j1939.pgn = skcb->addr.pgn; } sock_recv_cmsgs(msg, sk, skb); msg->msg_flags |= skcb->msg_flags; skb_free_datagram(sk, skb); return size; } static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev, struct sock *sk, struct msghdr *msg, size_t size, int *errcode) { struct j1939_sock *jsk = j1939_sk(sk); struct j1939_sk_buff_cb *skcb; struct sk_buff *skb; int ret; skb = sock_alloc_send_skb(sk, size + sizeof(struct can_frame) - sizeof(((struct can_frame *)NULL)->data) + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &ret); if (!skb) goto failure; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = ndev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb_reserve(skb, offsetof(struct can_frame, data)); ret = memcpy_from_msg(skb_put(skb, size), msg, size); if (ret < 0) goto free_skb; skb->dev = ndev; skcb = j1939_skb_to_cb(skb); memset(skcb, 0, sizeof(*skcb)); skcb->addr = jsk->addr; skcb->priority = j1939_prio(READ_ONCE(sk->sk_priority)); if (msg->msg_name) { struct sockaddr_can *addr = msg->msg_name; if (addr->can_addr.j1939.name || addr->can_addr.j1939.addr != J1939_NO_ADDR) { skcb->addr.dst_name = addr->can_addr.j1939.name; skcb->addr.da = addr->can_addr.j1939.addr; } if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) skcb->addr.pgn = addr->can_addr.j1939.pgn; } *errcode = ret; return skb; free_skb: kfree_skb(skb); failure: *errcode = ret; return NULL; } static size_t j1939_sk_opt_stats_get_size(enum j1939_sk_errqueue_type type) { switch (type) { case J1939_ERRQUEUE_RX_RTS: return nla_total_size(sizeof(u32)) + /* J1939_NLA_TOTAL_SIZE */ nla_total_size(sizeof(u32)) + /* J1939_NLA_PGN */ nla_total_size(sizeof(u64)) + /* J1939_NLA_SRC_NAME */ nla_total_size(sizeof(u64)) + /* J1939_NLA_DEST_NAME */ nla_total_size(sizeof(u8)) + /* J1939_NLA_SRC_ADDR */ nla_total_size(sizeof(u8)) + /* J1939_NLA_DEST_ADDR */ 0; default: return nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */ 0; } } static struct sk_buff * j1939_sk_get_timestamping_opt_stats(struct j1939_session *session, enum j1939_sk_errqueue_type type) { struct sk_buff *stats; u32 size; stats = alloc_skb(j1939_sk_opt_stats_get_size(type), GFP_ATOMIC); if (!stats) return NULL; if (session->skcb.addr.type == J1939_SIMPLE) size = session->total_message_size; else size = min(session->pkt.tx_acked * 7, session->total_message_size); switch (type) { case J1939_ERRQUEUE_RX_RTS: nla_put_u32(stats, J1939_NLA_TOTAL_SIZE, session->total_message_size); nla_put_u32(stats, J1939_NLA_PGN, session->skcb.addr.pgn); nla_put_u64_64bit(stats, J1939_NLA_SRC_NAME, session->skcb.addr.src_name, J1939_NLA_PAD); nla_put_u64_64bit(stats, J1939_NLA_DEST_NAME, session->skcb.addr.dst_name, J1939_NLA_PAD); nla_put_u8(stats, J1939_NLA_SRC_ADDR, session->skcb.addr.sa); nla_put_u8(stats, J1939_NLA_DEST_ADDR, session->skcb.addr.da); break; default: nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size); } return stats; } static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; struct j1939_sock *jsk; struct sock_exterr_skb *serr; struct sk_buff *skb; char *state = "UNK"; u32 tsflags; int err; jsk = j1939_sk(sk); if (!(jsk->state & J1939_SOCK_ERRQUEUE)) return; tsflags = READ_ONCE(sk->sk_tsflags); switch (type) { case J1939_ERRQUEUE_TX_ACK: if (!(tsflags & SOF_TIMESTAMPING_TX_ACK)) return; break; case J1939_ERRQUEUE_TX_SCHED: if (!(tsflags & SOF_TIMESTAMPING_TX_SCHED)) return; break; case J1939_ERRQUEUE_TX_ABORT: break; case J1939_ERRQUEUE_RX_RTS: fallthrough; case J1939_ERRQUEUE_RX_DPO: fallthrough; case J1939_ERRQUEUE_RX_ABORT: if (!(tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)) return; break; default: netdev_err(priv->ndev, "Unknown errqueue type %i\n", type); } skb = j1939_sk_get_timestamping_opt_stats(session, type); if (!skb) return; skb->tstamp = ktime_get_real(); BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); switch (type) { case J1939_ERRQUEUE_TX_ACK: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_ACK; state = "TX ACK"; break; case J1939_ERRQUEUE_TX_SCHED: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_SCHED; state = "TX SCH"; break; case J1939_ERRQUEUE_TX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_TX_ABORT; state = "TX ABT"; break; case J1939_ERRQUEUE_RX_RTS: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_RTS; state = "RX RTS"; break; case J1939_ERRQUEUE_RX_DPO: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_DPO; state = "RX DPO"; break; case J1939_ERRQUEUE_RX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_ABORT; state = "RX ABT"; break; } serr->opt_stats = true; if (tsflags & SOF_TIMESTAMPING_OPT_ID) serr->ee.ee_data = session->tskey; netdev_dbg(session->priv->ndev, "%s: 0x%p tskey: %i, state: %s\n", __func__, session, session->tskey, state); err = sock_queue_err_skb(sk, skb); if (err) kfree_skb(skb); }; void j1939_sk_errqueue(struct j1939_session *session, enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; struct j1939_sock *jsk; if (session->sk) { /* send TX notifications to the socket of origin */ __j1939_sk_errqueue(session, session->sk, type); return; } /* spread RX notifications to all sockets subscribed to this session */ read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { if (j1939_sk_recv_match_one(jsk, &session->skcb)) __j1939_sk_errqueue(session, &jsk->sk, type); } read_unlock_bh(&priv->j1939_socks_lock); }; void j1939_sk_send_loop_abort(struct sock *sk, int err) { struct j1939_sock *jsk = j1939_sk(sk); if (jsk->state & J1939_SOCK_ERRQUEUE) return; sk->sk_err = err; sk_error_report(sk); } static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, struct msghdr *msg, size_t size) { struct j1939_sock *jsk = j1939_sk(sk); struct j1939_session *session = j1939_sk_get_incomplete_session(jsk); struct sk_buff *skb; size_t segment_size, todo_size; int ret = 0; if (session && session->total_message_size != session->total_queued_size + size) { j1939_session_put(session); return -EIO; } todo_size = size; while (todo_size) { struct j1939_sk_buff_cb *skcb; segment_size = min_t(size_t, J1939_MAX_TP_PACKET_SIZE, todo_size); /* Allocate skb for one segment */ skb = j1939_sk_alloc_skb(priv->ndev, sk, msg, segment_size, &ret); if (ret) break; skcb = j1939_skb_to_cb(skb); if (!session) { /* at this point the size should be full size * of the session */ skcb->offset = 0; session = j1939_tp_send(priv, skb, size); if (IS_ERR(session)) { ret = PTR_ERR(session); goto kfree_skb; } if (j1939_sk_queue_session(session)) { /* try to activate session if we a * fist in the queue */ if (!j1939_session_activate(session)) { j1939_tp_schedule_txtimer(session, 0); } else { ret = -EBUSY; session->err = ret; j1939_sk_queue_drop_all(priv, jsk, EBUSY); break; } } } else { skcb->offset = session->total_queued_size; j1939_session_skb_queue(session, skb); } todo_size -= segment_size; session->total_queued_size += segment_size; } switch (ret) { case 0: /* OK */ if (todo_size) netdev_warn(priv->ndev, "no error found and not completely queued?! %zu\n", todo_size); ret = size; break; case -ERESTARTSYS: ret = -EINTR; fallthrough; case -EAGAIN: /* OK */ if (todo_size != size) ret = size - todo_size; break; default: /* ERROR */ break; } if (session) j1939_session_put(session); return ret; kfree_skb: kfree_skb(skb); return ret; } static int j1939_sk_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); struct j1939_priv *priv; int ifindex; int ret; lock_sock(sock->sk); /* various socket state tests */ if (!(jsk->state & J1939_SOCK_BOUND)) { ret = -EBADFD; goto sendmsg_done; } priv = jsk->priv; ifindex = jsk->ifindex; if (!jsk->addr.src_name && jsk->addr.sa == J1939_NO_ADDR) { /* no source address assigned yet */ ret = -EBADFD; goto sendmsg_done; } /* deal with provided destination address info */ if (msg->msg_name) { struct sockaddr_can *addr = msg->msg_name; if (msg->msg_namelen < J1939_MIN_NAMELEN) { ret = -EINVAL; goto sendmsg_done; } if (addr->can_family != AF_CAN) { ret = -EINVAL; goto sendmsg_done; } if (addr->can_ifindex && addr->can_ifindex != ifindex) { ret = -EBADFD; goto sendmsg_done; } if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) { ret = -EINVAL; goto sendmsg_done; } if (!addr->can_addr.j1939.name && addr->can_addr.j1939.addr == J1939_NO_ADDR && !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto sendmsg_done; } } else { if (!jsk->addr.dst_name && jsk->addr.da == J1939_NO_ADDR && !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto sendmsg_done; } } ret = j1939_sk_send_loop(priv, sk, msg, size); sendmsg_done: release_sock(sock->sk); return ret; } void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) { struct j1939_sock *jsk; int error_code = ENETDOWN; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { jsk->sk.sk_err = error_code; if (!sock_flag(&jsk->sk, SOCK_DEAD)) sk_error_report(&jsk->sk); j1939_sk_queue_drop_all(priv, jsk, error_code); } read_unlock_bh(&priv->j1939_socks_lock); } static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops j1939_ops = { .family = PF_CAN, .release = j1939_sk_release, .bind = j1939_sk_bind, .connect = j1939_sk_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = j1939_sk_getname, .poll = datagram_poll, .ioctl = j1939_sk_no_ioctlcmd, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = j1939_sk_setsockopt, .getsockopt = j1939_sk_getsockopt, .sendmsg = j1939_sk_sendmsg, .recvmsg = j1939_sk_recvmsg, .mmap = sock_no_mmap, }; static struct proto j1939_proto __read_mostly = { .name = "CAN_J1939", .owner = THIS_MODULE, .obj_size = sizeof(struct j1939_sock), .init = j1939_sk_init, }; const struct can_proto j1939_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_J1939, .ops = &j1939_ops, .prot = &j1939_proto, };
18 18 3 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 // SPDX-License-Identifier: GPL-2.0-or-later // Copyright (c) 2020, Nikolay Aleksandrov <nikolay@nvidia.com> #include <linux/err.h> #include <linux/export.h> #include <linux/if_ether.h> #include <linux/igmp.h> #include <linux/in.h> #include <linux/jhash.h> #include <linux/kernel.h> #include <linux/log2.h> #include <linux/netdevice.h> #include <linux/netfilter_bridge.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/inetdevice.h> #include <linux/mroute.h> #include <net/ip.h> #include <net/switchdev.h> #if IS_ENABLED(CONFIG_IPV6) #include <linux/icmpv6.h> #include <net/ipv6.h> #include <net/mld.h> #include <net/ip6_checksum.h> #include <net/addrconf.h> #endif #include "br_private.h" #include "br_private_mcast_eht.h" static bool br_multicast_del_eht_set_entry(struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr, union net_bridge_eht_addr *h_addr); static void br_multicast_create_eht_set_entry(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr, union net_bridge_eht_addr *h_addr, int filter_mode, bool allow_zero_src); static struct net_bridge_group_eht_host * br_multicast_eht_host_lookup(struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr) { struct rb_node *node = pg->eht_host_tree.rb_node; while (node) { struct net_bridge_group_eht_host *this; int result; this = rb_entry(node, struct net_bridge_group_eht_host, rb_node); result = memcmp(h_addr, &this->h_addr, sizeof(*h_addr)); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return this; } return NULL; } static int br_multicast_eht_host_filter_mode(struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr) { struct net_bridge_group_eht_host *eht_host; eht_host = br_multicast_eht_host_lookup(pg, h_addr); if (!eht_host) return MCAST_INCLUDE; return eht_host->filter_mode; } static struct net_bridge_group_eht_set_entry * br_multicast_eht_set_entry_lookup(struct net_bridge_group_eht_set *eht_set, union net_bridge_eht_addr *h_addr) { struct rb_node *node = eht_set->entry_tree.rb_node; while (node) { struct net_bridge_group_eht_set_entry *this; int result; this = rb_entry(node, struct net_bridge_group_eht_set_entry, rb_node); result = memcmp(h_addr, &this->h_addr, sizeof(*h_addr)); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return this; } return NULL; } static struct net_bridge_group_eht_set * br_multicast_eht_set_lookup(struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr) { struct rb_node *node = pg->eht_set_tree.rb_node; while (node) { struct net_bridge_group_eht_set *this; int result; this = rb_entry(node, struct net_bridge_group_eht_set, rb_node); result = memcmp(src_addr, &this->src_addr, sizeof(*src_addr)); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return this; } return NULL; } static void __eht_destroy_host(struct net_bridge_group_eht_host *eht_host) { WARN_ON(!hlist_empty(&eht_host->set_entries)); br_multicast_eht_hosts_dec(eht_host->pg); rb_erase(&eht_host->rb_node, &eht_host->pg->eht_host_tree); RB_CLEAR_NODE(&eht_host->rb_node); kfree(eht_host); } static void br_multicast_destroy_eht_set_entry(struct net_bridge_mcast_gc *gc) { struct net_bridge_group_eht_set_entry *set_h; set_h = container_of(gc, struct net_bridge_group_eht_set_entry, mcast_gc); WARN_ON(!RB_EMPTY_NODE(&set_h->rb_node)); timer_shutdown_sync(&set_h->timer); kfree(set_h); } static void br_multicast_destroy_eht_set(struct net_bridge_mcast_gc *gc) { struct net_bridge_group_eht_set *eht_set; eht_set = container_of(gc, struct net_bridge_group_eht_set, mcast_gc); WARN_ON(!RB_EMPTY_NODE(&eht_set->rb_node)); WARN_ON(!RB_EMPTY_ROOT(&eht_set->entry_tree)); timer_shutdown_sync(&eht_set->timer); kfree(eht_set); } static void __eht_del_set_entry(struct net_bridge_group_eht_set_entry *set_h) { struct net_bridge_group_eht_host *eht_host = set_h->h_parent; union net_bridge_eht_addr zero_addr; rb_erase(&set_h->rb_node, &set_h->eht_set->entry_tree); RB_CLEAR_NODE(&set_h->rb_node); hlist_del_init(&set_h->host_list); memset(&zero_addr, 0, sizeof(zero_addr)); if (memcmp(&set_h->h_addr, &zero_addr, sizeof(zero_addr))) eht_host->num_entries--; hlist_add_head(&set_h->mcast_gc.gc_node, &set_h->br->mcast_gc_list); queue_work(system_long_wq, &set_h->br->mcast_gc_work); if (hlist_empty(&eht_host->set_entries)) __eht_destroy_host(eht_host); } static void br_multicast_del_eht_set(struct net_bridge_group_eht_set *eht_set) { struct net_bridge_group_eht_set_entry *set_h; struct rb_node *node; while ((node = rb_first(&eht_set->entry_tree))) { set_h = rb_entry(node, struct net_bridge_group_eht_set_entry, rb_node); __eht_del_set_entry(set_h); } rb_erase(&eht_set->rb_node, &eht_set->pg->eht_set_tree); RB_CLEAR_NODE(&eht_set->rb_node); hlist_add_head(&eht_set->mcast_gc.gc_node, &eht_set->br->mcast_gc_list); queue_work(system_long_wq, &eht_set->br->mcast_gc_work); } void br_multicast_eht_clean_sets(struct net_bridge_port_group *pg) { struct net_bridge_group_eht_set *eht_set; struct rb_node *node; while ((node = rb_first(&pg->eht_set_tree))) { eht_set = rb_entry(node, struct net_bridge_group_eht_set, rb_node); br_multicast_del_eht_set(eht_set); } } static void br_multicast_eht_set_entry_expired(struct timer_list *t) { struct net_bridge_group_eht_set_entry *set_h = from_timer(set_h, t, timer); struct net_bridge *br = set_h->br; spin_lock(&br->multicast_lock); if (RB_EMPTY_NODE(&set_h->rb_node) || timer_pending(&set_h->timer)) goto out; br_multicast_del_eht_set_entry(set_h->eht_set->pg, &set_h->eht_set->src_addr, &set_h->h_addr); out: spin_unlock(&br->multicast_lock); } static void br_multicast_eht_set_expired(struct timer_list *t) { struct net_bridge_group_eht_set *eht_set = from_timer(eht_set, t, timer); struct net_bridge *br = eht_set->br; spin_lock(&br->multicast_lock); if (RB_EMPTY_NODE(&eht_set->rb_node) || timer_pending(&eht_set->timer)) goto out; br_multicast_del_eht_set(eht_set); out: spin_unlock(&br->multicast_lock); } static struct net_bridge_group_eht_host * __eht_lookup_create_host(struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, unsigned char filter_mode) { struct rb_node **link = &pg->eht_host_tree.rb_node, *parent = NULL; struct net_bridge_group_eht_host *eht_host; while (*link) { struct net_bridge_group_eht_host *this; int result; this = rb_entry(*link, struct net_bridge_group_eht_host, rb_node); result = memcmp(h_addr, &this->h_addr, sizeof(*h_addr)); parent = *link; if (result < 0) link = &((*link)->rb_left); else if (result > 0) link = &((*link)->rb_right); else return this; } if (br_multicast_eht_hosts_over_limit(pg)) return NULL; eht_host = kzalloc(sizeof(*eht_host), GFP_ATOMIC); if (!eht_host) return NULL; memcpy(&eht_host->h_addr, h_addr, sizeof(*h_addr)); INIT_HLIST_HEAD(&eht_host->set_entries); eht_host->pg = pg; eht_host->filter_mode = filter_mode; rb_link_node(&eht_host->rb_node, parent, link); rb_insert_color(&eht_host->rb_node, &pg->eht_host_tree); br_multicast_eht_hosts_inc(pg); return eht_host; } static struct net_bridge_group_eht_set_entry * __eht_lookup_create_set_entry(struct net_bridge *br, struct net_bridge_group_eht_set *eht_set, struct net_bridge_group_eht_host *eht_host, bool allow_zero_src) { struct rb_node **link = &eht_set->entry_tree.rb_node, *parent = NULL; struct net_bridge_group_eht_set_entry *set_h; while (*link) { struct net_bridge_group_eht_set_entry *this; int result; this = rb_entry(*link, struct net_bridge_group_eht_set_entry, rb_node); result = memcmp(&eht_host->h_addr, &this->h_addr, sizeof(union net_bridge_eht_addr)); parent = *link; if (result < 0) link = &((*link)->rb_left); else if (result > 0) link = &((*link)->rb_right); else return this; } /* always allow auto-created zero entry */ if (!allow_zero_src && eht_host->num_entries >= PG_SRC_ENT_LIMIT) return NULL; set_h = kzalloc(sizeof(*set_h), GFP_ATOMIC); if (!set_h) return NULL; memcpy(&set_h->h_addr, &eht_host->h_addr, sizeof(union net_bridge_eht_addr)); set_h->mcast_gc.destroy = br_multicast_destroy_eht_set_entry; set_h->eht_set = eht_set; set_h->h_parent = eht_host; set_h->br = br; timer_setup(&set_h->timer, br_multicast_eht_set_entry_expired, 0); hlist_add_head(&set_h->host_list, &eht_host->set_entries); rb_link_node(&set_h->rb_node, parent, link); rb_insert_color(&set_h->rb_node, &eht_set->entry_tree); /* we must not count the auto-created zero entry otherwise we won't be * able to track the full list of PG_SRC_ENT_LIMIT entries */ if (!allow_zero_src) eht_host->num_entries++; return set_h; } static struct net_bridge_group_eht_set * __eht_lookup_create_set(struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr) { struct rb_node **link = &pg->eht_set_tree.rb_node, *parent = NULL; struct net_bridge_group_eht_set *eht_set; while (*link) { struct net_bridge_group_eht_set *this; int result; this = rb_entry(*link, struct net_bridge_group_eht_set, rb_node); result = memcmp(src_addr, &this->src_addr, sizeof(*src_addr)); parent = *link; if (result < 0) link = &((*link)->rb_left); else if (result > 0) link = &((*link)->rb_right); else return this; } eht_set = kzalloc(sizeof(*eht_set), GFP_ATOMIC); if (!eht_set) return NULL; memcpy(&eht_set->src_addr, src_addr, sizeof(*src_addr)); eht_set->mcast_gc.destroy = br_multicast_destroy_eht_set; eht_set->pg = pg; eht_set->br = pg->key.port->br; eht_set->entry_tree = RB_ROOT; timer_setup(&eht_set->timer, br_multicast_eht_set_expired, 0); rb_link_node(&eht_set->rb_node, parent, link); rb_insert_color(&eht_set->rb_node, &pg->eht_set_tree); return eht_set; } static void br_multicast_ip_src_to_eht_addr(const struct br_ip *src, union net_bridge_eht_addr *dest) { switch (src->proto) { case htons(ETH_P_IP): dest->ip4 = src->src.ip4; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): memcpy(&dest->ip6, &src->src.ip6, sizeof(struct in6_addr)); break; #endif } } static void br_eht_convert_host_filter_mode(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, int filter_mode) { struct net_bridge_group_eht_host *eht_host; union net_bridge_eht_addr zero_addr; eht_host = br_multicast_eht_host_lookup(pg, h_addr); if (eht_host) eht_host->filter_mode = filter_mode; memset(&zero_addr, 0, sizeof(zero_addr)); switch (filter_mode) { case MCAST_INCLUDE: br_multicast_del_eht_set_entry(pg, &zero_addr, h_addr); break; case MCAST_EXCLUDE: br_multicast_create_eht_set_entry(brmctx, pg, &zero_addr, h_addr, MCAST_EXCLUDE, true); break; } } static void br_multicast_create_eht_set_entry(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr, union net_bridge_eht_addr *h_addr, int filter_mode, bool allow_zero_src) { struct net_bridge_group_eht_set_entry *set_h; struct net_bridge_group_eht_host *eht_host; struct net_bridge *br = pg->key.port->br; struct net_bridge_group_eht_set *eht_set; union net_bridge_eht_addr zero_addr; memset(&zero_addr, 0, sizeof(zero_addr)); if (!allow_zero_src && !memcmp(src_addr, &zero_addr, sizeof(zero_addr))) return; eht_set = __eht_lookup_create_set(pg, src_addr); if (!eht_set) return; eht_host = __eht_lookup_create_host(pg, h_addr, filter_mode); if (!eht_host) goto fail_host; set_h = __eht_lookup_create_set_entry(br, eht_set, eht_host, allow_zero_src); if (!set_h) goto fail_set_entry; mod_timer(&set_h->timer, jiffies + br_multicast_gmi(brmctx)); mod_timer(&eht_set->timer, jiffies + br_multicast_gmi(brmctx)); return; fail_set_entry: if (hlist_empty(&eht_host->set_entries)) __eht_destroy_host(eht_host); fail_host: if (RB_EMPTY_ROOT(&eht_set->entry_tree)) br_multicast_del_eht_set(eht_set); } static bool br_multicast_del_eht_set_entry(struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr, union net_bridge_eht_addr *h_addr) { struct net_bridge_group_eht_set_entry *set_h; struct net_bridge_group_eht_set *eht_set; bool set_deleted = false; eht_set = br_multicast_eht_set_lookup(pg, src_addr); if (!eht_set) goto out; set_h = br_multicast_eht_set_entry_lookup(eht_set, h_addr); if (!set_h) goto out; __eht_del_set_entry(set_h); if (RB_EMPTY_ROOT(&eht_set->entry_tree)) { br_multicast_del_eht_set(eht_set); set_deleted = true; } out: return set_deleted; } static void br_multicast_del_eht_host(struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr) { struct net_bridge_group_eht_set_entry *set_h; struct net_bridge_group_eht_host *eht_host; struct hlist_node *tmp; eht_host = br_multicast_eht_host_lookup(pg, h_addr); if (!eht_host) return; hlist_for_each_entry_safe(set_h, tmp, &eht_host->set_entries, host_list) br_multicast_del_eht_set_entry(set_h->eht_set->pg, &set_h->eht_set->src_addr, &set_h->h_addr); } /* create new set entries from reports */ static void __eht_create_set_entries(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int filter_mode) { union net_bridge_eht_addr eht_src_addr; u32 src_idx; memset(&eht_src_addr, 0, sizeof(eht_src_addr)); for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size); br_multicast_create_eht_set_entry(brmctx, pg, &eht_src_addr, h_addr, filter_mode, false); } } /* delete existing set entries and their (S,G) entries if they were the last */ static bool __eht_del_set_entries(struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, size_t addr_size) { union net_bridge_eht_addr eht_src_addr; struct net_bridge_group_src *src_ent; bool changed = false; struct br_ip src_ip; u32 src_idx; memset(&eht_src_addr, 0, sizeof(eht_src_addr)); memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size); if (!br_multicast_del_eht_set_entry(pg, &eht_src_addr, h_addr)) continue; memcpy(&src_ip, srcs + (src_idx * addr_size), addr_size); src_ent = br_multicast_find_group_src(pg, &src_ip); if (!src_ent) continue; br_multicast_del_group_src(src_ent, true); changed = true; } return changed; } static bool br_multicast_eht_allow(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, size_t addr_size) { bool changed = false; switch (br_multicast_eht_host_filter_mode(pg, h_addr)) { case MCAST_INCLUDE: __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, addr_size, MCAST_INCLUDE); break; case MCAST_EXCLUDE: changed = __eht_del_set_entries(pg, h_addr, srcs, nsrcs, addr_size); break; } return changed; } static bool br_multicast_eht_block(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, size_t addr_size) { bool changed = false; switch (br_multicast_eht_host_filter_mode(pg, h_addr)) { case MCAST_INCLUDE: changed = __eht_del_set_entries(pg, h_addr, srcs, nsrcs, addr_size); break; case MCAST_EXCLUDE: __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, addr_size, MCAST_EXCLUDE); break; } return changed; } /* flush_entries is true when changing mode */ static bool __eht_inc_exc(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, size_t addr_size, unsigned char filter_mode, bool to_report) { bool changed = false, flush_entries = to_report; union net_bridge_eht_addr eht_src_addr; if (br_multicast_eht_host_filter_mode(pg, h_addr) != filter_mode) flush_entries = true; memset(&eht_src_addr, 0, sizeof(eht_src_addr)); /* if we're changing mode del host and its entries */ if (flush_entries) br_multicast_del_eht_host(pg, h_addr); __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, addr_size, filter_mode); /* we can be missing sets only if we've deleted some entries */ if (flush_entries) { struct net_bridge_group_eht_set *eht_set; struct net_bridge_group_src *src_ent; struct hlist_node *tmp; hlist_for_each_entry_safe(src_ent, tmp, &pg->src_list, node) { br_multicast_ip_src_to_eht_addr(&src_ent->addr, &eht_src_addr); if (!br_multicast_eht_set_lookup(pg, &eht_src_addr)) { br_multicast_del_group_src(src_ent, true); changed = true; continue; } /* this is an optimization for TO_INCLUDE where we lower * the set's timeout to LMQT to catch timeout hosts: * - host A (timing out): set entries X, Y * - host B: set entry Z (new from current TO_INCLUDE) * sends BLOCK Z after LMQT but host A's EHT * entries still exist (unless lowered to LMQT * so they can timeout with the S,Gs) * => we wait another LMQT, when we can just delete the * group immediately */ if (!(src_ent->flags & BR_SGRP_F_SEND) || filter_mode != MCAST_INCLUDE || !to_report) continue; eht_set = br_multicast_eht_set_lookup(pg, &eht_src_addr); if (!eht_set) continue; mod_timer(&eht_set->timer, jiffies + br_multicast_lmqt(brmctx)); } } return changed; } static bool br_multicast_eht_inc(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, size_t addr_size, bool to_report) { bool changed; changed = __eht_inc_exc(brmctx, pg, h_addr, srcs, nsrcs, addr_size, MCAST_INCLUDE, to_report); br_eht_convert_host_filter_mode(brmctx, pg, h_addr, MCAST_INCLUDE); return changed; } static bool br_multicast_eht_exc(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, size_t addr_size, bool to_report) { bool changed; changed = __eht_inc_exc(brmctx, pg, h_addr, srcs, nsrcs, addr_size, MCAST_EXCLUDE, to_report); br_eht_convert_host_filter_mode(brmctx, pg, h_addr, MCAST_EXCLUDE); return changed; } static bool __eht_ip4_handle(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, int grec_type) { bool changed = false, to_report = false; switch (grec_type) { case IGMPV3_ALLOW_NEW_SOURCES: br_multicast_eht_allow(brmctx, pg, h_addr, srcs, nsrcs, sizeof(__be32)); break; case IGMPV3_BLOCK_OLD_SOURCES: changed = br_multicast_eht_block(brmctx, pg, h_addr, srcs, nsrcs, sizeof(__be32)); break; case IGMPV3_CHANGE_TO_INCLUDE: to_report = true; fallthrough; case IGMPV3_MODE_IS_INCLUDE: changed = br_multicast_eht_inc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(__be32), to_report); break; case IGMPV3_CHANGE_TO_EXCLUDE: to_report = true; fallthrough; case IGMPV3_MODE_IS_EXCLUDE: changed = br_multicast_eht_exc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(__be32), to_report); break; } return changed; } #if IS_ENABLED(CONFIG_IPV6) static bool __eht_ip6_handle(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, int grec_type) { bool changed = false, to_report = false; switch (grec_type) { case MLD2_ALLOW_NEW_SOURCES: br_multicast_eht_allow(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr)); break; case MLD2_BLOCK_OLD_SOURCES: changed = br_multicast_eht_block(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr)); break; case MLD2_CHANGE_TO_INCLUDE: to_report = true; fallthrough; case MLD2_MODE_IS_INCLUDE: changed = br_multicast_eht_inc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr), to_report); break; case MLD2_CHANGE_TO_EXCLUDE: to_report = true; fallthrough; case MLD2_MODE_IS_EXCLUDE: changed = br_multicast_eht_exc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr), to_report); break; } return changed; } #endif /* true means an entry was deleted */ bool br_multicast_eht_handle(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { bool eht_enabled = !!(pg->key.port->flags & BR_MULTICAST_FAST_LEAVE); union net_bridge_eht_addr eht_host_addr; bool changed = false; if (!eht_enabled) goto out; memset(&eht_host_addr, 0, sizeof(eht_host_addr)); memcpy(&eht_host_addr, h_addr, addr_size); if (addr_size == sizeof(__be32)) changed = __eht_ip4_handle(brmctx, pg, &eht_host_addr, srcs, nsrcs, grec_type); #if IS_ENABLED(CONFIG_IPV6) else changed = __eht_ip6_handle(brmctx, pg, &eht_host_addr, srcs, nsrcs, grec_type); #endif out: return changed; } int br_multicast_eht_set_hosts_limit(struct net_bridge_port *p, u32 eht_hosts_limit) { struct net_bridge *br = p->br; if (!eht_hosts_limit) return -EINVAL; spin_lock_bh(&br->multicast_lock); p->multicast_eht_hosts_limit = eht_hosts_limit; spin_unlock_bh(&br->multicast_lock); return 0; }
24 24 24 11 23 17 13 24 5 5 5 2 5 5 1 1 4 4 1 2 2 4 3 3 3 3 1 1 1 7 7 7 3 3 2 1 7 7 7 3 3 2 6 1 1 1 1 1 3 3 5 5 1 3 2 1 1 2 2 20 19 6 19 20 20 10 8 19 1 1 1 5 20 3 19 2 19 2 19 19 2 17 14 3 19 19 15 6 4 4 4 4 15 4 16 4 4 3 3 29 31 31 31 3 3 1 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 // SPDX-License-Identifier: GPL-2.0-only /* * Off-channel operation helpers * * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi> * Copyright 2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2019, 2022-2024 Intel Corporation */ #include <linux/export.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" /* * Tell our hardware to disable PS. * Optionally inform AP that we will go to sleep so that it will buffer * the frames while we are doing off-channel work. This is optional * because we *may* be doing work on-operating channel, and want our * hardware unconditionally awake, but still let the AP send us normal frames. */ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; bool offchannel_ps_enabled = false; /* FIXME: what to do when local->pspolling is true? */ del_timer_sync(&local->dynamic_ps_timer); del_timer_sync(&ifmgd->bcn_mon_timer); del_timer_sync(&ifmgd->conn_mon_timer); wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work); if (local->hw.conf.flags & IEEE80211_CONF_PS) { offchannel_ps_enabled = true; local->hw.conf.flags &= ~IEEE80211_CONF_PS; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); } if (!offchannel_ps_enabled || !ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) /* * If power save was enabled, no need to send a nullfunc * frame because AP knows that we are sleeping. But if the * hardware is creating the nullfunc frame for power save * status (ie. IEEE80211_HW_PS_NULLFUNC_STACK is not * enabled) and power save was enabled, the firmware just * sent a null frame with power save disabled. So we need * to send a new nullfunc frame to inform the AP that we * are again sleeping. */ ieee80211_send_nullfunc(local, sdata, true); } /* inform AP that we are awake again */ static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; if (!local->ps_sdata) ieee80211_send_nullfunc(local, sdata, false); else if (local->hw.conf.dynamic_ps_timeout > 0) { /* * the dynamic_ps_timer had been running before leaving the * operating channel, restart the timer now and send a nullfunc * frame to inform the AP that we are awake so that AP sends * the buffered packets (if any). */ ieee80211_send_nullfunc(local, sdata, false); mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout)); } ieee80211_sta_reset_beacon_monitor(sdata); ieee80211_sta_reset_conn_monitor(sdata); } void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(!local->emulate_chanctx)) return; /* * notify the AP about us leaving the channel and stop all * STA interfaces. */ /* * Stop queues and transmit all frames queued by the driver * before sending nullfunc to enable powersave at the AP. */ ieee80211_stop_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL, false); ieee80211_flush_queues(local, NULL, false); list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || sdata->vif.type == NL80211_IFTYPE_NAN) continue; if (sdata->vif.type != NL80211_IFTYPE_MONITOR) set_bit(SDATA_STATE_OFFCHANNEL, &sdata->state); /* Check to see if we should disable beaconing. */ if (sdata->vif.bss_conf.enable_beacon) { set_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); sdata->vif.bss_conf.enable_beacon = false; ieee80211_link_info_change_notify( sdata, &sdata->deflink, BSS_CHANGED_BEACON_ENABLED); } if (sdata->vif.type == NL80211_IFTYPE_STATION && sdata->u.mgd.associated) ieee80211_offchannel_ps_enable(sdata); } } void ieee80211_offchannel_return(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(!local->emulate_chanctx)) return; list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE) continue; if (sdata->vif.type != NL80211_IFTYPE_MONITOR) clear_bit(SDATA_STATE_OFFCHANNEL, &sdata->state); if (!ieee80211_sdata_running(sdata)) continue; /* Tell AP we're back */ if (sdata->vif.type == NL80211_IFTYPE_STATION && sdata->u.mgd.associated) ieee80211_offchannel_ps_disable(sdata); if (test_and_clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state)) { sdata->vif.bss_conf.enable_beacon = true; ieee80211_link_info_change_notify( sdata, &sdata->deflink, BSS_CHANGED_BEACON_ENABLED); } } ieee80211_wake_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL, false); } static void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc) { /* was never transmitted */ if (roc->frame) { cfg80211_mgmt_tx_status(&roc->sdata->wdev, roc->mgmt_tx_cookie, roc->frame->data, roc->frame->len, false, GFP_KERNEL); ieee80211_free_txskb(&roc->sdata->local->hw, roc->frame); } if (!roc->mgmt_tx_cookie) cfg80211_remain_on_channel_expired(&roc->sdata->wdev, roc->cookie, roc->chan, GFP_KERNEL); else cfg80211_tx_mgmt_expired(&roc->sdata->wdev, roc->mgmt_tx_cookie, roc->chan, GFP_KERNEL); list_del(&roc->list); kfree(roc); } static unsigned long ieee80211_end_finished_rocs(struct ieee80211_local *local, unsigned long now) { struct ieee80211_roc_work *roc, *tmp; long remaining_dur_min = LONG_MAX; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { long remaining; if (!roc->started) break; remaining = roc->start_time + msecs_to_jiffies(roc->duration) - now; /* In case of HW ROC, it is possible that the HW finished the * ROC session before the actual requested time. In such a case * end the ROC session (disregarding the remaining time). */ if (roc->abort || roc->hw_begun || remaining <= 0) ieee80211_roc_notify_destroy(roc); else remaining_dur_min = min(remaining_dur_min, remaining); } return remaining_dur_min; } static bool ieee80211_recalc_sw_work(struct ieee80211_local *local, unsigned long now) { long dur = ieee80211_end_finished_rocs(local, now); if (dur == LONG_MAX) return false; wiphy_delayed_work_queue(local->hw.wiphy, &local->roc_work, dur); return true; } static void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc, unsigned long start_time) { if (WARN_ON(roc->notified)) return; roc->start_time = start_time; roc->started = true; if (roc->mgmt_tx_cookie) { if (!WARN_ON(!roc->frame)) { ieee80211_tx_skb_tid_band(roc->sdata, roc->frame, 7, roc->chan->band); roc->frame = NULL; } } else { cfg80211_ready_on_channel(&roc->sdata->wdev, roc->cookie, roc->chan, roc->req_duration, GFP_KERNEL); } roc->notified = true; } static void ieee80211_hw_roc_start(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, hw_roc_start); struct ieee80211_roc_work *roc; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry(roc, &local->roc_list, list) { if (!roc->started) break; roc->hw_begun = true; ieee80211_handle_roc_started(roc, local->hw_roc_start_time); } } void ieee80211_ready_on_channel(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); local->hw_roc_start_time = jiffies; trace_api_ready_on_channel(local); wiphy_work_queue(hw->wiphy, &local->hw_roc_start); } EXPORT_SYMBOL_GPL(ieee80211_ready_on_channel); static void _ieee80211_start_next_roc(struct ieee80211_local *local) { struct ieee80211_roc_work *roc, *tmp; enum ieee80211_roc_type type; u32 min_dur, max_dur; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(list_empty(&local->roc_list))) return; roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work, list); if (WARN_ON(roc->started)) return; min_dur = roc->duration; max_dur = roc->duration; type = roc->type; list_for_each_entry(tmp, &local->roc_list, list) { if (tmp == roc) continue; if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) break; max_dur = max(tmp->duration, max_dur); min_dur = min(tmp->duration, min_dur); type = max(tmp->type, type); } if (local->ops->remain_on_channel) { int ret = drv_remain_on_channel(local, roc->sdata, roc->chan, max_dur, type); if (ret) { wiphy_warn(local->hw.wiphy, "failed to start next HW ROC (%d)\n", ret); /* * queue the work struct again to avoid recursion * when multiple failures occur */ list_for_each_entry(tmp, &local->roc_list, list) { if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) break; tmp->started = true; tmp->abort = true; } wiphy_work_queue(local->hw.wiphy, &local->hw_roc_done); return; } /* we'll notify about the start once the HW calls back */ list_for_each_entry(tmp, &local->roc_list, list) { if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) break; tmp->started = true; } } else { /* If actually operating on the desired channel (with at least * 20 MHz channel width) don't stop all the operations but still * treat it as though the ROC operation started properly, so * other ROC operations won't interfere with this one. * * Note: scan can't run, tmp_channel is what we use, so this * must be the currently active channel. */ roc->on_channel = roc->chan == local->hw.conf.chandef.chan && local->hw.conf.chandef.width != NL80211_CHAN_WIDTH_5 && local->hw.conf.chandef.width != NL80211_CHAN_WIDTH_10; /* start this ROC */ ieee80211_recalc_idle(local); if (!roc->on_channel) { ieee80211_offchannel_stop_vifs(local); local->tmp_channel = roc->chan; ieee80211_hw_conf_chan(local); } wiphy_delayed_work_queue(local->hw.wiphy, &local->roc_work, msecs_to_jiffies(min_dur)); /* tell userspace or send frame(s) */ list_for_each_entry(tmp, &local->roc_list, list) { if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) break; tmp->on_channel = roc->on_channel; ieee80211_handle_roc_started(tmp, jiffies); } } } void ieee80211_start_next_roc(struct ieee80211_local *local) { struct ieee80211_roc_work *roc; lockdep_assert_wiphy(local->hw.wiphy); if (list_empty(&local->roc_list)) { ieee80211_run_deferred_scan(local); return; } /* defer roc if driver is not started (i.e. during reconfig) */ if (local->in_reconfig) return; roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work, list); if (WARN_ON_ONCE(roc->started)) return; if (local->ops->remain_on_channel) { _ieee80211_start_next_roc(local); } else { /* delay it a bit */ wiphy_delayed_work_queue(local->hw.wiphy, &local->roc_work, round_jiffies_relative(HZ / 2)); } } void ieee80211_reconfig_roc(struct ieee80211_local *local) { struct ieee80211_roc_work *roc, *tmp; /* * In the software implementation can just continue with the * interruption due to reconfig, roc_work is still queued if * needed. */ if (!local->ops->remain_on_channel) return; /* flush work so nothing from the driver is still pending */ wiphy_work_flush(local->hw.wiphy, &local->hw_roc_start); wiphy_work_flush(local->hw.wiphy, &local->hw_roc_done); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (!roc->started) break; if (!roc->hw_begun) { /* it didn't start in HW yet, so we can restart it */ roc->started = false; continue; } /* otherwise destroy it and tell userspace */ ieee80211_roc_notify_destroy(roc); } ieee80211_start_next_roc(local); } static void __ieee80211_roc_work(struct ieee80211_local *local) { struct ieee80211_roc_work *roc; bool on_channel; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(local->ops->remain_on_channel)) return; roc = list_first_entry_or_null(&local->roc_list, struct ieee80211_roc_work, list); if (!roc) return; if (!roc->started) { WARN_ON(!local->emulate_chanctx); _ieee80211_start_next_roc(local); } else { on_channel = roc->on_channel; if (ieee80211_recalc_sw_work(local, jiffies)) return; /* careful - roc pointer became invalid during recalc */ if (!on_channel) { ieee80211_flush_queues(local, NULL, false); local->tmp_channel = NULL; ieee80211_hw_conf_chan(local); ieee80211_offchannel_return(local); } ieee80211_recalc_idle(local); ieee80211_start_next_roc(local); } } static void ieee80211_roc_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, roc_work.work); lockdep_assert_wiphy(local->hw.wiphy); __ieee80211_roc_work(local); } static void ieee80211_hw_roc_done(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, hw_roc_done); lockdep_assert_wiphy(local->hw.wiphy); ieee80211_end_finished_rocs(local, jiffies); /* if there's another roc, start it now */ ieee80211_start_next_roc(local); } void ieee80211_remain_on_channel_expired(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); trace_api_remain_on_channel_expired(local); wiphy_work_queue(hw->wiphy, &local->hw_roc_done); } EXPORT_SYMBOL_GPL(ieee80211_remain_on_channel_expired); static bool ieee80211_coalesce_hw_started_roc(struct ieee80211_local *local, struct ieee80211_roc_work *new_roc, struct ieee80211_roc_work *cur_roc) { unsigned long now = jiffies; unsigned long remaining; if (WARN_ON(!cur_roc->started)) return false; /* if it was scheduled in the hardware, but not started yet, * we can only combine if the older one had a longer duration */ if (!cur_roc->hw_begun && new_roc->duration > cur_roc->duration) return false; remaining = cur_roc->start_time + msecs_to_jiffies(cur_roc->duration) - now; /* if it doesn't fit entirely, schedule a new one */ if (new_roc->duration > jiffies_to_msecs(remaining)) return false; /* add just after the current one so we combine their finish later */ list_add(&new_roc->list, &cur_roc->list); /* if the existing one has already begun then let this one also * begin, otherwise they'll both be marked properly by the work * struct that runs once the driver notifies us of the beginning */ if (cur_roc->hw_begun) { new_roc->hw_begun = true; ieee80211_handle_roc_started(new_roc, now); } return true; } static int ieee80211_start_roc_work(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, unsigned int duration, u64 *cookie, struct sk_buff *txskb, enum ieee80211_roc_type type) { struct ieee80211_roc_work *roc, *tmp; bool queued = false, combine_started = true; int ret; lockdep_assert_wiphy(local->hw.wiphy); if (channel->freq_offset) /* this may work, but is untested */ return -EOPNOTSUPP; if (!local->emulate_chanctx && !local->ops->remain_on_channel) return -EOPNOTSUPP; roc = kzalloc(sizeof(*roc), GFP_KERNEL); if (!roc) return -ENOMEM; /* * If the duration is zero, then the driver * wouldn't actually do anything. Set it to * 10 for now. * * TODO: cancel the off-channel operation * when we get the SKB's TX status and * the wait time was zero before. */ if (!duration) duration = 10; roc->chan = channel; roc->duration = duration; roc->req_duration = duration; roc->frame = txskb; roc->type = type; roc->sdata = sdata; /* * cookie is either the roc cookie (for normal roc) * or the SKB (for mgmt TX) */ if (!txskb) { roc->cookie = ieee80211_mgmt_tx_cookie(local); *cookie = roc->cookie; } else { roc->mgmt_tx_cookie = *cookie; } /* if there's no need to queue, handle it immediately */ if (list_empty(&local->roc_list) && !local->scanning && !ieee80211_is_radar_required(local)) { /* if not HW assist, just queue & schedule work */ if (!local->ops->remain_on_channel) { list_add_tail(&roc->list, &local->roc_list); wiphy_delayed_work_queue(local->hw.wiphy, &local->roc_work, 0); } else { /* otherwise actually kick it off here * (for error handling) */ ret = drv_remain_on_channel(local, sdata, channel, duration, type); if (ret) { kfree(roc); return ret; } roc->started = true; list_add_tail(&roc->list, &local->roc_list); } return 0; } /* otherwise handle queueing */ list_for_each_entry(tmp, &local->roc_list, list) { if (tmp->chan != channel || tmp->sdata != sdata) continue; /* * Extend this ROC if possible: If it hasn't started, add * just after the new one to combine. */ if (!tmp->started) { list_add(&roc->list, &tmp->list); queued = true; break; } if (!combine_started) continue; if (!local->ops->remain_on_channel) { /* If there's no hardware remain-on-channel, and * doing so won't push us over the maximum r-o-c * we allow, then we can just add the new one to * the list and mark it as having started now. * If it would push over the limit, don't try to * combine with other started ones (that haven't * been running as long) but potentially sort it * with others that had the same fate. */ unsigned long now = jiffies; u32 elapsed = jiffies_to_msecs(now - tmp->start_time); struct wiphy *wiphy = local->hw.wiphy; u32 max_roc = wiphy->max_remain_on_channel_duration; if (elapsed + roc->duration > max_roc) { combine_started = false; continue; } list_add(&roc->list, &tmp->list); queued = true; roc->on_channel = tmp->on_channel; ieee80211_handle_roc_started(roc, now); ieee80211_recalc_sw_work(local, now); break; } queued = ieee80211_coalesce_hw_started_roc(local, roc, tmp); if (queued) break; /* if it wasn't queued, perhaps it can be combined with * another that also couldn't get combined previously, * but no need to check for already started ones, since * that can't work. */ combine_started = false; } if (!queued) list_add_tail(&roc->list, &local->roc_list); return 0; } int ieee80211_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev, struct ieee80211_channel *chan, unsigned int duration, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); return ieee80211_start_roc_work(local, sdata, chan, duration, cookie, NULL, IEEE80211_ROC_TYPE_NORMAL); } static int ieee80211_cancel_roc(struct ieee80211_local *local, u64 cookie, bool mgmt_tx) { struct ieee80211_roc_work *roc, *tmp, *found = NULL; int ret; lockdep_assert_wiphy(local->hw.wiphy); if (!cookie) return -ENOENT; wiphy_work_flush(local->hw.wiphy, &local->hw_roc_start); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (!mgmt_tx && roc->cookie != cookie) continue; else if (mgmt_tx && roc->mgmt_tx_cookie != cookie) continue; found = roc; break; } if (!found) { return -ENOENT; } if (!found->started) { ieee80211_roc_notify_destroy(found); goto out_unlock; } if (local->ops->remain_on_channel) { ret = drv_cancel_remain_on_channel(local, roc->sdata); if (WARN_ON_ONCE(ret)) { return ret; } /* * We could be racing against the notification from the driver: * + driver is handling the notification on CPU0 * + user space is cancelling the remain on channel and * schedules the hw_roc_done worker. * * Now hw_roc_done might start to run after the next roc will * start and mac80211 will think that this second roc has * ended prematurely. * Cancel the work to make sure that all the pending workers * have completed execution. * Note that this assumes that by the time the driver returns * from drv_cancel_remain_on_channel, it has completed all * the processing of related notifications. */ wiphy_work_cancel(local->hw.wiphy, &local->hw_roc_done); /* TODO: * if multiple items were combined here then we really shouldn't * cancel them all - we should wait for as much time as needed * for the longest remaining one, and only then cancel ... */ list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (!roc->started) break; if (roc == found) found = NULL; ieee80211_roc_notify_destroy(roc); } /* that really must not happen - it was started */ WARN_ON(found); ieee80211_start_next_roc(local); } else { /* go through work struct to return to the operating channel */ found->abort = true; wiphy_delayed_work_queue(local->hw.wiphy, &local->roc_work, 0); } out_unlock: return 0; } int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = sdata->local; return ieee80211_cancel_roc(local, cookie, false); } int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct sta_info *sta = NULL; const struct ieee80211_mgmt *mgmt = (void *)params->buf; bool need_offchan = false; bool mlo_sta = false; int link_id = -1; u32 flags; int ret; u8 *data; lockdep_assert_wiphy(local->hw.wiphy); if (params->dont_wait_for_ack) flags = IEEE80211_TX_CTL_NO_ACK; else flags = IEEE80211_TX_INTFL_NL80211_FRAME_TX | IEEE80211_TX_CTL_REQ_TX_STATUS; if (params->no_cck) flags |= IEEE80211_TX_CTL_NO_CCK_RATE; switch (sdata->vif.type) { case NL80211_IFTYPE_ADHOC: if (!sdata->vif.cfg.ibss_joined) need_offchan = true; #ifdef CONFIG_MAC80211_MESH fallthrough; case NL80211_IFTYPE_MESH_POINT: if (ieee80211_vif_is_mesh(&sdata->vif) && !sdata->u.mesh.mesh_id_len) need_offchan = true; #endif fallthrough; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_P2P_GO: if (sdata->vif.type != NL80211_IFTYPE_ADHOC && !ieee80211_vif_is_mesh(&sdata->vif) && !sdata->bss->active) need_offchan = true; rcu_read_lock(); sta = sta_info_get_bss(sdata, mgmt->da); mlo_sta = sta && sta->sta.mlo; if (!ieee80211_is_action(mgmt->frame_control) || mgmt->u.action.category == WLAN_CATEGORY_PUBLIC || mgmt->u.action.category == WLAN_CATEGORY_SELF_PROTECTED || mgmt->u.action.category == WLAN_CATEGORY_SPECTRUM_MGMT) { rcu_read_unlock(); break; } if (!sta) { rcu_read_unlock(); return -ENOLINK; } if (params->link_id >= 0 && !(sta->sta.valid_links & BIT(params->link_id))) { rcu_read_unlock(); return -ENOLINK; } link_id = params->link_id; rcu_read_unlock(); break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: if (!sdata->u.mgd.associated || (params->offchan && params->wait && local->ops->remain_on_channel && memcmp(sdata->vif.cfg.ap_addr, mgmt->bssid, ETH_ALEN))) { need_offchan = true; } else if (sdata->u.mgd.associated && ether_addr_equal(sdata->vif.cfg.ap_addr, mgmt->da)) { sta = sta_info_get_bss(sdata, mgmt->da); mlo_sta = sta && sta->sta.mlo; } break; case NL80211_IFTYPE_P2P_DEVICE: need_offchan = true; break; case NL80211_IFTYPE_NAN: default: return -EOPNOTSUPP; } /* configurations requiring offchan cannot work if no channel has been * specified */ if (need_offchan && !params->chan) return -EINVAL; /* Check if the operating channel is the requested channel */ if (!params->chan && mlo_sta) { need_offchan = false; } else if (!need_offchan) { struct ieee80211_chanctx_conf *chanctx_conf = NULL; int i; rcu_read_lock(); /* Check all the links first */ for (i = 0; i < ARRAY_SIZE(sdata->vif.link_conf); i++) { struct ieee80211_bss_conf *conf; conf = rcu_dereference(sdata->vif.link_conf[i]); if (!conf) continue; chanctx_conf = rcu_dereference(conf->chanctx_conf); if (!chanctx_conf) continue; if (mlo_sta && params->chan == chanctx_conf->def.chan && ether_addr_equal(sdata->vif.addr, mgmt->sa)) { link_id = i; break; } if (ether_addr_equal(conf->addr, mgmt->sa)) { /* If userspace requested Tx on a specific link * use the same link id if the link bss is matching * the requested chan. */ if (sdata->vif.valid_links && params->link_id >= 0 && params->link_id == i && params->chan == chanctx_conf->def.chan) link_id = i; break; } chanctx_conf = NULL; } if (chanctx_conf) { need_offchan = params->chan && (params->chan != chanctx_conf->def.chan); } else { need_offchan = true; } rcu_read_unlock(); } if (need_offchan && !params->offchan) { ret = -EBUSY; goto out_unlock; } skb = dev_alloc_skb(local->hw.extra_tx_headroom + params->len); if (!skb) { ret = -ENOMEM; goto out_unlock; } skb_reserve(skb, local->hw.extra_tx_headroom); data = skb_put_data(skb, params->buf, params->len); /* Update CSA counters */ if (sdata->vif.bss_conf.csa_active && (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_MESH_POINT || sdata->vif.type == NL80211_IFTYPE_ADHOC) && params->n_csa_offsets) { int i; struct beacon_data *beacon = NULL; rcu_read_lock(); if (sdata->vif.type == NL80211_IFTYPE_AP) beacon = rcu_dereference(sdata->deflink.u.ap.beacon); else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) beacon = rcu_dereference(sdata->u.ibss.presp); else if (ieee80211_vif_is_mesh(&sdata->vif)) beacon = rcu_dereference(sdata->u.mesh.beacon); if (beacon) for (i = 0; i < params->n_csa_offsets; i++) data[params->csa_offsets[i]] = beacon->cntdwn_current_counter; rcu_read_unlock(); } IEEE80211_SKB_CB(skb)->flags = flags; IEEE80211_SKB_CB(skb)->control.flags |= IEEE80211_TX_CTRL_DONT_USE_RATE_MASK; skb->dev = sdata->dev; if (!params->dont_wait_for_ack) { /* make a copy to preserve the frame contents * in case of encryption. */ ret = ieee80211_attach_ack_skb(local, skb, cookie, GFP_KERNEL); if (ret) { kfree_skb(skb); goto out_unlock; } } else { /* Assign a dummy non-zero cookie, it's not sent to * userspace in this case but we rely on its value * internally in the need_offchan case to distinguish * mgmt-tx from remain-on-channel. */ *cookie = 0xffffffff; } if (!need_offchan) { ieee80211_tx_skb_tid(sdata, skb, 7, link_id); ret = 0; goto out_unlock; } IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_TX_OFFCHAN | IEEE80211_TX_INTFL_OFFCHAN_TX_OK; if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) IEEE80211_SKB_CB(skb)->hw_queue = local->hw.offchannel_tx_hw_queue; /* This will handle all kinds of coalescing and immediate TX */ ret = ieee80211_start_roc_work(local, sdata, params->chan, params->wait, cookie, skb, IEEE80211_ROC_TYPE_MGMT_TX); if (ret) ieee80211_free_txskb(&local->hw, skb); out_unlock: return ret; } int ieee80211_mgmt_tx_cancel_wait(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie) { struct ieee80211_local *local = wiphy_priv(wiphy); return ieee80211_cancel_roc(local, cookie, true); } void ieee80211_roc_setup(struct ieee80211_local *local) { wiphy_work_init(&local->hw_roc_start, ieee80211_hw_roc_start); wiphy_work_init(&local->hw_roc_done, ieee80211_hw_roc_done); wiphy_delayed_work_init(&local->roc_work, ieee80211_roc_work); INIT_LIST_HEAD(&local->roc_list); } void ieee80211_roc_purge(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { struct ieee80211_roc_work *roc, *tmp; bool work_to_do = false; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (sdata && roc->sdata != sdata) continue; if (roc->started) { if (local->ops->remain_on_channel) { /* can race, so ignore return value */ drv_cancel_remain_on_channel(local, roc->sdata); ieee80211_roc_notify_destroy(roc); } else { roc->abort = true; work_to_do = true; } } else { ieee80211_roc_notify_destroy(roc); } } if (work_to_do) __ieee80211_roc_work(local); }
100 92 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _DELAYED_CALL_H #define _DELAYED_CALL_H /* * Poor man's closures; I wish we could've done them sanely polymorphic, * but... */ struct delayed_call { void (*fn)(void *); void *arg; }; #define DEFINE_DELAYED_CALL(name) struct delayed_call name = {NULL, NULL} /* I really wish we had closures with sane typechecking... */ static inline void set_delayed_call(struct delayed_call *call, void (*fn)(void *), void *arg) { call->fn = fn; call->arg = arg; } static inline void do_delayed_call(struct delayed_call *call) { if (call->fn) call->fn(call->arg); } static inline void clear_delayed_call(struct delayed_call *call) { call->fn = NULL; } #endif
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (C) 2011 Instituto Nokia de Tecnologia * Copyright (C) 2014 Marvell International Ltd. * * Authors: * Lauro Ramos Venancio <lauro.venancio@openbossa.org> * Aloisio Almeida Jr <aloisio.almeida@openbossa.org> */ #ifndef __NET_NFC_H #define __NET_NFC_H #include <linux/nfc.h> #include <linux/device.h> #include <linux/skbuff.h> #define nfc_dbg(dev, fmt, ...) dev_dbg((dev), "NFC: " fmt, ##__VA_ARGS__) #define nfc_info(dev, fmt, ...) dev_info((dev), "NFC: " fmt, ##__VA_ARGS__) #define nfc_err(dev, fmt, ...) dev_err((dev), "NFC: " fmt, ##__VA_ARGS__) struct nfc_phy_ops { int (*write)(void *dev_id, struct sk_buff *skb); int (*enable)(void *dev_id); void (*disable)(void *dev_id); }; struct nfc_dev; /** * data_exchange_cb_t - Definition of nfc_data_exchange callback * * @context: nfc_data_exchange cb_context parameter * @skb: response data * @err: If an error has occurred during data exchange, it is the * error number. Zero means no error. * * When a rx or tx package is lost or corrupted or the target gets out * of the operating field, err is -EIO. */ typedef void (*data_exchange_cb_t)(void *context, struct sk_buff *skb, int err); typedef void (*se_io_cb_t)(void *context, u8 *apdu, size_t apdu_len, int err); struct nfc_target; struct nfc_ops { int (*dev_up)(struct nfc_dev *dev); int (*dev_down)(struct nfc_dev *dev); int (*start_poll)(struct nfc_dev *dev, u32 im_protocols, u32 tm_protocols); void (*stop_poll)(struct nfc_dev *dev); int (*dep_link_up)(struct nfc_dev *dev, struct nfc_target *target, u8 comm_mode, u8 *gb, size_t gb_len); int (*dep_link_down)(struct nfc_dev *dev); int (*activate_target)(struct nfc_dev *dev, struct nfc_target *target, u32 protocol); void (*deactivate_target)(struct nfc_dev *dev, struct nfc_target *target, u8 mode); int (*im_transceive)(struct nfc_dev *dev, struct nfc_target *target, struct sk_buff *skb, data_exchange_cb_t cb, void *cb_context); int (*tm_send)(struct nfc_dev *dev, struct sk_buff *skb); int (*check_presence)(struct nfc_dev *dev, struct nfc_target *target); int (*fw_download)(struct nfc_dev *dev, const char *firmware_name); /* Secure Element API */ int (*discover_se)(struct nfc_dev *dev); int (*enable_se)(struct nfc_dev *dev, u32 se_idx); int (*disable_se)(struct nfc_dev *dev, u32 se_idx); int (*se_io) (struct nfc_dev *dev, u32 se_idx, u8 *apdu, size_t apdu_length, se_io_cb_t cb, void *cb_context); }; #define NFC_TARGET_IDX_ANY -1 #define NFC_MAX_GT_LEN 48 #define NFC_ATR_RES_GT_OFFSET 15 #define NFC_ATR_REQ_GT_OFFSET 14 /** * struct nfc_target - NFC target description * * @sens_res: 2 bytes describing the target SENS_RES response, if the target * is a type A one. The %sens_res most significant byte must be byte 2 * as described by the NFC Forum digital specification (i.e. the platform * configuration one) while %sens_res least significant byte is byte 1. * @ats_len: length of Answer To Select in bytes * @ats: Answer To Select returned by an ISO 14443 Type A target upon activation */ struct nfc_target { u32 idx; u32 supported_protocols; u16 sens_res; u8 sel_res; u8 nfcid1_len; u8 nfcid1[NFC_NFCID1_MAXSIZE]; u8 nfcid2_len; u8 nfcid2[NFC_NFCID2_MAXSIZE]; u8 sensb_res_len; u8 sensb_res[NFC_SENSB_RES_MAXSIZE]; u8 sensf_res_len; u8 sensf_res[NFC_SENSF_RES_MAXSIZE]; u8 hci_reader_gate; u8 logical_idx; u8 is_iso15693; u8 iso15693_dsfid; u8 iso15693_uid[NFC_ISO15693_UID_MAXSIZE]; u8 ats_len; u8 ats[NFC_ATS_MAXSIZE]; }; /** * nfc_se - A structure for NFC accessible secure elements. * * @idx: The secure element index. User space will enable or * disable a secure element by its index. * @type: The secure element type. It can be SE_UICC or * SE_EMBEDDED. * @state: The secure element state, either enabled or disabled. * */ struct nfc_se { struct list_head list; u32 idx; u16 type; u16 state; }; /** * nfc_evt_transaction - A struct for NFC secure element event transaction. * * @aid: The application identifier triggering the event * * @aid_len: The application identifier length [5:16] * * @params: The application parameters transmitted during the transaction * * @params_len: The applications parameters length [0:255] * */ #define NFC_MIN_AID_LENGTH 5 #define NFC_MAX_AID_LENGTH 16 #define NFC_MAX_PARAMS_LENGTH 255 #define NFC_EVT_TRANSACTION_AID_TAG 0x81 #define NFC_EVT_TRANSACTION_PARAMS_TAG 0x82 struct nfc_evt_transaction { u32 aid_len; u8 aid[NFC_MAX_AID_LENGTH]; u8 params_len; u8 params[]; } __packed; struct nfc_genl_data { u32 poll_req_portid; struct mutex genl_data_mutex; }; struct nfc_vendor_cmd { __u32 vendor_id; __u32 subcmd; int (*doit)(struct nfc_dev *dev, void *data, size_t data_len); }; struct nfc_dev { int idx; u32 target_next_idx; struct nfc_target *targets; int n_targets; int targets_generation; struct device dev; bool dev_up; bool fw_download_in_progress; u8 rf_mode; bool polling; struct nfc_target *active_target; bool dep_link_up; struct nfc_genl_data genl_data; u32 supported_protocols; struct list_head secure_elements; int tx_headroom; int tx_tailroom; struct timer_list check_pres_timer; struct work_struct check_pres_work; bool shutting_down; struct rfkill *rfkill; const struct nfc_vendor_cmd *vendor_cmds; int n_vendor_cmds; const struct nfc_ops *ops; struct genl_info *cur_cmd_info; }; #define to_nfc_dev(_dev) container_of(_dev, struct nfc_dev, dev) extern const struct class nfc_class; struct nfc_dev *nfc_allocate_device(const struct nfc_ops *ops, u32 supported_protocols, int tx_headroom, int tx_tailroom); /** * nfc_free_device - free nfc device * * @dev: The nfc device to free */ static inline void nfc_free_device(struct nfc_dev *dev) { put_device(&dev->dev); } int nfc_register_device(struct nfc_dev *dev); void nfc_unregister_device(struct nfc_dev *dev); /** * nfc_set_parent_dev - set the parent device * * @nfc_dev: The nfc device whose parent is being set * @dev: The parent device */ static inline void nfc_set_parent_dev(struct nfc_dev *nfc_dev, struct device *dev) { nfc_dev->dev.parent = dev; } /** * nfc_set_drvdata - set driver specific data * * @dev: The nfc device * @data: Pointer to driver specific data */ static inline void nfc_set_drvdata(struct nfc_dev *dev, void *data) { dev_set_drvdata(&dev->dev, data); } /** * nfc_get_drvdata - get driver specific data * * @dev: The nfc device */ static inline void *nfc_get_drvdata(const struct nfc_dev *dev) { return dev_get_drvdata(&dev->dev); } /** * nfc_device_name - get the nfc device name * * @dev: The nfc device whose name to return */ static inline const char *nfc_device_name(const struct nfc_dev *dev) { return dev_name(&dev->dev); } struct sk_buff *nfc_alloc_send_skb(struct nfc_dev *dev, struct sock *sk, unsigned int flags, unsigned int size, unsigned int *err); struct sk_buff *nfc_alloc_recv_skb(unsigned int size, gfp_t gfp); int nfc_set_remote_general_bytes(struct nfc_dev *dev, const u8 *gt, u8 gt_len); u8 *nfc_get_local_general_bytes(struct nfc_dev *dev, size_t *gb_len); int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name, u32 result); int nfc_targets_found(struct nfc_dev *dev, struct nfc_target *targets, int ntargets); int nfc_target_lost(struct nfc_dev *dev, u32 target_idx); int nfc_dep_link_is_up(struct nfc_dev *dev, u32 target_idx, u8 comm_mode, u8 rf_mode); int nfc_tm_activated(struct nfc_dev *dev, u32 protocol, u8 comm_mode, const u8 *gb, size_t gb_len); int nfc_tm_deactivated(struct nfc_dev *dev); int nfc_tm_data_received(struct nfc_dev *dev, struct sk_buff *skb); void nfc_driver_failure(struct nfc_dev *dev, int err); int nfc_se_transaction(struct nfc_dev *dev, u8 se_idx, struct nfc_evt_transaction *evt_transaction); int nfc_se_connectivity(struct nfc_dev *dev, u8 se_idx); int nfc_add_se(struct nfc_dev *dev, u32 se_idx, u16 type); int nfc_remove_se(struct nfc_dev *dev, u32 se_idx); struct nfc_se *nfc_find_se(struct nfc_dev *dev, u32 se_idx); void nfc_send_to_raw_sock(struct nfc_dev *dev, struct sk_buff *skb, u8 payload_type, u8 direction); static inline int nfc_set_vendor_cmds(struct nfc_dev *dev, const struct nfc_vendor_cmd *cmds, int n_cmds) { if (dev->vendor_cmds || dev->n_vendor_cmds) return -EINVAL; dev->vendor_cmds = cmds; dev->n_vendor_cmds = n_cmds; return 0; } struct sk_buff *__nfc_alloc_vendor_cmd_reply_skb(struct nfc_dev *dev, enum nfc_attrs attr, u32 oui, u32 subcmd, int approxlen); int nfc_vendor_cmd_reply(struct sk_buff *skb); /** * nfc_vendor_cmd_alloc_reply_skb - allocate vendor command reply * @dev: nfc device * @oui: vendor oui * @approxlen: an upper bound of the length of the data that will * be put into the skb * * This function allocates and pre-fills an skb for a reply to * a vendor command. Since it is intended for a reply, calling * it outside of a vendor command's doit() operation is invalid. * * The returned skb is pre-filled with some identifying data in * a way that any data that is put into the skb (with skb_put(), * nla_put() or similar) will end up being within the * %NFC_ATTR_VENDOR_DATA attribute, so all that needs to be done * with the skb is adding data for the corresponding userspace tool * which can then read that data out of the vendor data attribute. * You must not modify the skb in any other way. * * When done, call nfc_vendor_cmd_reply() with the skb and return * its error code as the result of the doit() operation. * * Return: An allocated and pre-filled skb. %NULL if any errors happen. */ static inline struct sk_buff * nfc_vendor_cmd_alloc_reply_skb(struct nfc_dev *dev, u32 oui, u32 subcmd, int approxlen) { return __nfc_alloc_vendor_cmd_reply_skb(dev, NFC_ATTR_VENDOR_DATA, oui, subcmd, approxlen); } #endif /* __NET_NFC_H */
12 1 4 4 3 1 4 1 1 10 1 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_IP6_TUNNEL_H #define _NET_IP6_TUNNEL_H #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/if_tunnel.h> #include <linux/ip6_tunnel.h> #include <net/ip_tunnels.h> #include <net/dst_cache.h> #define IP6TUNNEL_ERR_TIMEO (30*HZ) /* capable of sending packets */ #define IP6_TNL_F_CAP_XMIT 0x10000 /* capable of receiving packets */ #define IP6_TNL_F_CAP_RCV 0x20000 /* determine capability on a per-packet basis */ #define IP6_TNL_F_CAP_PER_PACKET 0x40000 struct __ip6_tnl_parm { char name[IFNAMSIZ]; /* name of tunnel device */ int link; /* ifindex of underlying L2 interface */ __u8 proto; /* tunnel protocol */ __u8 encap_limit; /* encapsulation limit for tunnel */ __u8 hop_limit; /* hop limit for tunnel */ bool collect_md; __be32 flowinfo; /* traffic class and flowlabel for tunnel */ __u32 flags; /* tunnel flags */ struct in6_addr laddr; /* local tunnel end-point address */ struct in6_addr raddr; /* remote tunnel end-point address */ IP_TUNNEL_DECLARE_FLAGS(i_flags); IP_TUNNEL_DECLARE_FLAGS(o_flags); __be32 i_key; __be32 o_key; __u32 fwmark; __u32 index; /* ERSPAN type II index */ __u8 erspan_ver; /* ERSPAN version */ __u8 dir; /* direction */ __u16 hwid; /* hwid */ }; /* IPv6 tunnel */ struct ip6_tnl { struct ip6_tnl __rcu *next; /* next tunnel in list */ struct net_device *dev; /* virtual device associated with tunnel */ netdevice_tracker dev_tracker; struct net *net; /* netns for packet i/o */ struct __ip6_tnl_parm parms; /* tunnel configuration parameters */ struct flowi fl; /* flowi template for xmit */ struct dst_cache dst_cache; /* cached dst */ struct gro_cells gro_cells; int err_count; unsigned long err_time; /* These fields used only by GRE */ __u32 i_seqno; /* The last seen seqno */ atomic_t o_seqno; /* The last output seqno */ int hlen; /* tun_hlen + encap_hlen */ int tun_hlen; /* Precalculated header length */ int encap_hlen; /* Encap header length (FOU,GUE) */ struct ip_tunnel_encap encap; int mlink; }; struct ip6_tnl_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi6 *fl6); int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); }; #ifdef CONFIG_INET extern const struct ip6_tnl_encap_ops __rcu * ip6tun_encaps[MAX_IPTUN_ENCAP_OPS]; int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num); int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num); int ip6_tnl_encap_setup(struct ip6_tnl *t, struct ip_tunnel_encap *ipencap); static inline int ip6_encap_hlen(struct ip_tunnel_encap *e) { const struct ip6_tnl_encap_ops *ops; int hlen = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(ip6tun_encaps[e->type]); if (likely(ops && ops->encap_hlen)) hlen = ops->encap_hlen(e); rcu_read_unlock(); return hlen; } static inline int ip6_tnl_encap(struct sk_buff *skb, struct ip6_tnl *t, u8 *protocol, struct flowi6 *fl6) { const struct ip6_tnl_encap_ops *ops; int ret = -EINVAL; if (t->encap.type == TUNNEL_ENCAP_NONE) return 0; if (t->encap.type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(ip6tun_encaps[t->encap.type]); if (likely(ops && ops->build_header)) ret = ops->build_header(skb, &t->encap, protocol, fl6); rcu_read_unlock(); return ret; } /* Tunnel encapsulation limit destination sub-option */ struct ipv6_tlv_tnl_enc_lim { __u8 type; /* type-code for option */ __u8 length; /* option length */ __u8 encap_limit; /* tunnel encapsulation limit */ } __packed; int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); int ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error); int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, __u32 *pmtu, __u8 proto); __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw); __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); struct net *ip6_tnl_get_link_net(const struct net_device *dev); int ip6_tnl_get_iflink(const struct net_device *dev); int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu); static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, struct net_device *dev) { int pkt_len, err; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); pkt_len = skb->len - skb_inner_network_offset(skb); err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb); if (dev) { if (unlikely(net_xmit_eval(err))) pkt_len = -1; iptunnel_xmit_stats(dev, pkt_len); } } #endif #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 /* SPDX-License-Identifier: GPL-2.0 */ /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Manage send buffer * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #ifndef SMC_TX_H #define SMC_TX_H #include <linux/socket.h> #include <linux/types.h> #include "smc.h" #include "smc_cdc.h" static inline int smc_tx_prepared_sends(struct smc_connection *conn) { union smc_host_cursor sent, prep; smc_curs_copy(&sent, &conn->tx_curs_sent, conn); smc_curs_copy(&prep, &conn->tx_curs_prep, conn); return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); } void smc_tx_pending(struct smc_connection *conn); void smc_tx_work(struct work_struct *work); void smc_tx_init(struct smc_sock *smc); int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); int smc_tx_sndbuf_nonempty(struct smc_connection *conn); void smc_tx_sndbuf_nonfull(struct smc_sock *smc); void smc_tx_consumer_update(struct smc_connection *conn, bool force); int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len, u32 offset, int signal); #endif /* SMC_TX_H */
1 9 9 1 7 1 8 9 4 2 4 6 6 6 6 5 5 5 4 2 1 3 5 8 7 6 6 1 5 6 6 2 1 3 5 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 // SPDX-License-Identifier: GPL-2.0-or-later /* * udp_diag.c Module for monitoring UDP transport protocols sockets. * * Authors: Pavel Emelyanov, <xemul@parallels.com> */ #include <linux/module.h> #include <linux/inet_diag.h> #include <linux/udp.h> #include <net/udp.h> #include <net/udplite.h> #include <linux/sock_diag.h> static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, struct nlattr *bc, bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI, net_admin); } static int udp_dump_one(struct udp_table *tbl, struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { struct sk_buff *in_skb = cb->skb; int err; struct sock *sk = NULL; struct sk_buff *rep; struct net *net = sock_net(in_skb->sk); rcu_read_lock(); if (req->sdiag_family == AF_INET) /* src and dst are swapped for historical reasons */ sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) sk = __udp6_lib_lookup(net, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, req->id.idiag_if, 0, tbl, NULL); #endif if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; rcu_read_unlock(); err = -ENOENT; if (!sk) goto out_nosk; err = sock_diag_check_cookie(sk, req->id.idiag_cookie); if (err) goto out; err = -ENOMEM; rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + inet_diag_msg_attrs_size() + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, GFP_KERNEL); if (!rep) goto out; err = inet_sk_diag_fill(sk, NULL, rep, cb, req, 0, netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); goto out; } err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); out: if (sk) sock_put(sk); out_nosk: return err; } static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; struct nlattr *bc; cb_data = cb->data; bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) { struct udp_hslot *hslot = &table->hash[slot]; struct sock *sk; num = 0; if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); sk_for_each(sk, &hslot->head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) goto next; if (!(r->idiag_states & (1 << sk->sk_state))) goto next; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) goto next; if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next; if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } next: num++; } spin_unlock_bh(&hslot->lock); } done: cb->args[0] = slot; cb->args[1] = num; } static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { udp_dump(sock_net(cb->skb->sk)->ipv4.udp_table, skb, cb, r); } static int udp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { return udp_dump_one(sock_net(cb->skb->sk)->ipv4.udp_table, cb, req); } static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info) { r->idiag_rqueue = udp_rqueue_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); } #ifdef CONFIG_INET_DIAG_DESTROY static int __udp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req, struct udp_table *tbl) { struct net *net = sock_net(in_skb->sk); struct sock *sk; int err; rcu_read_lock(); if (req->sdiag_family == AF_INET) sk = __udp4_lib_lookup(net, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) sk = __udp4_lib_lookup(net, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); else sk = __udp6_lib_lookup(net, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); } #endif else { rcu_read_unlock(); return -EINVAL; } if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; rcu_read_unlock(); if (!sk) return -ENOENT; if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { sock_put(sk); return -ENOENT; } err = sock_diag_destroy(sk, ECONNABORTED); sock_put(sk); return err; } static int udp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { return __udp_diag_destroy(in_skb, req, sock_net(in_skb->sk)->ipv4.udp_table); } static int udplite_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { return __udp_diag_destroy(in_skb, req, &udplite_table); } #endif static const struct inet_diag_handler udp_diag_handler = { .owner = THIS_MODULE, .dump = udp_diag_dump, .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = udp_diag_destroy, #endif }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { udp_dump(&udplite_table, skb, cb, r); } static int udplite_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { return udp_dump_one(&udplite_table, cb, req); } static const struct inet_diag_handler udplite_diag_handler = { .owner = THIS_MODULE, .dump = udplite_diag_dump, .dump_one = udplite_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = udplite_diag_destroy, #endif }; static int __init udp_diag_init(void) { int err; err = inet_diag_register(&udp_diag_handler); if (err) goto out; err = inet_diag_register(&udplite_diag_handler); if (err) goto out_lite; out: return err; out_lite: inet_diag_unregister(&udp_diag_handler); goto out; } static void __exit udp_diag_exit(void) { inet_diag_unregister(&udplite_diag_handler); inet_diag_unregister(&udp_diag_handler); } module_init(udp_diag_init); module_exit(udp_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("UDP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */);
9 2 2 1 4 4 3 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 // SPDX-License-Identifier: GPL-2.0-only /* * ebt_ip6 * * Authors: * Manohar Castelino <manohar.r.castelino@intel.com> * Kuo-Lang Tseng <kuo-lang.tseng@intel.com> * Jan Engelhardt <jengelh@medozas.de> * * Summary: * This is just a modification of the IPv4 code written by * Bart De Schuymer <bdschuym@pandora.be> * with the changes required to support IPv6 * * Jan, 2008 */ #include <linux/ipv6.h> #include <net/ipv6.h> #include <linux/in.h> #include <linux/module.h> #include <net/dsfield.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_ip6.h> union pkthdr { struct { __be16 src; __be16 dst; } tcpudphdr; struct { u8 type; u8 code; } icmphdr; }; static bool ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct ebt_ip6_info *info = par->matchinfo; const struct ipv6hdr *ih6; struct ipv6hdr _ip6h; const union pkthdr *pptr; union pkthdr _pkthdr; ih6 = skb_header_pointer(skb, 0, sizeof(_ip6h), &_ip6h); if (ih6 == NULL) return false; if ((info->bitmask & EBT_IP6_TCLASS) && NF_INVF(info, EBT_IP6_TCLASS, info->tclass != ipv6_get_dsfield(ih6))) return false; if (((info->bitmask & EBT_IP6_SOURCE) && NF_INVF(info, EBT_IP6_SOURCE, ipv6_masked_addr_cmp(&ih6->saddr, &info->smsk, &info->saddr))) || ((info->bitmask & EBT_IP6_DEST) && NF_INVF(info, EBT_IP6_DEST, ipv6_masked_addr_cmp(&ih6->daddr, &info->dmsk, &info->daddr)))) return false; if (info->bitmask & EBT_IP6_PROTO) { uint8_t nexthdr = ih6->nexthdr; __be16 frag_off; int offset_ph; offset_ph = ipv6_skip_exthdr(skb, sizeof(_ip6h), &nexthdr, &frag_off); if (offset_ph == -1) return false; if (NF_INVF(info, EBT_IP6_PROTO, info->protocol != nexthdr)) return false; if (!(info->bitmask & (EBT_IP6_DPORT | EBT_IP6_SPORT | EBT_IP6_ICMP6))) return true; /* min icmpv6 headersize is 4, so sizeof(_pkthdr) is ok. */ pptr = skb_header_pointer(skb, offset_ph, sizeof(_pkthdr), &_pkthdr); if (pptr == NULL) return false; if (info->bitmask & EBT_IP6_DPORT) { u16 dst = ntohs(pptr->tcpudphdr.dst); if (NF_INVF(info, EBT_IP6_DPORT, dst < info->dport[0] || dst > info->dport[1])) return false; } if (info->bitmask & EBT_IP6_SPORT) { u16 src = ntohs(pptr->tcpudphdr.src); if (NF_INVF(info, EBT_IP6_SPORT, src < info->sport[0] || src > info->sport[1])) return false; } if ((info->bitmask & EBT_IP6_ICMP6) && NF_INVF(info, EBT_IP6_ICMP6, pptr->icmphdr.type < info->icmpv6_type[0] || pptr->icmphdr.type > info->icmpv6_type[1] || pptr->icmphdr.code < info->icmpv6_code[0] || pptr->icmphdr.code > info->icmpv6_code[1])) return false; } return true; } static int ebt_ip6_mt_check(const struct xt_mtchk_param *par) { const struct ebt_entry *e = par->entryinfo; struct ebt_ip6_info *info = par->matchinfo; if (e->ethproto != htons(ETH_P_IPV6) || e->invflags & EBT_IPROTO) return -EINVAL; if (info->bitmask & ~EBT_IP6_MASK || info->invflags & ~EBT_IP6_MASK) return -EINVAL; if (info->bitmask & (EBT_IP6_DPORT | EBT_IP6_SPORT)) { if (info->invflags & EBT_IP6_PROTO) return -EINVAL; if (info->protocol != IPPROTO_TCP && info->protocol != IPPROTO_UDP && info->protocol != IPPROTO_UDPLITE && info->protocol != IPPROTO_SCTP && info->protocol != IPPROTO_DCCP) return -EINVAL; } if (info->bitmask & EBT_IP6_DPORT && info->dport[0] > info->dport[1]) return -EINVAL; if (info->bitmask & EBT_IP6_SPORT && info->sport[0] > info->sport[1]) return -EINVAL; if (info->bitmask & EBT_IP6_ICMP6) { if ((info->invflags & EBT_IP6_PROTO) || info->protocol != IPPROTO_ICMPV6) return -EINVAL; if (info->icmpv6_type[0] > info->icmpv6_type[1] || info->icmpv6_code[0] > info->icmpv6_code[1]) return -EINVAL; } return 0; } static struct xt_match ebt_ip6_mt_reg __read_mostly = { .name = "ip6", .revision = 0, .family = NFPROTO_BRIDGE, .match = ebt_ip6_mt, .checkentry = ebt_ip6_mt_check, .matchsize = sizeof(struct ebt_ip6_info), .me = THIS_MODULE, }; static int __init ebt_ip6_init(void) { return xt_register_match(&ebt_ip6_mt_reg); } static void __exit ebt_ip6_fini(void) { xt_unregister_match(&ebt