Total coverage: 99096 of 1847788 (6%)
// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>

#include <trace/events/cgroup.h>

static DEFINE_SPINLOCK(rstat_base_lock);

static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

/*
 * Determines whether a given css can participate in rstat.
 * css's that are cgroup::self use rstat for base stats.
 * Other css's associated with a subsystem use rstat only when
 * they define the ss->css_rstat_flush callback.
 */
static inline bool css_uses_rstat(struct cgroup_subsys_state *css)
{
	return css_is_self(css) || css->ss->css_rstat_flush != NULL;
}

static struct css_rstat_cpu *css_rstat_cpu(
		struct cgroup_subsys_state *css, int cpu)
{
	return per_cpu_ptr(css->rstat_cpu, cpu);
}

static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu(
		struct cgroup *cgrp, int cpu)
{
	return per_cpu_ptr(cgrp->rstat_base_cpu, cpu);
}

static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
{
	if (ss)
		return &ss->rstat_ss_lock;

	return &rstat_base_lock;
}

static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
{
	if (ss)
		return per_cpu_ptr(ss->lhead, cpu);

	return per_cpu_ptr(&rstat_backlog_list, cpu);
}

/**
 * css_rstat_updated - keep track of updated rstat_cpu
 * @css: target cgroup subsystem state
 * @cpu: cpu on which rstat_cpu was updated
 *
 * Atomically inserts the css in the ss's llist for the given cpu. This is
 * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist
 * will be processed at flush time to create the update tree.
 *
 * NOTE: if the user needs the guarantee that the updater either adds itself
 * to the lockless list or the concurrent flusher flushes its updated stats, a
 * memory barrier is needed before the call to css_rstat_updated() i.e. a
 * barrier after updating the per-cpu stats and before calling
 * css_rstat_updated().
 */
__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
{
	struct llist_head *lhead;
	struct css_rstat_cpu *rstatc;
	struct css_rstat_cpu __percpu *rstatc_pcpu;
	struct llist_node *self;

	/*
	 * Since bpf programs can call this function, prevent access to
	 * uninitialized rstat pointers.
	 */
	if (!css_uses_rstat(css))
		return;

	lockdep_assert_preemption_disabled();

	/*
	 * For archs without nmi safe cmpxchg or percpu ops support, ignore
	 * the requests from nmi context.
	 */
	if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
	     !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi())
		return;

	rstatc = css_rstat_cpu(css, cpu);

	/*
	 * If already on list return. This check is racy and smp_mb() is needed
	 * to pair it with the smp_mb() in css_process_update_tree() if the
	 * guarantee that the updated stats are visible to a concurrent flusher
	 * is needed.
	 */
	if (llist_on_list(&rstatc->lnode))
		return;

	/*
	 * This function can be reentered by irqs and nmis for the same cgroup
	 * and may try to insert the same per-cpu lnode into the llist. Note
	 * that llist_add() does not protect against such scenarios.
	 *
	 * To protect against such stacked contexts of irqs/nmis, we use the
	 * fact that lnode points to itself when not on a list and then use
	 * this_cpu_cmpxchg() to atomically set to NULL to select the winner
	 * which will call llist_add(). The losers can assume the insertion is
	 * successful and the winner will eventually add the per-cpu lnode to
	 * the llist.
	 */
	self = &rstatc->lnode;
	rstatc_pcpu = css->rstat_cpu;
	if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self)
		return;

	lhead = ss_lhead_cpu(css->ss, cpu);
	llist_add(&rstatc->lnode, lhead);
}

static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu)
{
	/* put @css and all ancestors on the corresponding updated lists */
	while (true) {
		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
		struct cgroup_subsys_state *parent = css->parent;
		struct css_rstat_cpu *prstatc;

		/*
		 * Both additions and removals are bottom-up.
		 * If a cgroup is already in the tree, all ancestors are.
		 */
		if (rstatc->updated_next)
			break;

		/* Root has no parent to link it to, but mark it busy */
		if (!parent) {
			rstatc->updated_next = css;
			break;
		}

		prstatc = css_rstat_cpu(parent, cpu);
		rstatc->updated_next = prstatc->updated_children;
		prstatc->updated_children = css;

		css = parent;
	}
}

static void css_process_update_tree(struct cgroup_subsys *ss, int cpu)
{
	struct llist_head *lhead = ss_lhead_cpu(ss, cpu);
	struct llist_node *lnode;

	while ((lnode = llist_del_first_init(lhead))) {
		struct css_rstat_cpu *rstatc;

		/*
		 * smp_mb() is needed here (more specifically in between
		 * init_llist_node() and per-cpu stats flushing) if the
		 * guarantee is required by a rstat user where either the
		 * updater should add itself to the lockless list or the
		 * flusher flushes the stats updated by an updater who has
		 * observed that it is already on the list. The
		 * corresponding barrier pair for this one should be before
		 * css_rstat_updated() by the user.
		 *
		 * For now, there aren't any such users, so not adding the
		 * barrier here but if such a use-case arises, please add
		 * smp_mb() here.
		 */
		rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
		__css_process_update_tree(rstatc->owner, cpu);
	}
}

/**
 * css_rstat_push_children - push children css's into the given list
 * @head: current head of the list (= subtree root)
 * @child: first child of the root
 * @cpu: target cpu
 * Return: A new singly linked list of css's to be flushed
 *
 * Iteratively traverse down the css_rstat_cpu updated tree level by
 * level and push all the parents first before their next level children
 * into a singly linked list via the rstat_flush_next pointer built from the
 * tail backward like "pushing" css's into a stack. The root is pushed by
 * the caller.
 */
static struct cgroup_subsys_state *css_rstat_push_children(
		struct cgroup_subsys_state *head,
		struct cgroup_subsys_state *child, int cpu)
{
	struct cgroup_subsys_state *cnext = child;	/* Next head of child css level */
	struct cgroup_subsys_state *ghead = NULL;	/* Head of grandchild css level */
	struct cgroup_subsys_state *parent, *grandchild;
	struct css_rstat_cpu *crstatc;

	child->rstat_flush_next = NULL;

	/*
	 * The subsystem rstat lock must be held for the whole duration from
	 * here as the rstat_flush_next list is being constructed to when
	 * it is consumed later in css_rstat_flush().
*/ lockdep_assert_held(ss_rstat_lock(head->ss)); /* * Notation: -> updated_next pointer * => rstat_flush_next pointer * * Assuming the following sample updated_children lists: * P: C1 -> C2 -> P * C1: G11 -> G12 -> C1 * C2: G21 -> G22 -> C2 * * After 1st iteration: * head => C2 => C1 => NULL * ghead => G21 => G11 => NULL * * After 2nd iteration: * head => G12 => G11 => G22 => G21 => C2 => C1 => NULL */ next_level: while (cnext) { child = cnext; cnext = child->rstat_flush_next; parent = child->parent; /* updated_next is parent cgroup terminated if !NULL */ while (child != parent) { child->rstat_flush_next = head; head = child; crstatc = css_rstat_cpu(child, cpu); grandchild = crstatc->updated_children; if (grandchild != child) { /* Push the grand child to the next level */ crstatc->updated_children = child; grandchild->rstat_flush_next = ghead; ghead = grandchild; } child = crstatc->updated_next; crstatc->updated_next = NULL; } } if (ghead) { cnext = ghead; ghead = NULL; goto next_level; } return head; } /** * css_rstat_updated_list - build a list of updated css's to be flushed * @root: root of the css subtree to traverse * @cpu: target cpu * Return: A singly linked list of css's to be flushed * * Walks the updated rstat_cpu tree on @cpu from @root. During traversal, * each returned css is unlinked from the updated tree. * * The only ordering guarantee is that, for a parent and a child pair * covered by a given traversal, the child is before its parent in * the list. * * Note that updated_children is self terminated and points to a list of * child css's if not empty. Whereas updated_next is like a sibling link * within the children list and terminated by the parent css. An exception * here is the css root whose updated_next can be self terminated. */ static struct cgroup_subsys_state *css_rstat_updated_list( struct cgroup_subsys_state *root, int cpu) { struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu); struct cgroup_subsys_state *head = NULL, *parent, *child; css_process_update_tree(root->ss, cpu); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) return NULL; /* * Unlink @root from its parent. As the updated_children list is * singly linked, we have to walk it to find the removal point. */ parent = root->parent; if (parent) { struct css_rstat_cpu *prstatc; struct cgroup_subsys_state **nextp; prstatc = css_rstat_cpu(parent, cpu); nextp = &prstatc->updated_children; while (*nextp != root) { struct css_rstat_cpu *nrstatc; nrstatc = css_rstat_cpu(*nextp, cpu); WARN_ON_ONCE(*nextp == parent); nextp = &nrstatc->updated_next; } *nextp = rstatc->updated_next; } rstatc->updated_next = NULL; /* Push @root to the list first before pushing the children */ head = root; root->rstat_flush_next = NULL; child = rstatc->updated_children; rstatc->updated_children = root; if (child != root) head = css_rstat_push_children(head, child, cpu); return head; } /* * A hook for bpf stat collectors to attach to and flush their stats. * Together with providing bpf kfuncs for css_rstat_updated() and * css_rstat_flush(), this enables a complete workflow where bpf progs that * collect cgroup stats can integrate with rstat for efficient flushing. * * A static noinline declaration here could cause the compiler to optimize away * the function. A global noinline declaration will keep the definition, but may * optimize away the callsite. Therefore, __weak is needed to ensure that the * call is still emitted, by telling the compiler that we don't know what the * function might eventually be. 
 */
__bpf_hook_start();
__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
				     struct cgroup *parent, int cpu)
{
}
__bpf_hook_end();

/*
 * Helper functions for locking.
 *
 * This makes it easier to diagnose locking issues and contention in
 * production environments. The parameter @cpu_in_loop indicates that the
 * lock was released and re-taken when collecting data from the CPUs. The
 * value -1 is used when obtaining the main lock, otherwise this is the CPU
 * number processed last.
 */
static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
				    int cpu_in_loop)
	__acquires(ss_rstat_lock(css->ss))
{
	struct cgroup *cgrp = css->cgroup;
	spinlock_t *lock;
	bool contended;

	lock = ss_rstat_lock(css->ss);
	contended = !spin_trylock_irq(lock);
	if (contended) {
		trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
		spin_lock_irq(lock);
	}
	trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
}

static inline void __css_rstat_unlock(struct cgroup_subsys_state *css,
				      int cpu_in_loop)
	__releases(ss_rstat_lock(css->ss))
{
	struct cgroup *cgrp = css->cgroup;
	spinlock_t *lock;

	lock = ss_rstat_lock(css->ss);
	trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
	spin_unlock_irq(lock);
}

/**
 * css_rstat_flush - flush stats in @css's rstat subtree
 * @css: target cgroup subsystem state
 *
 * Collect all per-cpu stats in @css's subtree into the global counters
 * and propagate them upwards. After this function returns, all rstat
 * nodes in the subtree have up-to-date ->stat.
 *
 * This also gets all rstat nodes in the subtree including @css off the
 * ->updated_children lists.
 *
 * This function may block.
 */
__bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
{
	int cpu;
	bool is_self = css_is_self(css);

	/*
	 * Since bpf programs can call this function, prevent access to
	 * uninitialized rstat pointers.
*/ if (!css_uses_rstat(css)) return; might_sleep(); for_each_possible_cpu(cpu) { struct cgroup_subsys_state *pos; /* Reacquire for each CPU to avoid disabling IRQs too long */ __css_rstat_lock(css, cpu); pos = css_rstat_updated_list(css, cpu); for (; pos; pos = pos->rstat_flush_next) { if (is_self) { cgroup_base_stat_flush(pos->cgroup, cpu); bpf_rstat_flush(pos->cgroup, cgroup_parent(pos->cgroup), cpu); } else pos->ss->css_rstat_flush(pos, cpu); } __css_rstat_unlock(css, cpu); if (!cond_resched()) cpu_relax(); } } int css_rstat_init(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; int cpu; bool is_self = css_is_self(css); if (is_self) { /* the root cgrp has rstat_base_cpu preallocated */ if (!cgrp->rstat_base_cpu) { cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu); if (!cgrp->rstat_base_cpu) return -ENOMEM; } } else if (css->ss->css_rstat_flush == NULL) return 0; /* the root cgrp's self css has rstat_cpu preallocated */ if (!css->rstat_cpu) { css->rstat_cpu = alloc_percpu(struct css_rstat_cpu); if (!css->rstat_cpu) { if (is_self) free_percpu(cgrp->rstat_base_cpu); return -ENOMEM; } } /* ->updated_children list is self terminated */ for_each_possible_cpu(cpu) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); rstatc->owner = rstatc->updated_children = css; init_llist_node(&rstatc->lnode); if (is_self) { struct cgroup_rstat_base_cpu *rstatbc; rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); u64_stats_init(&rstatbc->bsync); } } return 0; } void css_rstat_exit(struct cgroup_subsys_state *css) { int cpu; if (!css_uses_rstat(css)) return; if (!css->rstat_cpu) return; css_rstat_flush(css); /* sanity check */ for_each_possible_cpu(cpu) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); if (WARN_ON_ONCE(rstatc->updated_children != css) || WARN_ON_ONCE(rstatc->updated_next)) return; } if (css_is_self(css)) { struct cgroup *cgrp = css->cgroup; free_percpu(cgrp->rstat_base_cpu); cgrp->rstat_base_cpu = NULL; } free_percpu(css->rstat_cpu); css->rstat_cpu = NULL; } /** * ss_rstat_init - subsystem-specific rstat initialization * @ss: target subsystem * * If @ss is NULL, the static locks associated with the base stats * are initialized. If @ss is non-NULL, the subsystem-specific locks * are initialized. */ int __init ss_rstat_init(struct cgroup_subsys *ss) { int cpu; if (ss) { ss->lhead = alloc_percpu(struct llist_head); if (!ss->lhead) return -ENOMEM; } spin_lock_init(ss_rstat_lock(ss)); for_each_possible_cpu(cpu) init_llist_head(ss_lhead_cpu(ss, cpu)); return 0; } /* * Functions for cgroup basic resource statistics implemented on top of * rstat. 
*/ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, struct cgroup_base_stat *src_bstat) { dst_bstat->cputime.utime += src_bstat->cputime.utime; dst_bstat->cputime.stime += src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; #ifdef CONFIG_SCHED_CORE dst_bstat->forceidle_sum += src_bstat->forceidle_sum; #endif dst_bstat->ntime += src_bstat->ntime; } static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, struct cgroup_base_stat *src_bstat) { dst_bstat->cputime.utime -= src_bstat->cputime.utime; dst_bstat->cputime.stime -= src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime; #ifdef CONFIG_SCHED_CORE dst_bstat->forceidle_sum -= src_bstat->forceidle_sum; #endif dst_bstat->ntime -= src_bstat->ntime; } static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_rstat_base_cpu *prstatbc; struct cgroup_base_stat delta; unsigned seq; /* Root-level stats are sourced from system-wide CPU stats */ if (!parent) return; /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatbc->bsync); delta = rstatbc->bstat; } while (__u64_stats_fetch_retry(&rstatbc->bsync, seq)); /* propagate per-cpu delta to cgroup and per-cpu global statistics */ cgroup_base_stat_sub(&delta, &rstatbc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, &delta); cgroup_base_stat_add(&rstatbc->last_bstat, &delta); cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta); /* propagate cgroup and per-cpu global delta to parent (unless that's root) */ if (cgroup_parent(parent)) { delta = cgrp->bstat; cgroup_base_stat_sub(&delta, &cgrp->last_bstat); cgroup_base_stat_add(&parent->bstat, &delta); cgroup_base_stat_add(&cgrp->last_bstat, &delta); delta = rstatbc->subtree_bstat; prstatbc = cgroup_rstat_base_cpu(parent, cpu); cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat); cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta); cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta); } } static struct cgroup_rstat_base_cpu * cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags) { struct cgroup_rstat_base_cpu *rstatbc; rstatbc = get_cpu_ptr(cgrp->rstat_base_cpu); *flags = u64_stats_update_begin_irqsave(&rstatbc->bsync); return rstatbc; } static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, struct cgroup_rstat_base_cpu *rstatbc, unsigned long flags) { u64_stats_update_end_irqrestore(&rstatbc->bsync, flags); css_rstat_updated(&cgrp->self, smp_processor_id()); put_cpu_ptr(rstatbc); } void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) { struct cgroup_rstat_base_cpu *rstatbc; unsigned long flags; rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); rstatbc->bstat.cputime.sum_exec_runtime += delta_exec; cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); } void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec) { struct cgroup_rstat_base_cpu *rstatbc; unsigned long flags; rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); switch (index) { case CPUTIME_NICE: rstatbc->bstat.ntime += delta_exec; fallthrough; case CPUTIME_USER: rstatbc->bstat.cputime.utime += delta_exec; break; case CPUTIME_SYSTEM: case CPUTIME_IRQ: case CPUTIME_SOFTIRQ: rstatbc->bstat.cputime.stime += delta_exec; break; #ifdef CONFIG_SCHED_CORE case 
CPUTIME_FORCEIDLE: rstatbc->bstat.forceidle_sum += delta_exec; break; #endif default: break; } cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); } /* * compute the cputime for the root cgroup by getting the per cpu data * at a global level, then categorizing the fields in a manner consistent * with how it is done by __cgroup_account_cputime_field for each bit of * cpu time attributed to a cgroup. */ static void root_cgroup_cputime(struct cgroup_base_stat *bstat) { struct task_cputime *cputime = &bstat->cputime; int i; memset(bstat, 0, sizeof(*bstat)); for_each_possible_cpu(i) { struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; u64 user = 0; u64 sys = 0; kcpustat_cpu_fetch(&kcpustat, i); user += cpustat[CPUTIME_USER]; user += cpustat[CPUTIME_NICE]; cputime->utime += user; sys += cpustat[CPUTIME_SYSTEM]; sys += cpustat[CPUTIME_IRQ]; sys += cpustat[CPUTIME_SOFTIRQ]; cputime->stime += sys; cputime->sum_exec_runtime += user; cputime->sum_exec_runtime += sys; #ifdef CONFIG_SCHED_CORE bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE]; #endif bstat->ntime += cpustat[CPUTIME_NICE]; } } static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat) { #ifdef CONFIG_SCHED_CORE u64 forceidle_time = bstat->forceidle_sum; do_div(forceidle_time, NSEC_PER_USEC); seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time); #endif } void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct cgroup_base_stat bstat; if (cgroup_parent(cgrp)) { css_rstat_flush(&cgrp->self); __css_rstat_lock(&cgrp->self, -1); bstat = cgrp->bstat; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &bstat.cputime.utime, &bstat.cputime.stime); __css_rstat_unlock(&cgrp->self, -1); } else { root_cgroup_cputime(&bstat); } do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC); do_div(bstat.cputime.utime, NSEC_PER_USEC); do_div(bstat.cputime.stime, NSEC_PER_USEC); do_div(bstat.ntime, NSEC_PER_USEC); seq_printf(seq, "usage_usec %llu\n" "user_usec %llu\n" "system_usec %llu\n" "nice_usec %llu\n", bstat.cputime.sum_exec_runtime, bstat.cputime.utime, bstat.cputime.stime, bstat.ntime); cgroup_force_idle_show(seq, &bstat); } /* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */ BTF_KFUNCS_START(bpf_rstat_kfunc_ids) BTF_ID_FLAGS(func, css_rstat_updated) BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_rstat_kfunc_ids) static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = { .owner = THIS_MODULE, .set = &bpf_rstat_kfunc_ids, }; static int __init bpf_rstat_kfunc_init(void) { return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_rstat_kfunc_set); } late_initcall(bpf_rstat_kfunc_init);
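The kfunc registration above completes the BPF integration that the bpf_rstat_flush() hook comment describes: a tracing program can bump per-cgroup counters, mark the css as updated with css_rstat_updated(), and aggregate in an fentry program on bpf_rstat_flush() when the subtree is flushed. The following is a minimal illustrative sketch of that workflow, not part of rstat.c; it assumes vmlinux.h BTF types, and the map layout, program names and the tp_btf/cgroup_attach_task update site are hypothetical choices made only for this example.

/*
 * Illustrative only -- not part of rstat.c. A tracing BPF program that uses
 * the kfuncs registered above. Map and program names are hypothetical.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

/* kfuncs exported by the kernel (see bpf_rstat_kfunc_ids above) */
extern void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) __ksym;
extern void css_rstat_flush(struct cgroup_subsys_state *css) __ksym;

/* per-cpu event counts keyed by cgroup id (illustrative layout) */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
	__uint(max_entries, 1024);
	__type(key, u64);
	__type(value, u64);
} events SEC(".maps");

/* Count one event against @cgrp and mark its self css as updated so that a
 * later css_rstat_flush() on an ancestor walks down to it. */
static int record_event(struct cgroup *cgrp)
{
	u64 id = cgrp->kn->id;
	u64 one = 1, *cnt;

	cnt = bpf_map_lookup_elem(&events, &id);
	if (cnt)
		*cnt += 1;
	else
		bpf_map_update_elem(&events, &id, &one, BPF_NOEXIST);

	css_rstat_updated(&cgrp->self, bpf_get_smp_processor_id());
	return 0;
}

/* Hypothetical update site: count task attachments per cgroup. */
SEC("tp_btf/cgroup_attach_task")
int BPF_PROG(count_attach, struct cgroup *dst_cgrp, const char *path,
	     struct task_struct *task, bool threadgroup)
{
	return record_event(dst_cgrp);
}

/* Flush side: bpf_rstat_flush() is invoked once per updated cgroup and cpu
 * during a base-stat flush, so the per-cpu deltas recorded above can be
 * folded into a global aggregate here (aggregation map omitted for brevity). */
SEC("fentry/bpf_rstat_flush")
int BPF_PROG(collect, struct cgroup *cgrp, struct cgroup *parent, int cpu)
{
	return 0;
}

Any flush of an ancestor css, whether initiated by the kernel or by a sleepable BPF program calling css_rstat_flush(), then invokes the fentry hook for each updated cgroup and cpu before the update tree is torn down.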
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
*/ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/netdevice.h> #include <crypto/sha2.h> #include <crypto/utils.h> #include <net/sock.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> #include <net/protocol.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include <net/ip6_route.h> #include <net/transp_v6.h> #endif #include <net/mptcp.h> #include "protocol.h" #include "mib.h" #include <trace/events/mptcp.h> #include <trace/events/sock.h> static void mptcp_subflow_ops_undo_override(struct sock *ssk); static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, enum linux_mptcp_mib_field field) { MPTCP_INC_STATS(sock_net(req_to_sk(req)), field); } static void subflow_req_destructor(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); pr_debug("subflow_req=%p\n", subflow_req); if (subflow_req->msk) sock_put((struct sock *)subflow_req->msk); mptcp_token_destroy_request(req); } static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, void *hmac) { u8 msg[8]; put_unaligned_be32(nonce1, &msg[0]); put_unaligned_be32(nonce2, &msg[4]); mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); } static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk) { return mptcp_is_fully_established((void *)msk) && ((mptcp_pm_is_userspace(msk) && mptcp_userspace_pm_active(msk)) || READ_ONCE(msk->pm.accept_subflow)); } /* validate received token and create truncated hmac and nonce for SYN-ACK */ static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req) { struct mptcp_sock *msk = subflow_req->msk; u8 hmac[SHA256_DIGEST_SIZE]; get_random_bytes(&subflow_req->local_nonce, sizeof(u32)); subflow_generate_hmac(READ_ONCE(msk->local_key), READ_ONCE(msk->remote_key), subflow_req->local_nonce, subflow_req->remote_nonce, hmac); subflow_req->thmac = get_unaligned_be64(hmac); } static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_sock *msk; int local_id; msk = mptcp_token_get_sock(sock_net(req_to_sk(req)), subflow_req->token); if (!msk) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN); return NULL; } local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req); if (local_id < 0) { sock_put((struct sock *)msk); return NULL; } subflow_req->local_id = local_id; subflow_req->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)req); return msk; } static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); subflow_req->mp_capable = 0; subflow_req->mp_join = 0; subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener)); subflow_req->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk_listener)); subflow_req->msk = NULL; mptcp_token_init_request(req); } static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk) { return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport; } static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason) { struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (mpext) { memset(mpext, 0, sizeof(*mpext)); mpext->reset_reason = reason; } } static int subflow_reset_req_endp(struct request_sock *req, struct sk_buff *skb) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEENDPATTEMPT); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); return -EPERM; } /* Init 
mptcp request socket. * * Returns an error code if a JOIN has failed and a TCP reset * should be sent. */ static int subflow_check_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_options_received mp_opt; bool opt_mp_capable, opt_mp_join; pr_debug("subflow_req=%p, listener=%p\n", subflow_req, listener); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. */ if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) { subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); return -EINVAL; } #endif mptcp_get_options(skb, &mp_opt); opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYN); opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYN); if (opt_mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); if (unlikely(listener->pm_listener)) return subflow_reset_req_endp(req, skb); if (opt_mp_join) return 0; } else if (opt_mp_join) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); if (mp_opt.backup) SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNBACKUPRX); } else if (unlikely(listener->pm_listener)) { return subflow_reset_req_endp(req, skb); } if (opt_mp_capable && listener->request_mptcp) { int err, retries = MPTCP_TOKEN_MAX_RETRIES; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; again: do { get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key)); } while (subflow_req->local_key == 0); if (unlikely(req->syncookie)) { mptcp_crypto_key_sha(subflow_req->local_key, &subflow_req->token, &subflow_req->idsn); if (mptcp_token_exists(subflow_req->token)) { if (retries-- > 0) goto again; SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); } else { subflow_req->mp_capable = 1; } return 0; } err = mptcp_token_new_request(req); if (err == 0) subflow_req->mp_capable = 1; else if (retries-- > 0) goto again; else SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); } else if (opt_mp_join && listener->request_mptcp) { subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; subflow_req->mp_join = 1; subflow_req->backup = mp_opt.backup; subflow_req->remote_id = mp_opt.join_id; subflow_req->token = mp_opt.token; subflow_req->remote_nonce = mp_opt.nonce; subflow_req->msk = subflow_token_join_request(req); /* Can't fall back to TCP in this case. 
*/ if (!subflow_req->msk) { subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); return -EPERM; } if (subflow_use_different_sport(subflow_req->msk, sk_listener)) { pr_debug("syn inet_sport=%d %d\n", ntohs(inet_sk(sk_listener)->inet_sport), ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); return -EPERM; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX); } subflow_req_create_thmac(subflow_req); if (unlikely(req->syncookie)) { if (!mptcp_can_accept_new_subflow(subflow_req->msk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); return -EPERM; } subflow_init_req_cookie_join_save(subflow_req, skb); } pr_debug("token=%u, remote_nonce=%u msk=%p\n", subflow_req->token, subflow_req->remote_nonce, subflow_req->msk); } return 0; } int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_options_received mp_opt; bool opt_mp_capable, opt_mp_join; int err; subflow_init_req(req, sk_listener); mptcp_get_options(skb, &mp_opt); opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_ACK); opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK); if (opt_mp_capable && opt_mp_join) return -EINVAL; if (opt_mp_capable && listener->request_mptcp) { if (mp_opt.sndr_key == 0) return -EINVAL; subflow_req->local_key = mp_opt.rcvr_key; err = mptcp_token_new_request(req); if (err) return err; subflow_req->mp_capable = 1; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; } else if (opt_mp_join && listener->request_mptcp) { if (!mptcp_token_join_cookie_init_state(subflow_req, skb)) return -EINVAL; subflow_req->mp_join = 1; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; } return 0; } EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); static enum sk_rst_reason mptcp_get_rst_reason(const struct sk_buff *skb) { const struct mptcp_ext *mpext = mptcp_get_ext(skb); if (!mpext) return SK_RST_REASON_NOT_SPECIFIED; return sk_rst_convert_mptcp_reason(mpext->reset_reason); } static struct dst_entry *subflow_v4_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req, u32 tw_isn) { struct dst_entry *dst; int err; tcp_rsk(req)->is_mptcp = 1; subflow_init_req(req, sk); dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req, tw_isn); if (!dst) return NULL; err = subflow_check_req(req, sk, skb); if (err == 0) return dst; dst_release(dst); if (!req->syncookie) tcp_request_sock_ops.send_reset(sk, skb, mptcp_get_rst_reason(skb)); return NULL; } static void subflow_prep_synack(const struct sock *sk, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct inet_request_sock *ireq = inet_rsk(req); /* clear tstamp_ok, as needed depending on cookie */ if (foc && foc->len > -1) ireq->tstamp_ok = 0; if (synack_type == TCP_SYNACK_FASTOPEN) mptcp_fastopen_subflow_synack_set_params(subflow, req); } static int subflow_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { subflow_prep_synack(sk, 
req, foc, synack_type); return tcp_request_sock_ipv4_ops.send_synack(sk, dst, fl, req, foc, synack_type, syn_skb); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { subflow_prep_synack(sk, req, foc, synack_type); return tcp_request_sock_ipv6_ops.send_synack(sk, dst, fl, req, foc, synack_type, syn_skb); } static struct dst_entry *subflow_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req, u32 tw_isn) { struct dst_entry *dst; int err; tcp_rsk(req)->is_mptcp = 1; subflow_init_req(req, sk); dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req, tw_isn); if (!dst) return NULL; err = subflow_check_req(req, sk, skb); if (err == 0) return dst; dst_release(dst); if (!req->syncookie) tcp6_request_sock_ops.send_reset(sk, skb, mptcp_get_rst_reason(skb)); return NULL; } #endif /* validate received truncated hmac and create hmac for third ACK */ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow) { u8 hmac[SHA256_DIGEST_SIZE]; u64 thmac; subflow_generate_hmac(subflow->remote_key, subflow->local_key, subflow->remote_nonce, subflow->local_nonce, hmac); thmac = get_unaligned_be64(hmac); pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n", subflow, subflow->token, thmac, subflow->thmac); return thmac == subflow->thmac; } void mptcp_subflow_reset(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; /* mptcp_mp_fail_no_response() can reach here on an already closed * socket */ if (ssk->sk_state == TCP_CLOSE) return; /* must hold: tcp_done() could drop last reference on parent */ sock_hold(sk); mptcp_send_active_reset_reason(ssk); tcp_done(ssk); if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags)) mptcp_schedule_work(sk); sock_put(sk); } static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk) { return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport; } void __mptcp_sync_state(struct sock *sk, int state) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); struct sock *ssk = msk->first; subflow = mptcp_subflow_ctx(ssk); __mptcp_propagate_sndbuf(sk, ssk); if (!msk->rcvspace_init) mptcp_rcv_space_init(msk, ssk); if (sk->sk_state == TCP_SYN_SENT) { /* subflow->idsn is always available is TCP_SYN_SENT state, * even for the FASTOPEN scenarios */ WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->snd_nxt, msk->write_seq); mptcp_set_state(sk, state); sk->sk_state_change(sk); } } static void subflow_set_remote_key(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt) { /* active MPC subflow will reach here multiple times: * at subflow_finish_connect() time and at 4th ack time */ if (subflow->remote_key_valid) return; subflow->remote_key_valid = 1; subflow->remote_key = mp_opt->sndr_key; mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn); subflow->iasn++; /* for fallback's sake */ subflow->map_seq = subflow->iasn; WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->ack_seq, subflow->iasn); WRITE_ONCE(msk->can_ack, true); atomic64_set(&msk->rcv_wnd_sent, subflow->iasn); } static void mptcp_propagate_state(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow, const 
struct mptcp_options_received *mp_opt) { struct mptcp_sock *msk = mptcp_sk(sk); mptcp_data_lock(sk); if (mp_opt) { /* Options are available only in the non fallback cases * avoid updating rx path fields otherwise */ WRITE_ONCE(msk->snd_una, subflow->idsn + 1); WRITE_ONCE(msk->wnd_end, subflow->idsn + 1 + tcp_sk(ssk)->snd_wnd); subflow_set_remote_key(msk, subflow, mp_opt); } if (!sock_owned_by_user(sk)) { __mptcp_sync_state(sk, ssk->sk_state); } else { msk->pending_state = ssk->sk_state; __set_bit(MPTCP_SYNC_STATE, &msk->cb_flags); } mptcp_data_unlock(sk); } static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_options_received mp_opt; struct sock *parent = subflow->conn; struct mptcp_sock *msk; subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); /* be sure no special action on any packet other than syn-ack */ if (subflow->conn_finished) return; msk = mptcp_sk(parent); subflow->rel_write_seq = 1; subflow->conn_finished = 1; subflow->ssn_offset = TCP_SKB_CB(skb)->seq; pr_debug("subflow=%p synack seq=%x\n", subflow, subflow->ssn_offset); mptcp_get_options(skb, &mp_opt); if (subflow->request_mptcp) { if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) { if (!mptcp_try_fallback(sk, MPTCP_MIB_MPCAPABLEACTIVEFALLBACK)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FALLBACKFAILED); goto do_reset; } goto fallback; } if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD) WRITE_ONCE(msk->csum_enabled, true); if (mp_opt.deny_join_id0) WRITE_ONCE(msk->pm.remote_deny_join_id0, true); subflow->mp_capable = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); mptcp_finish_connect(sk); mptcp_active_enable(parent); mptcp_propagate_state(parent, sk, subflow, &mp_opt); } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYNACK)) { subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } subflow->backup = mp_opt.backup; subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; WRITE_ONCE(subflow->remote_id, mp_opt.join_id); pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d\n", subflow, subflow->thmac, subflow->remote_nonce, subflow->backup); if (!subflow_thmac_valid(subflow)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } if (!mptcp_finish_join(sk)) goto do_reset; subflow_generate_hmac(subflow->local_key, subflow->remote_key, subflow->local_nonce, subflow->remote_nonce, hmac); memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); if (subflow->backup) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKBACKUPRX); if (subflow_use_different_dport(msk, sk)) { pr_debug("synack inet_dport=%d %d\n", ntohs(inet_sk(sk)->inet_dport), ntohs(inet_sk(parent)->inet_dport)); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX); } } else if (mptcp_check_fallback(sk)) { /* It looks like MPTCP is blocked, while TCP is not */ if (subflow->mpc_drop) mptcp_active_disable(parent); fallback: mptcp_propagate_state(parent, sk, subflow, NULL); } return; do_reset: subflow->reset_transient = 0; mptcp_subflow_reset(sk); } static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id) { WARN_ON_ONCE(local_id < 0 || local_id > 255); WRITE_ONCE(subflow->local_id, local_id); } static int subflow_chk_local_id(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock 
*msk = mptcp_sk(subflow->conn); int err; if (likely(subflow->local_id >= 0)) return 0; err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk); if (err < 0) return err; subflow_set_local_id(subflow, err); subflow->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)sk); return 0; } static int subflow_rebuild_header(struct sock *sk) { int err = subflow_chk_local_id(sk); if (unlikely(err < 0)) return err; return inet_sk_rebuild_header(sk); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static int subflow_v6_rebuild_header(struct sock *sk) { int err = subflow_chk_local_id(sk); if (unlikely(err < 0)) return err; return inet6_sk_rebuild_header(sk); } #endif static struct request_sock_ops mptcp_subflow_v4_request_sock_ops __ro_after_init; static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init; static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); pr_debug("subflow=%p\n", subflow); /* Never answer to SYNs sent to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; return tcp_conn_request(&mptcp_subflow_v4_request_sock_ops, &subflow_request_sock_ipv4_ops, sk, skb); drop: tcp_listendrop(sk); return 0; } static void subflow_v4_req_destructor(struct request_sock *req) { subflow_req_destructor(req); tcp_request_sock_ops.destructor(req); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init; static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init; static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init; static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init; static struct proto tcpv6_prot_override __ro_after_init; static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); pr_debug("subflow=%p\n", subflow); if (skb->protocol == htons(ETH_P_IP)) return subflow_v4_conn_request(sk, skb); if (!ipv6_unicast_destination(skb)) goto drop; if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); return 0; } return tcp_conn_request(&mptcp_subflow_v6_request_sock_ops, &subflow_request_sock_ipv6_ops, sk, skb); drop: tcp_listendrop(sk); return 0; /* don't send reset */ } static void subflow_v6_req_destructor(struct request_sock *req) { subflow_req_destructor(req); tcp6_request_sock_ops.destructor(req); } #endif struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) { if (ops->family == AF_INET) ops = &mptcp_subflow_v4_request_sock_ops; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (ops->family == AF_INET6) ops = &mptcp_subflow_v6_request_sock_ops; #endif return inet_reqsk_alloc(ops, sk_listener, attach_listener); } EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc); /* validate hmac received in third ACK */ static bool subflow_hmac_valid(const struct mptcp_subflow_request_sock *subflow_req, const struct mptcp_options_received *mp_opt) { struct mptcp_sock *msk = subflow_req->msk; u8 hmac[SHA256_DIGEST_SIZE]; subflow_generate_hmac(READ_ONCE(msk->remote_key), READ_ONCE(msk->local_key), subflow_req->remote_nonce, subflow_req->local_nonce, hmac); return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN); } static void subflow_ulp_fallback(struct sock *sk, struct mptcp_subflow_context *old_ctx) { struct inet_connection_sock *icsk = 
inet_csk(sk); mptcp_subflow_tcp_fallback(sk, old_ctx); icsk->icsk_ulp_ops = NULL; rcu_assign_pointer(icsk->icsk_ulp_data, NULL); tcp_sk(sk)->is_mptcp = 0; mptcp_subflow_ops_undo_override(sk); } void mptcp_subflow_drop_ctx(struct sock *ssk) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); if (!ctx) return; list_del(&mptcp_subflow_ctx(ssk)->node); if (inet_csk(ssk)->icsk_ulp_ops) { subflow_ulp_fallback(ssk, ctx); if (ctx->conn) sock_put(ctx->conn); } kfree_rcu(ctx, rcu); } void __mptcp_subflow_fully_established(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt) { subflow_set_remote_key(msk, subflow, mp_opt); WRITE_ONCE(subflow->fully_established, true); WRITE_ONCE(msk->fully_established, true); } static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); struct mptcp_subflow_request_sock *subflow_req; struct mptcp_options_received mp_opt; bool fallback, fallback_is_fatal; enum sk_rst_reason reason; struct mptcp_sock *owner; struct sock *child; pr_debug("listener=%p, req=%p, conn=%p\n", listener, req, listener->conn); /* After child creation we must look for MPC even when options * are not parsed */ mp_opt.suboptions = 0; /* hopefully temporary handling for MP_JOIN+syncookie */ subflow_req = mptcp_subflow_rsk(req); fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join; fallback = !tcp_rsk(req)->is_mptcp; if (fallback) goto create_child; /* if the sk is MP_CAPABLE, we try to fetch the client key */ if (subflow_req->mp_capable) { /* we can receive and accept an in-window, out-of-order pkt, * which may not carry the MP_CAPABLE opt even on mptcp enabled * paths: always try to extract the peer key, and fallback * for packets missing it. * Even OoO DSS packets coming legitly after dropped or * reordered MPC will cause fallback, but we don't have other * options. */ mptcp_get_options(skb, &mp_opt); if (!(mp_opt.suboptions & (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_ACK))) fallback = true; } else if (subflow_req->mp_join) { mptcp_get_options(skb, &mp_opt); if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK)) fallback = true; } create_child: child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash, own_req); if (child && *own_req) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); tcp_rsk(req)->drop_req = false; /* we need to fallback on ctx allocation failure and on pre-reqs * checking above. In the latter scenario we additionally need * to reset the context to non MPTCP status. 
*/ if (!ctx || fallback) { if (fallback_is_fatal) { subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); goto dispose_child; } goto fallback; } /* ssk inherits options of listener sk */ ctx->setsockopt_seq = listener->setsockopt_seq; if (ctx->mp_capable) { ctx->conn = mptcp_sk_clone_init(listener->conn, &mp_opt, child, req); if (!ctx->conn) goto fallback; ctx->subflow_id = 1; owner = mptcp_sk(ctx->conn); if (mp_opt.deny_join_id0) WRITE_ONCE(owner->pm.remote_deny_join_id0, true); mptcp_pm_new_connection(owner, child, 1); /* with OoO packets we can reach here without ingress * mpc option */ if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) { mptcp_pm_fully_established(owner, child); ctx->pm_notified = 1; } } else if (ctx->mp_join) { owner = subflow_req->msk; if (!owner) { subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } if (!subflow_hmac_valid(subflow_req, &mp_opt)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } if (!mptcp_can_accept_new_subflow(owner)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } /* move the msk reference ownership to the subflow */ subflow_req->msk = NULL; ctx->conn = (struct sock *)owner; if (subflow_use_different_sport(owner, sk)) { pr_debug("ack inet_sport=%d %d\n", ntohs(inet_sk(sk)->inet_sport), ntohs(inet_sk((struct sock *)owner)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(owner, sk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX); } if (!mptcp_finish_join(child)) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(child); subflow_add_reset_reason(skb, subflow->reset_reason); goto dispose_child; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); tcp_rsk(req)->drop_req = true; } } /* check for expected invariant - should never trigger, just help * catching earlier subtle bugs */ WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp && (!mptcp_subflow_ctx(child) || !mptcp_subflow_ctx(child)->conn)); return child; dispose_child: mptcp_subflow_drop_ctx(child); tcp_rsk(req)->drop_req = true; inet_csk_prepare_for_destroy_sock(child); tcp_done(child); reason = mptcp_get_rst_reason(skb); req->rsk_ops->send_reset(sk, skb, reason); /* The last child reference will be released by the caller */ return child; fallback: if (fallback) SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); mptcp_subflow_drop_ctx(child); return child; } static struct inet_connection_sock_af_ops subflow_specific __ro_after_init; static struct proto tcp_prot_override __ro_after_init; enum mapping_status { MAPPING_OK, MAPPING_INVALID, MAPPING_EMPTY, MAPPING_DATA_FIN, MAPPING_DUMMY, MAPPING_BAD_CSUM, MAPPING_NODSS }; static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) { pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d\n", ssn, subflow->map_subflow_seq, subflow->map_data_len); } static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); unsigned int skb_consumed; skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq; if (unlikely(skb_consumed >= skb->len)) { DEBUG_NET_WARN_ON_ONCE(1); return true; } return skb->len - skb_consumed <= subflow->map_data_len - mptcp_subflow_get_map_offset(subflow); } static bool validate_mapping(struct sock *ssk, struct sk_buff 
*skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; if (unlikely(before(ssn, subflow->map_subflow_seq))) { /* Mapping covers data later in the subflow stream, * currently unsupported. */ dbg_bad_map(subflow, ssn); return false; } if (unlikely(!before(ssn, subflow->map_subflow_seq + subflow->map_data_len))) { /* Mapping does covers past subflow data, invalid */ dbg_bad_map(subflow, ssn); return false; } return true; } static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb, bool csum_reqd) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); u32 offset, seq, delta; __sum16 csum; int len; if (!csum_reqd) return MAPPING_OK; /* mapping already validated on previous traversal */ if (subflow->map_csum_len == subflow->map_data_len) return MAPPING_OK; /* traverse the receive queue, ensuring it contains a full * DSS mapping and accumulating the related csum. * Preserve the accoumlate csum across multiple calls, to compute * the csum only once */ delta = subflow->map_data_len - subflow->map_csum_len; for (;;) { seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len; offset = seq - TCP_SKB_CB(skb)->seq; /* if the current skb has not been accounted yet, csum its contents * up to the amount covered by the current DSS */ if (offset < skb->len) { __wsum csum; len = min(skb->len - offset, delta); csum = skb_checksum(skb, offset, len, 0); subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum, subflow->map_csum_len); delta -= len; subflow->map_csum_len += len; } if (delta == 0) break; if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) { /* if this subflow is closed, the partial mapping * will be never completed; flush the pending skbs, so * that subflow_sched_work_if_closed() can kick in */ if (unlikely(ssk->sk_state == TCP_CLOSE)) while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); /* not enough data to validate the csum */ return MAPPING_EMPTY; } /* the DSS mapping for next skbs will be validated later, * when a get_mapping_status call will process such skb */ skb = skb->next; } /* note that 'map_data_len' accounts only for the carried data, does * not include the eventual seq increment due to the data fin, * while the pseudo header requires the original DSS data len, * including that */ csum = __mptcp_make_csum(subflow->map_seq, subflow->map_subflow_seq, subflow->map_data_len + subflow->map_data_fin, subflow->map_data_csum); if (unlikely(csum)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); return MAPPING_BAD_CSUM; } subflow->valid_csum_seen = 1; return MAPPING_OK; } static enum mapping_status get_mapping_status(struct sock *ssk, struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool csum_reqd = READ_ONCE(msk->csum_enabled); struct mptcp_ext *mpext; struct sk_buff *skb; u16 data_len; u64 map_seq; skb = skb_peek(&ssk->sk_receive_queue); if (!skb) return MAPPING_EMPTY; if (mptcp_check_fallback(ssk)) return MAPPING_DUMMY; mpext = mptcp_get_ext(skb); if (!mpext || !mpext->use_map) { if (!subflow->map_valid && !skb->len) { /* the TCP stack deliver 0 len FIN pkt to the receive * queue, that is the only 0len pkts ever expected here, * and we can admit no mapping only for 0 len pkts */ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) WARN_ONCE(1, "0len seq %d:%d flags %x", TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, TCP_SKB_CB(skb)->tcp_flags); sk_eat_skb(ssk, skb); return MAPPING_EMPTY; } /* If the required 
DSS has likely been dropped by a middlebox */ if (!subflow->map_valid) return MAPPING_NODSS; goto validate_seq; } trace_get_mapping_status(mpext); data_len = mpext->data_len; if (data_len == 0) { pr_debug("infinite mapping received\n"); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); return MAPPING_INVALID; } if (mpext->data_fin == 1) { u64 data_fin_seq; if (data_len == 1) { bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq, mpext->dsn64); pr_debug("DATA_FIN with no payload seq=%llu\n", mpext->data_seq); if (subflow->map_valid) { /* A DATA_FIN might arrive in a DSS * option before the previous mapping * has been fully consumed. Continue * handling the existing mapping. */ skb_ext_del(skb, SKB_EXT_MPTCP); return MAPPING_OK; } if (updated) mptcp_schedule_work((struct sock *)msk); return MAPPING_DATA_FIN; } data_fin_seq = mpext->data_seq + data_len - 1; /* If mpext->data_seq is a 32-bit value, data_fin_seq must also * be limited to 32 bits. */ if (!mpext->dsn64) data_fin_seq &= GENMASK_ULL(31, 0); mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64); pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d\n", data_fin_seq, mpext->dsn64); /* Adjust for DATA_FIN using 1 byte of sequence space */ data_len--; } map_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), mpext->data_seq, mpext->dsn64); WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64); if (subflow->map_valid) { /* Allow replacing only with an identical map */ if (subflow->map_seq == map_seq && subflow->map_subflow_seq == mpext->subflow_seq && subflow->map_data_len == data_len && subflow->map_csum_reqd == mpext->csum_reqd) { skb_ext_del(skb, SKB_EXT_MPTCP); goto validate_csum; } /* If this skb data are fully covered by the current mapping, * the new map would need caching, which is not supported */ if (skb_is_fully_mapped(ssk, skb)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH); return MAPPING_INVALID; } /* will validate the next map after consuming the current one */ goto validate_csum; } subflow->map_seq = map_seq; subflow->map_subflow_seq = mpext->subflow_seq; subflow->map_data_len = data_len; subflow->map_valid = 1; subflow->map_data_fin = mpext->data_fin; subflow->mpc_map = mpext->mpc_map; subflow->map_csum_reqd = mpext->csum_reqd; subflow->map_csum_len = 0; subflow->map_data_csum = csum_unfold(mpext->csum); /* Cfr RFC 8684 Section 3.3.0 */ if (unlikely(subflow->map_csum_reqd != csum_reqd)) return MAPPING_INVALID; pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n", subflow->map_seq, subflow->map_subflow_seq, subflow->map_data_len, subflow->map_csum_reqd, subflow->map_data_csum); validate_seq: /* we revalidate valid mapping on new skb, because we must ensure * the current skb is completely covered by the available mapping */ if (!validate_mapping(ssk, skb)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH); return MAPPING_INVALID; } skb_ext_del(skb, SKB_EXT_MPTCP); validate_csum: return validate_data_csum(ssk, skb, csum_reqd); } static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, u64 limit) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; struct tcp_sock *tp = tcp_sk(ssk); u32 offset, incr, avail_len; offset = tp->copied_seq - TCP_SKB_CB(skb)->seq; if (WARN_ON_ONCE(offset > skb->len)) goto out; avail_len = skb->len - offset; incr = limit >= avail_len ? 
avail_len + fin : limit; pr_debug("discarding=%d len=%d offset=%d seq=%d\n", incr, skb->len, offset, subflow->map_subflow_seq); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA); tcp_sk(ssk)->copied_seq += incr; out: if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq)) sk_eat_skb(ssk, skb); if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) subflow->map_valid = 0; } static bool subflow_is_done(const struct sock *sk) { return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; } /* sched mptcp worker for subflow cleanup if no more data is pending */ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) { const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; if (likely(ssk->sk_state != TCP_CLOSE && (ssk->sk_state != TCP_CLOSE_WAIT || inet_sk_state_load(sk) != TCP_ESTABLISHED))) return; if (!skb_queue_empty(&ssk->sk_receive_queue)) return; if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) mptcp_schedule_work(sk); /* when the fallback subflow closes the rx side, trigger a 'dummy' * ingress data fin, so that the msk state will follow along */ if (__mptcp_check_fallback(msk) && subflow_is_done(ssk) && msk->first == ssk && mptcp_update_rcv_data_fin(msk, subflow->map_seq + subflow->map_data_len, true)) mptcp_schedule_work(sk); } static bool mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); unsigned long fail_tout; /* we are really failing, prevent any later subflow join */ spin_lock_bh(&msk->fallback_lock); if (!msk->allow_infinite_fallback) { spin_unlock_bh(&msk->fallback_lock); return false; } msk->allow_subflows = false; spin_unlock_bh(&msk->fallback_lock); /* graceful failure can happen only on the MPC subflow */ if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first))) return false; /* since the close timeout take precedence on the fail one, * no need to start the latter when the first is already set */ if (sock_flag((struct sock *)msk, SOCK_DEAD)) return true; /* we don't need extreme accuracy here, use a zero fail_tout as special * value meaning no fail timeout at all; */ fail_tout = jiffies + TCP_RTO_MAX; if (!fail_tout) fail_tout = 1; WRITE_ONCE(subflow->fail_tout, fail_tout); tcp_send_ack(ssk); mptcp_reset_tout_timer(msk, subflow->fail_tout); return true; } static bool subflow_check_data_avail(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); enum mapping_status status; struct mptcp_sock *msk; struct sk_buff *skb; if (!skb_peek(&ssk->sk_receive_queue)) WRITE_ONCE(subflow->data_avail, false); if (subflow->data_avail) return true; msk = mptcp_sk(subflow->conn); for (;;) { u64 ack_seq; u64 old_ack; status = get_mapping_status(ssk, msk); trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue)); if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY || status == MAPPING_BAD_CSUM || status == MAPPING_NODSS)) goto fallback; if (status != MAPPING_OK) goto no_data; skb = skb_peek(&ssk->sk_receive_queue); if (WARN_ON_ONCE(!skb)) goto no_data; if (unlikely(!READ_ONCE(msk->can_ack))) goto fallback; old_ack = READ_ONCE(msk->ack_seq); ack_seq = mptcp_subflow_get_mapped_dsn(subflow); pr_debug("msk ack_seq=%llx subflow ack_seq=%llx\n", old_ack, ack_seq); if (unlikely(before64(ack_seq, old_ack))) { mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq); continue; } WRITE_ONCE(subflow->data_avail, true); break; } return true; no_data: 
subflow_sched_work_if_closed(msk, ssk); return false; fallback: if (!__mptcp_check_fallback(msk)) { /* RFC 8684 section 3.7. */ if (status == MAPPING_BAD_CSUM && (subflow->mp_join || subflow->valid_csum_seen)) { subflow->send_mp_fail = 1; if (!mptcp_subflow_fail(msk, ssk)) { subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; goto reset; } WRITE_ONCE(subflow->data_avail, true); return true; } if (!mptcp_try_fallback(ssk, MPTCP_MIB_DSSFALLBACK)) { /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers */ subflow->reset_transient = 0; subflow->reset_reason = status == MAPPING_NODSS ? MPTCP_RST_EMIDDLEBOX : MPTCP_RST_EMPTCP; reset: WRITE_ONCE(ssk->sk_err, EBADMSG); tcp_set_state(ssk, TCP_CLOSE); while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); mptcp_send_active_reset_reason(ssk); WRITE_ONCE(subflow->data_avail, false); return false; } } skb = skb_peek(&ssk->sk_receive_queue); subflow->map_valid = 1; subflow->map_data_len = skb->len; subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; subflow->map_seq = __mptcp_expand_seq(subflow->map_seq, subflow->iasn + TCP_SKB_CB(skb)->seq - subflow->ssn_offset - 1); WRITE_ONCE(subflow->data_avail, true); return true; } bool mptcp_subflow_data_available(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* check if current mapping is still valid */ if (subflow->map_valid && mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { subflow->map_valid = 0; WRITE_ONCE(subflow->data_avail, false); pr_debug("Done with mapping: seq=%u data_len=%u\n", subflow->map_subflow_seq, subflow->map_data_len); } return subflow_check_data_avail(sk); } /* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy, * not the ssk one. * * In mptcp, rwin is about the mptcp-level connection data. * * Data that is still on the ssk rx queue can thus be ignored, * as far as mptcp peer is concerned that data is still inflight. * DSS ACK is updated when skb is moved to the mptcp rx queue. */ void mptcp_space(const struct sock *ssk, int *space, int *full_space) { const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); const struct sock *sk = subflow->conn; *space = __mptcp_space(sk); *full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } static void subflow_error_report(struct sock *ssk) { struct sock *sk = mptcp_subflow_ctx(ssk)->conn; /* bail early if this is a no-op, so that we avoid introducing a * problematic lockdep dependency between TCP accept queue lock * and msk socket spinlock */ if (!sk->sk_socket) return; mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_error_report(sk); else __set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } static void subflow_data_ready(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); u16 state = 1 << inet_sk_state_load(sk); struct sock *parent = subflow->conn; struct mptcp_sock *msk; trace_sk_data_ready(sk); msk = mptcp_sk(parent); if (state & TCPF_LISTEN) { /* MPJ subflow are removed from accept queue before reaching here, * avoid stray wakeups */ if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue)) return; parent->sk_data_ready(parent); return; } WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && !subflow->mp_join && !(state & TCPF_CLOSE)); if (mptcp_subflow_data_available(sk)) { mptcp_data_ready(parent, sk); /* subflow-level lowat test are not relevant. 
* respect the msk-level threshold eventually mandating an immediate ack */ if (mptcp_data_avail(msk) < parent->sk_rcvlowat && (tcp_sk(sk)->rcv_nxt - tcp_sk(sk)->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss) inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } else if (unlikely(sk->sk_err)) { subflow_error_report(sk); } } static void subflow_write_space(struct sock *ssk) { struct sock *sk = mptcp_subflow_ctx(ssk)->conn; mptcp_propagate_sndbuf(sk, ssk); mptcp_write_space(sk); } static const struct inet_connection_sock_af_ops * subflow_default_af_ops(struct sock *sk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (sk->sk_family == AF_INET6) return &subflow_v6_specific; #endif return &subflow_specific; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_connection_sock_af_ops *target; target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d\n", subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); if (likely(icsk->icsk_af_ops == target)) return; subflow->icsk_af_ops = icsk->icsk_af_ops; icsk->icsk_af_ops = target; } #endif void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family) { memset(addr, 0, sizeof(*addr)); addr->ss_family = family; if (addr->ss_family == AF_INET) { struct sockaddr_in *in_addr = (struct sockaddr_in *)addr; if (info->family == AF_INET) in_addr->sin_addr = info->addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (ipv6_addr_v4mapped(&info->addr6)) in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3]; #endif in_addr->sin_port = info->port; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->ss_family == AF_INET6) { struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr; if (info->family == AF_INET) ipv6_addr_set_v4mapped(info->addr.s_addr, &in6_addr->sin6_addr); else in6_addr->sin6_addr = info->addr6; in6_addr->sin6_port = info->port; } #endif } int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local, const struct mptcp_addr_info *remote) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; int local_id = local->addr.id; struct sockaddr_storage addr; int remote_id = remote->id; int err = -ENOTCONN; struct socket *sf; struct sock *ssk; u32 remote_token; int addrlen; /* The userspace PM sent the request too early? 
*/ if (!mptcp_is_fully_established(sk)) goto err_out; err = mptcp_subflow_create_socket(sk, local->addr.family, &sf); if (err) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCREATSKERR); pr_debug("msk=%p local=%d remote=%d create sock error: %d\n", msk, local_id, remote_id, err); goto err_out; } ssk = sf->sk; subflow = mptcp_subflow_ctx(ssk); do { get_random_bytes(&subflow->local_nonce, sizeof(u32)); } while (!subflow->local_nonce); /* if 'IPADDRANY', the ID will be set later, after the routing */ if (local->addr.family == AF_INET) { if (!local->addr.addr.s_addr) local_id = -1; #if IS_ENABLED(CONFIG_MPTCP_IPV6) } else if (sk->sk_family == AF_INET6) { if (ipv6_addr_any(&local->addr.addr6)) local_id = -1; #endif } if (local_id >= 0) subflow_set_local_id(subflow, local_id); subflow->remote_key_valid = 1; subflow->remote_key = READ_ONCE(msk->remote_key); subflow->local_key = READ_ONCE(msk->local_key); subflow->token = msk->token; mptcp_info2sockaddr(&local->addr, &addr, ssk->sk_family); addrlen = sizeof(struct sockaddr_in); #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (addr.ss_family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif ssk->sk_bound_dev_if = local->ifindex; err = kernel_bind(sf, (struct sockaddr_unsized *)&addr, addrlen); if (err) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXBINDERR); pr_debug("msk=%p local=%d remote=%d bind error: %d\n", msk, local_id, remote_id, err); goto failed; } mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL); pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d\n", msk, remote_token, local_id, remote_id); subflow->remote_token = remote_token; WRITE_ONCE(subflow->remote_id, remote_id); subflow->request_join = 1; subflow->request_bkup = !!(local->flags & MPTCP_PM_ADDR_FLAG_BACKUP); subflow->subflow_id = msk->subflow_id++; mptcp_info2sockaddr(remote, &addr, ssk->sk_family); sock_hold(ssk); list_add_tail(&subflow->node, &msk->conn_list); err = kernel_connect(sf, (struct sockaddr_unsized *)&addr, addrlen, O_NONBLOCK); if (err && err != -EINPROGRESS) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCONNECTERR); pr_debug("msk=%p local=%d remote=%d connect error: %d\n", msk, local_id, remote_id, err); goto failed_unlink; } MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTX); /* discard the subflow socket */ mptcp_sock_graft(ssk, sk->sk_socket); iput(SOCK_INODE(sf)); mptcp_stop_tout_timer(sk); return 0; failed_unlink: list_del(&subflow->node); sock_put(mptcp_subflow_tcp_sock(subflow)); failed: subflow->disposable = 1; sock_release(sf); err_out: /* we account subflows before the creation, and this failures will not * be caught by sk_state_change() */ mptcp_pm_close_subflow(msk); return err; } void __mptcp_inherit_memcg(struct sock *sk, struct sock *ssk, gfp_t gfp) { /* Only if the msk has been accepted already (and not orphaned).*/ if (!mem_cgroup_sockets_enabled || !sk->sk_socket) return; mem_cgroup_sk_inherit(sk, ssk); __sk_charge(ssk, gfp); } void __mptcp_inherit_cgrp_data(struct sock *sk, struct sock *ssk) { #ifdef CONFIG_SOCK_CGROUP_DATA struct sock_cgroup_data *sk_cd = &sk->sk_cgrp_data, *ssk_cd = &ssk->sk_cgrp_data; /* only the additional subflows created by kworkers have to be modified */ if (cgroup_id(sock_cgroup_ptr(sk_cd)) != cgroup_id(sock_cgroup_ptr(ssk_cd))) { cgroup_sk_free(ssk_cd); *ssk_cd = *sk_cd; cgroup_sk_clone(sk_cd); } #endif /* CONFIG_SOCK_CGROUP_DATA */ } static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) { __mptcp_inherit_cgrp_data(parent, child); if (mem_cgroup_sockets_enabled) 
mem_cgroup_sk_inherit(parent, child); } static void mptcp_subflow_ops_override(struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (ssk->sk_prot == &tcpv6_prot) ssk->sk_prot = &tcpv6_prot_override; else #endif ssk->sk_prot = &tcp_prot_override; } static void mptcp_subflow_ops_undo_override(struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (ssk->sk_prot == &tcpv6_prot_override) ssk->sk_prot = &tcpv6_prot; else #endif ssk->sk_prot = &tcp_prot; } int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, struct socket **new_sock) { struct mptcp_subflow_context *subflow; struct net *net = sock_net(sk); struct socket *sf; int err; /* un-accepted server sockets can reach here - on bad configuration * bail early to avoid greater trouble later */ if (unlikely(!sk->sk_socket)) return -EINVAL; err = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &sf); if (err) return err; lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING); err = security_mptcp_add_subflow(sk, sf->sk); if (err) goto err_free; /* the newly created socket has to be in the same cgroup as its parent */ mptcp_attach_cgroup(sk, sf->sk); /* kernel sockets do not by default acquire net ref, but TCP timer * needs it. * Update ns_tracker to current stack trace and refcounted tracker. */ sk_net_refcnt_upgrade(sf->sk); err = tcp_set_ulp(sf->sk, "mptcp"); if (err) goto err_free; mptcp_sockopt_sync_locked(mptcp_sk(sk), sf->sk); release_sock(sf->sk); /* the newly created socket really belongs to the owning MPTCP * socket, even if for additional subflows the allocation is performed * by a kernel workqueue. Adjust inode references, so that the * procfs/diag interfaces really show this one belonging to the correct * user. */ SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino; SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid; SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid; subflow = mptcp_subflow_ctx(sf->sk); pr_debug("subflow=%p\n", subflow); *new_sock = sf; sock_hold(sk); subflow->conn = sk; mptcp_subflow_ops_override(sf->sk); return 0; err_free: release_sock(sf->sk); sock_release(sf); return err; } static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, gfp_t priority) { struct inet_connection_sock *icsk = inet_csk(sk); struct mptcp_subflow_context *ctx; ctx = kzalloc(sizeof(*ctx), priority); if (!ctx) return NULL; rcu_assign_pointer(icsk->icsk_ulp_data, ctx); INIT_LIST_HEAD(&ctx->node); INIT_LIST_HEAD(&ctx->delegated_node); pr_debug("subflow=%p\n", ctx); ctx->tcp_sock = sk; WRITE_ONCE(ctx->local_id, -1); return ctx; } static void __subflow_state_change(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } static void subflow_state_change(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; __subflow_state_change(sk); if (subflow_simultaneous_connect(sk)) { WARN_ON_ONCE(!mptcp_try_fallback(sk, MPTCP_MIB_SIMULTCONNFALLBACK)); subflow->conn_finished = 1; mptcp_propagate_state(parent, sk, subflow, NULL); } /* as recvmsg() does not acquire the subflow socket for ssk selection * a fin packet carrying a DSS can be unnoticed if we don't trigger * the data available machinery here. 
*/ if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); else if (unlikely(sk->sk_err)) subflow_error_report(sk); subflow_sched_work_if_closed(mptcp_sk(parent), sk); } void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) { struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; struct request_sock *req, *head, *tail; struct mptcp_subflow_context *subflow; struct sock *sk, *ssk; /* Due to lock dependencies no relevant lock can be acquired under rskq_lock. * Splice the req list, so that accept() can not reach the pending ssk after * the listener socket is released below. */ spin_lock_bh(&queue->rskq_lock); head = queue->rskq_accept_head; tail = queue->rskq_accept_tail; queue->rskq_accept_head = NULL; queue->rskq_accept_tail = NULL; spin_unlock_bh(&queue->rskq_lock); if (!head) return; /* can't acquire the msk socket lock under the subflow one, * or will cause ABBA deadlock */ release_sock(listener_ssk); for (req = head; req; req = req->dl_next) { ssk = req->sk; if (!sk_is_mptcp(ssk)) continue; subflow = mptcp_subflow_ctx(ssk); if (!subflow || !subflow->conn) continue; sk = subflow->conn; sock_hold(sk); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); __mptcp_unaccepted_force_close(sk); release_sock(sk); /* lockdep will report a false positive ABBA deadlock * between cancel_work_sync and the listener socket. * The involved locks belong to different sockets WRT * the existing AB chain. * Using a per socket key is problematic as key * deregistration requires process context and must be * performed at socket disposal time, in atomic * context. * Just tell lockdep to consider the listener socket * released here. */ mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_); mptcp_cancel_work(sk); mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_); sock_put(sk); } /* we are still under the listener msk socket lock */ lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING); /* restore the listener queue, to let the TCP code clean it up */ spin_lock_bh(&queue->rskq_lock); WARN_ON_ONCE(queue->rskq_accept_head); queue->rskq_accept_head = head; queue->rskq_accept_tail = tail; spin_unlock_bh(&queue->rskq_lock); } static int subflow_ulp_init(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct mptcp_subflow_context *ctx; struct tcp_sock *tp = tcp_sk(sk); int err = 0; /* disallow attaching ULP to a socket unless it has been * created with sock_create_kern() */ if (!sk->sk_kern_sock) { err = -EOPNOTSUPP; goto out; } ctx = subflow_create_ctx(sk, GFP_KERNEL); if (!ctx) { err = -ENOMEM; goto out; } pr_debug("subflow=%p, family=%d\n", ctx, sk->sk_family); tp->is_mptcp = 1; ctx->icsk_af_ops = icsk->icsk_af_ops; icsk->icsk_af_ops = subflow_default_af_ops(sk); ctx->tcp_state_change = sk->sk_state_change; ctx->tcp_error_report = sk->sk_error_report; WARN_ON_ONCE(sk->sk_data_ready != sock_def_readable); WARN_ON_ONCE(sk->sk_write_space != sk_stream_write_space); sk->sk_data_ready = subflow_data_ready; sk->sk_write_space = subflow_write_space; sk->sk_state_change = subflow_state_change; sk->sk_error_report = subflow_error_report; out: return err; } static void subflow_ulp_release(struct sock *ssk) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); bool release = true; struct sock *sk; if (!ctx) return; sk = ctx->conn; if (sk) { /* if the msk has been orphaned, keep the ctx * alive, will be freed by __mptcp_close_ssk(), * when the subflow is still unaccepted */ release = ctx->disposable || list_empty(&ctx->node); /* 
inet_child_forget() does not call sk_state_change(), * explicitly trigger the socket close machinery */ if (!release && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags)) mptcp_schedule_work(sk); sock_put(sk); } mptcp_subflow_ops_undo_override(ssk); if (release) kfree_rcu(ctx, rcu); } static void subflow_ulp_clone(const struct request_sock *req, struct sock *newsk, const gfp_t priority) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk); struct mptcp_subflow_context *new_ctx; if (!tcp_rsk(req)->is_mptcp || (!subflow_req->mp_capable && !subflow_req->mp_join)) { subflow_ulp_fallback(newsk, old_ctx); return; } new_ctx = subflow_create_ctx(newsk, priority); if (!new_ctx) { subflow_ulp_fallback(newsk, old_ctx); return; } new_ctx->conn_finished = 1; new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; new_ctx->tcp_state_change = old_ctx->tcp_state_change; new_ctx->tcp_error_report = old_ctx->tcp_error_report; new_ctx->rel_write_seq = 1; if (subflow_req->mp_capable) { /* see comments in subflow_syn_recv_sock(), MPTCP connection * is fully established only after we receive the remote key */ new_ctx->mp_capable = 1; new_ctx->local_key = subflow_req->local_key; new_ctx->token = subflow_req->token; new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->idsn = subflow_req->idsn; /* this is the first subflow, id is always 0 */ subflow_set_local_id(new_ctx, 0); } else if (subflow_req->mp_join) { new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->mp_join = 1; WRITE_ONCE(new_ctx->fully_established, true); new_ctx->remote_key_valid = 1; new_ctx->backup = subflow_req->backup; new_ctx->request_bkup = subflow_req->request_bkup; WRITE_ONCE(new_ctx->remote_id, subflow_req->remote_id); new_ctx->token = subflow_req->token; new_ctx->thmac = subflow_req->thmac; /* the subflow req id is valid, fetched via subflow_check_req() * and subflow_token_join_request() */ subflow_set_local_id(new_ctx, subflow_req->local_id); } } static void tcp_release_cb_override(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); long status; /* process and clear all the pending actions, but leave the subflow into * the napi queue. To respect locking, only the same CPU that originated * the action can touch the list. mptcp_napi_poll will take care of it. */ status = set_mask_bits(&subflow->delegated_status, MPTCP_DELEGATE_ACTIONS_MASK, 0); if (status) mptcp_subflow_process_delegated(ssk, status); tcp_release_cb(ssk); } static int tcp_abort_override(struct sock *ssk, int err) { /* closing a listener subflow requires a great deal of care. 
* keep it simple and just prevent such operation */ if (inet_sk_state_load(ssk) == TCP_LISTEN) return -EINVAL; return tcp_abort(ssk, err); } static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { .name = "mptcp", .owner = THIS_MODULE, .init = subflow_ulp_init, .release = subflow_ulp_release, .clone = subflow_ulp_clone, }; static int subflow_ops_init(struct request_sock_ops *subflow_ops) { subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock); subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name, subflow_ops->obj_size, 0, SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, NULL); if (!subflow_ops->slab) return -ENOMEM; return 0; } void __init mptcp_subflow_init(void) { mptcp_subflow_v4_request_sock_ops = tcp_request_sock_ops; mptcp_subflow_v4_request_sock_ops.slab_name = "request_sock_subflow_v4"; mptcp_subflow_v4_request_sock_ops.destructor = subflow_v4_req_destructor; if (subflow_ops_init(&mptcp_subflow_v4_request_sock_ops) != 0) panic("MPTCP: failed to init subflow v4 request sock ops\n"); subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req; subflow_request_sock_ipv4_ops.send_synack = subflow_v4_send_synack; subflow_specific = ipv4_specific; subflow_specific.conn_request = subflow_v4_conn_request; subflow_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_specific.sk_rx_dst_set = subflow_finish_connect; subflow_specific.rebuild_header = subflow_rebuild_header; tcp_prot_override = tcp_prot; tcp_prot_override.release_cb = tcp_release_cb_override; tcp_prot_override.diag_destroy = tcp_abort_override; #ifdef CONFIG_BPF_SYSCALL /* Disable sockmap processing for subflows */ tcp_prot_override.psock_update_sk_prot = NULL; #endif #if IS_ENABLED(CONFIG_MPTCP_IPV6) /* In struct mptcp_subflow_request_sock, we assume the TCP request sock * structures for v4 and v6 have the same size. It should not changed in * the future but better to make sure to be warned if it is no longer * the case. 
*/ BUILD_BUG_ON(sizeof(struct tcp_request_sock) != sizeof(struct tcp6_request_sock)); mptcp_subflow_v6_request_sock_ops = tcp6_request_sock_ops; mptcp_subflow_v6_request_sock_ops.slab_name = "request_sock_subflow_v6"; mptcp_subflow_v6_request_sock_ops.destructor = subflow_v6_req_destructor; if (subflow_ops_init(&mptcp_subflow_v6_request_sock_ops) != 0) panic("MPTCP: failed to init subflow v6 request sock ops\n"); subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; subflow_request_sock_ipv6_ops.send_synack = subflow_v6_send_synack; subflow_v6_specific = ipv6_specific; subflow_v6_specific.conn_request = subflow_v6_conn_request; subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; subflow_v6_specific.rebuild_header = subflow_v6_rebuild_header; subflow_v6m_specific = subflow_v6_specific; subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; subflow_v6m_specific.send_check = ipv4_specific.send_check; subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; subflow_v6m_specific.rebuild_header = subflow_rebuild_header; tcpv6_prot_override = tcpv6_prot; tcpv6_prot_override.release_cb = tcp_release_cb_override; tcpv6_prot_override.diag_destroy = tcp_abort_override; #ifdef CONFIG_BPF_SYSCALL /* Disable sockmap processing for subflows */ tcpv6_prot_override.psock_update_sk_prot = NULL; #endif #endif mptcp_diag_subflow_init(&subflow_ulp_ops); if (tcp_register_ulp(&subflow_ulp_ops) != 0) panic("MPTCP: failed to register subflows to ULP\n"); }
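The subflow hooks registered above are only reached once a userspace program opens an MPTCP socket; the kernel then creates the first TCP subflow for it and attaches the "mptcp" ULP through the helpers in this file. A minimal, illustrative userspace sketch follows (not part of the sources in this report; demo_mptcp_connect() and the fallback IPPROTO_MPTCP definition are assumptions made only for the example).

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262	/* value from linux/in.h; older libcs may not define it */
#endif

/* demo_mptcp_connect() is an illustrative name, not a kernel or libc API */
static int demo_mptcp_connect(const char *ip, unsigned short port)
{
	struct sockaddr_in sa;
	int fd;

	/* opening the socket is what makes the kernel set up MPTCP state */
	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
	if (fd < 0) {
		perror("socket(IPPROTO_MPTCP)");
		return -1;
	}

	memset(&sa, 0, sizeof(sa));
	sa.sin_family = AF_INET;
	sa.sin_port = htons(port);
	if (inet_pton(AF_INET, ip, &sa.sin_addr) != 1) {
		close(fd);
		return -1;
	}

	/* connect() drives the initial subflow handshake in the kernel */
	if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
		perror("connect");
		close(fd);
		return -1;
	}
	return fd;
}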
// SPDX-License-Identifier: GPL-2.0-only
/*
 * debugfs code for HSR & PRP
 * Copyright (C) 2019 Texas Instruments Incorporated
 *
 * Author(s):
 *	Murali Karicheri <m-karicheri2@ti.com>
 */
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/debugfs.h>

#include "hsr_main.h"
#include "hsr_framereg.h"

static struct dentry *hsr_debugfs_root_dir;

/* hsr_node_table_show - Formats and prints node_table entries */
static int hsr_node_table_show(struct seq_file *sfp, void *data)
{
	struct hsr_priv *priv = (struct hsr_priv *)sfp->private;
	struct hsr_node *node;

	seq_printf(sfp, "Node Table entries for (%s) device\n",
		   (priv->prot_version == PRP_V1 ? "PRP" : "HSR"));
	seq_puts(sfp, "MAC-Address-A, MAC-Address-B, time_in[A], ");
	seq_puts(sfp, "time_in[B], Address-B port, ");
	if (priv->prot_version == PRP_V1)
		seq_puts(sfp, "SAN-A, SAN-B, DAN-P\n");
	else
		seq_puts(sfp, "DAN-H\n");

	rcu_read_lock();
	list_for_each_entry_rcu(node, &priv->node_db, mac_list) {
		/* skip self node */
		if (hsr_addr_is_self(priv, node->macaddress_A))
			continue;
		seq_printf(sfp, "%pM ", &node->macaddress_A[0]);
		seq_printf(sfp, "%pM ", &node->macaddress_B[0]);
		seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_A]);
		seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_B]);
		seq_printf(sfp, "%14x, ", node->addr_B_port);

		if (priv->prot_version == PRP_V1)
			seq_printf(sfp, "%5x, %5x, %5x\n",
				   node->san_a, node->san_b,
				   (node->san_a == 0 && node->san_b == 0));
		else
			seq_printf(sfp, "%5x\n", 1);
	}
	rcu_read_unlock();
	return 0;
}
DEFINE_SHOW_ATTRIBUTE(hsr_node_table);

void hsr_debugfs_rename(struct net_device *dev)
{
	struct hsr_priv *priv = netdev_priv(dev);
	int err;

	err = debugfs_change_name(priv->node_tbl_root, "%s", dev->name);
	if (err)
		netdev_warn(dev, "failed to rename\n");
}

/* hsr_debugfs_init - create hsr node_table file for dumping
 * the node table
 *
 * Description:
 * When debugfs is configured this routine sets up the node_table file per
 * hsr device for dumping the node_table entries
 */
void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev)
{
	struct dentry *de = NULL;

	de = debugfs_create_dir(hsr_dev->name, hsr_debugfs_root_dir);
	if (IS_ERR(de)) {
		pr_err("Cannot create hsr debugfs directory\n");
		return;
	}

	priv->node_tbl_root = de;

	de = debugfs_create_file("node_table", S_IFREG | 0444,
				 priv->node_tbl_root, priv,
				 &hsr_node_table_fops);
	if (IS_ERR(de)) {
		pr_err("Cannot create hsr node_table file\n");
		debugfs_remove(priv->node_tbl_root);
		priv->node_tbl_root = NULL;
		return;
	}
}

/* hsr_debugfs_term - Tear down debugfs infrastructure
 *
 * Description:
 * When Debugfs is configured this routine removes debugfs file system
 * elements that are specific to hsr
 */
void hsr_debugfs_term(struct hsr_priv *priv)
{
	debugfs_remove_recursive(priv->node_tbl_root);
	priv->node_tbl_root = NULL;
}

void hsr_debugfs_create_root(void)
{
	hsr_debugfs_root_dir = debugfs_create_dir("hsr", NULL);
	if (IS_ERR(hsr_debugfs_root_dir)) {
		pr_err("Cannot create hsr debugfs root directory\n");
		hsr_debugfs_root_dir = NULL;
	}
}

void hsr_debugfs_remove_root(void)
{
	/* debugfs_remove() internally checks NULL and ERROR */
	debugfs_remove(hsr_debugfs_root_dir);
}
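With debugfs mounted (conventionally at /sys/kernel/debug), the code above exposes one directory per HSR/PRP device containing a node_table file. A small illustrative reader follows; the hsr0 device name and the mount point are assumptions made for the example.

#include <stdio.h>

int main(void)
{
	char line[256];
	/* path assumes the default debugfs mount and a device named hsr0 */
	FILE *f = fopen("/sys/kernel/debug/hsr/hsr0/node_table", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* dump the node table exactly as hsr_node_table_show() formats it */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}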
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/proc/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/cache.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/kernel.h>
#include <linux/pid_namespace.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/completion.h>
#include <linux/poll.h>
#include <linux/printk.h>
#include <linux/file.h>
#include <linux/limits.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/mount.h>
#include <linux/bug.h>

#include "internal.h"

static void proc_evict_inode(struct inode *inode)
{
	struct ctl_table_header *head;
	struct proc_inode *ei = PROC_I(inode);

	truncate_inode_pages_final(&inode->i_data);
clear_inode(inode); /* Stop tracking associated processes */ if (ei->pid) proc_pid_evict_inode(ei); head = ei->sysctl; if (head) { WRITE_ONCE(ei->sysctl, NULL); proc_sys_evict_inode(inode, head); } } static struct kmem_cache *proc_inode_cachep __ro_after_init; static struct kmem_cache *pde_opener_cache __ro_after_init; static struct inode *proc_alloc_inode(struct super_block *sb) { struct proc_inode *ei; ei = alloc_inode_sb(sb, proc_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->pid = NULL; ei->fd = 0; ei->op.proc_get_link = NULL; ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; INIT_HLIST_NODE(&ei->sibling_inodes); ei->ns_ops = NULL; return &ei->vfs_inode; } static void proc_free_inode(struct inode *inode) { struct proc_inode *ei = PROC_I(inode); if (ei->pid) put_pid(ei->pid); /* Let go of any associated proc directory entry */ if (ei->pde) pde_put(ei->pde); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } static void init_once(void *foo) { struct proc_inode *ei = (struct proc_inode *) foo; inode_init_once(&ei->vfs_inode); } void __init proc_init_kmemcache(void) { proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT| SLAB_PANIC), init_once); pde_opener_cache = kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, SLAB_ACCOUNT|SLAB_PANIC, NULL); proc_dir_entry_cache = kmem_cache_create_usercopy( "proc_dir_entry", SIZEOF_PDE, 0, SLAB_PANIC, offsetof(struct proc_dir_entry, inline_name), SIZEOF_PDE_INLINE_NAME, NULL); BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE); } void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) { struct hlist_node *node; struct super_block *old_sb = NULL; rcu_read_lock(); while ((node = hlist_first_rcu(inodes))) { struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes); struct super_block *sb; struct inode *inode; spin_lock(lock); hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(lock); inode = &ei->vfs_inode; sb = inode->i_sb; if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active)) continue; inode = igrab(inode); rcu_read_unlock(); if (sb != old_sb) { if (old_sb) deactivate_super(old_sb); old_sb = sb; } if (unlikely(!inode)) { rcu_read_lock(); continue; } if (S_ISDIR(inode->i_mode)) { struct dentry *dir = d_find_any_alias(inode); if (dir) { d_invalidate(dir); dput(dir); } } else { struct dentry *dentry; while ((dentry = d_find_alias(inode))) { d_invalidate(dentry); dput(dentry); } } iput(inode); rcu_read_lock(); } rcu_read_unlock(); if (old_sb) deactivate_super(old_sb); } static inline const char *hidepid2str(enum proc_hidepid v) { switch (v) { case HIDEPID_OFF: return "off"; case HIDEPID_NO_ACCESS: return "noaccess"; case HIDEPID_INVISIBLE: return "invisible"; case HIDEPID_NOT_PTRACEABLE: return "ptraceable"; } WARN_ONCE(1, "bad hide_pid value: %d\n", v); return "unknown"; } static int proc_show_options(struct seq_file *seq, struct dentry *root) { struct proc_fs_info *fs_info = proc_sb_info(root->d_sb); if (!gid_eq(fs_info->pid_gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, fs_info->pid_gid)); if (fs_info->hide_pid != HIDEPID_OFF) seq_printf(seq, ",hidepid=%s", hidepid2str(fs_info->hide_pid)); if (fs_info->pidonly != PROC_PIDONLY_OFF) seq_printf(seq, ",subset=pid"); return 0; } const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .free_inode = proc_free_inode, .drop_inode = inode_just_drop, .evict_inode = proc_evict_inode, .statfs = 
simple_statfs, .show_options = proc_show_options, }; enum {BIAS = -1U<<31}; static inline int use_pde(struct proc_dir_entry *pde) { return likely(atomic_inc_unless_negative(&pde->in_use)); } static void unuse_pde(struct proc_dir_entry *pde) { if (unlikely(atomic_dec_return(&pde->in_use) == BIAS)) complete(pde->pde_unload_completion); } /* * At most 2 contexts can enter this function: the one doing the last * close on the descriptor and whoever is deleting PDE itself. * * First to enter calls ->proc_release hook and signals its completion * to the second one which waits and then does nothing. * * PDE is locked on entry, unlocked on exit. */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) __releases(&pde->pde_unload_lock) { /* * close() (proc_reg_release()) can't delete an entry and proceed: * ->release hook needs to be available at the right moment. * * rmmod (remove_proc_entry() et al) can't delete an entry and proceed: * "struct file" needs to be available at the right moment. */ if (pdeo->closing) { /* somebody else is doing that, just wait */ DECLARE_COMPLETION_ONSTACK(c); pdeo->c = &c; spin_unlock(&pde->pde_unload_lock); wait_for_completion(&c); } else { struct file *file; struct completion *c; pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); file = pdeo->file; pde->proc_ops->proc_release(file_inode(file), file); spin_lock(&pde->pde_unload_lock); /* Strictly after ->proc_release, see above. */ list_del(&pdeo->lh); c = pdeo->c; spin_unlock(&pde->pde_unload_lock); if (unlikely(c)) complete(c); kmem_cache_free(pde_opener_cache, pdeo); } } void proc_entry_rundown(struct proc_dir_entry *de) { DECLARE_COMPLETION_ONSTACK(c); /* Wait until all existing callers into module are done. */ de->pde_unload_completion = &c; if (atomic_add_return(BIAS, &de->in_use) != BIAS) wait_for_completion(&c); /* ->pde_openers list can't grow from now on. 
*/ spin_lock(&de->pde_unload_lock); while (!list_empty(&de->pde_openers)) { struct pde_opener *pdeo; pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); close_pdeo(de, pdeo); spin_lock(&de->pde_unload_lock); } spin_unlock(&de->pde_unload_lock); } static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) { struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; if (pde_is_permanent(pde)) { return pde->proc_ops->proc_lseek(file, offset, whence); } else if (use_pde(pde)) { rv = pde->proc_ops->proc_lseek(file, offset, whence); unuse_pde(pde); } return rv; } static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct proc_dir_entry *pde = PDE(file_inode(iocb->ki_filp)); ssize_t ret; if (pde_is_permanent(pde)) return pde->proc_ops->proc_read_iter(iocb, iter); if (!use_pde(pde)) return -EIO; ret = pde->proc_ops->proc_read_iter(iocb, iter); unuse_pde(pde); return ret; } static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos) { __auto_type read = pde->proc_ops->proc_read; if (read) return read(file, buf, count, ppos); return -EIO; } static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (pde_is_permanent(pde)) { return pde_read(pde, file, buf, count, ppos); } else if (use_pde(pde)) { rv = pde_read(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos) { __auto_type write = pde->proc_ops->proc_write; if (write) return write(file, buf, count, ppos); return -EIO; } static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (pde_is_permanent(pde)) { return pde_write(pde, file, buf, count, ppos); } else if (use_pde(pde)) { rv = pde_write(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts) { __auto_type poll = pde->proc_ops->proc_poll; if (poll) return poll(file, pts); return DEFAULT_POLLMASK; } static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) { struct proc_dir_entry *pde = PDE(file_inode(file)); __poll_t rv = DEFAULT_POLLMASK; if (pde_is_permanent(pde)) { return pde_poll(pde, file, pts); } else if (use_pde(pde)) { rv = pde_poll(pde, file, pts); unuse_pde(pde); } return rv; } static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) { __auto_type ioctl = pde->proc_ops->proc_ioctl; if (ioctl) return ioctl(file, cmd, arg); return -ENOTTY; } static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; if (pde_is_permanent(pde)) { return pde_ioctl(pde, file, cmd, arg); } else if (use_pde(pde)) { rv = pde_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #ifdef CONFIG_COMPAT static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) { __auto_type compat_ioctl = pde->proc_ops->proc_compat_ioctl; if (compat_ioctl) return compat_ioctl(file, cmd, arg); return -ENOTTY; } static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct 
proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; if (pde_is_permanent(pde)) { return pde_compat_ioctl(pde, file, cmd, arg); } else if (use_pde(pde)) { rv = pde_compat_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #endif static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma) { __auto_type mmap = pde->proc_ops->proc_mmap; if (mmap) return mmap(file, vma); return -EIO; } static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) { struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; if (pde_is_permanent(pde)) { return pde_mmap(pde, file, vma); } else if (use_pde(pde)) { rv = pde_mmap(pde, file, vma); unuse_pde(pde); } return rv; } static unsigned long pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { if (pde->proc_ops->proc_get_unmapped_area) return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags); #ifdef CONFIG_MMU return mm_get_unmapped_area(current->mm, file, orig_addr, len, pgoff, flags); #endif return orig_addr; } static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct proc_dir_entry *pde = PDE(file_inode(file)); unsigned long rv = -EIO; if (pde_is_permanent(pde)) { return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); } else if (use_pde(pde)) { rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); unuse_pde(pde); } return rv; } static int proc_reg_open(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); int rv = 0; typeof_member(struct proc_ops, proc_open) open; struct pde_opener *pdeo; if (!pde_has_proc_lseek(pde)) file->f_mode &= ~FMODE_LSEEK; if (pde_is_permanent(pde)) { open = pde->proc_ops->proc_open; if (open) rv = open(inode, file); return rv; } /* * Ensure that * 1) PDE's ->release hook will be called no matter what * either normally by close()/->release, or forcefully by * rmmod/remove_proc_entry. * * 2) rmmod isn't blocked by opening file in /proc and sitting on * the descriptor (including "rmmod foo </proc/foo" scenario). * * Save every "struct file" with custom ->release hook. */ if (!use_pde(pde)) return -ENOENT; __auto_type release = pde->proc_ops->proc_release; if (release) { pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL); if (!pdeo) { rv = -ENOMEM; goto out_unuse; } } open = pde->proc_ops->proc_open; if (open) rv = open(inode, file); if (release) { if (rv == 0) { /* To know what to release. 
*/ pdeo->file = file; pdeo->closing = false; pdeo->c = NULL; spin_lock(&pde->pde_unload_lock); list_add(&pdeo->lh, &pde->pde_openers); spin_unlock(&pde->pde_unload_lock); } else kmem_cache_free(pde_opener_cache, pdeo); } out_unuse: unuse_pde(pde); return rv; } static int proc_reg_release(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); struct pde_opener *pdeo; if (pde_is_permanent(pde)) { __auto_type release = pde->proc_ops->proc_release; if (release) { return release(inode, file); } return 0; } spin_lock(&pde->pde_unload_lock); list_for_each_entry(pdeo, &pde->pde_openers, lh) { if (pdeo->file == file) { close_pdeo(pde, pdeo); return 0; } } spin_unlock(&pde->pde_unload_lock); return 0; } static const struct file_operations proc_reg_file_ops = { .llseek = proc_reg_llseek, .read = proc_reg_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; static const struct file_operations proc_iter_file_ops = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, .write = proc_reg_write, .splice_read = copy_splice_read, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; #ifdef CONFIG_COMPAT static const struct file_operations proc_reg_file_ops_compat = { .llseek = proc_reg_llseek, .read = proc_reg_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .compat_ioctl = proc_reg_compat_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; static const struct file_operations proc_iter_file_ops_compat = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, .splice_read = copy_splice_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .compat_ioctl = proc_reg_compat_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; #endif static void proc_put_link(void *p) { unuse_pde(p); } static const char *proc_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct proc_dir_entry *pde = PDE(inode); if (!use_pde(pde)) return ERR_PTR(-EINVAL); set_delayed_call(done, proc_put_link, pde); return pde->data; } const struct inode_operations proc_link_inode_operations = { .get_link = proc_get_link, }; struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { struct inode *inode = new_inode(sb); if (!inode) { pde_put(de); return NULL; } inode->i_private = de->data; inode->i_ino = de->low_ino; simple_inode_init_ts(inode); PROC_I(inode)->pde = de; if (is_empty_pde(de)) { make_empty_dir_inode(inode); return inode; } if (de->mode) { inode->i_mode = de->mode; inode->i_uid = de->uid; inode->i_gid = de->gid; } if (de->size) inode->i_size = de->size; if (de->nlink) set_nlink(inode, de->nlink); if (S_ISREG(inode->i_mode)) { inode->i_op = de->proc_iops; if (pde_has_proc_read_iter(de)) inode->i_fop = &proc_iter_file_ops; else inode->i_fop = &proc_reg_file_ops; #ifdef CONFIG_COMPAT if (pde_has_proc_compat_ioctl(de)) { if (pde_has_proc_read_iter(de)) inode->i_fop = &proc_iter_file_ops_compat; else inode->i_fop = &proc_reg_file_ops_compat; } #endif } else if (S_ISDIR(inode->i_mode)) { 
inode->i_op = de->proc_iops; inode->i_fop = de->proc_dir_ops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = de->proc_iops; inode->i_fop = NULL; } else { BUG(); } return inode; }
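/*
 * Hedged example (not part of fs/proc itself): a minimal out-of-tree module
 * that registers a /proc entry through proc_create(). Its .proc_open and
 * .proc_release hooks are the ones that proc_reg_open()/proc_reg_release()
 * above wrap with use_pde()/unuse_pde() and the pde_opener tracking. All
 * demo_* names are invented for illustration.
 */
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
	seq_puts(m, "hello from /proc/demo\n");
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	/* seq_file's single_open() provides the read path. */
	return single_open(file, demo_show, NULL);
}

static const struct proc_ops demo_proc_ops = {
	.proc_open	= demo_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= single_release,	/* tracked via pde_openers above */
};

static struct proc_dir_entry *demo_pde;

static int __init demo_init(void)
{
	demo_pde = proc_create("demo", 0444, NULL, &demo_proc_ops);
	return demo_pde ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
	proc_remove(demo_pde);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_DESCRIPTION("Hypothetical /proc usage sketch");
MODULE_LICENSE("GPL");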
// SPDX-License-Identifier: GPL-2.0 /* * x86 single-step support code, common to 32-bit and 64-bit. */ #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/ptrace.h> #include <asm/desc.h> #include <asm/debugreg.h> #include <asm/mmu_context.h> unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) { unsigned long addr, seg; addr = regs->ip; seg = regs->cs; if (v8086_mode(regs)) { addr = (addr & 0xffff) + (seg << 4); return addr; } #ifdef CONFIG_MODIFY_LDT_SYSCALL /* * We'll assume that the code segments in the GDT * are all zero-based. That is largely true: the * TLS segments are used for data, and the PNPBIOS * and APM bios ones we just ignore here. */ if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { struct desc_struct *desc; unsigned long base; seg >>= 3; mutex_lock(&child->mm->context.lock); if (unlikely(!child->mm->context.ldt || seg >= child->mm->context.ldt->nr_entries)) addr = -1L; /* bogus selector, access would fault */ else { desc = &child->mm->context.ldt->entries[seg]; base = get_desc_base(desc); /* 16-bit code segment? */ if (!desc->d) addr &= 0xffff; addr += base; } mutex_unlock(&child->mm->context.lock); } #endif return addr; } static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) { int i, copied; unsigned char opcode[15]; unsigned long addr = convert_ip_to_linear(child, regs); copied = access_process_vm(child, addr, opcode, sizeof(opcode), FOLL_FORCE); for (i = 0; i < copied; i++) { switch (opcode[i]) { /* popf and iret */ case 0x9d: case 0xcf: return 1; /* CHECKME: 64 65 */ /* opcode and address size prefixes */ case 0x66: case 0x67: continue; /* irrelevant prefixes (segment overrides and repeats) */ case 0x26: case 0x2e: case 0x36: case 0x3e: case 0x64: case 0x65: case 0xf0: case 0xf2: case 0xf3: continue; #ifdef CONFIG_X86_64 case 0x40 ... 0x4f: if (!user_64bit_mode(regs)) /* 32-bit mode: register increment */ return 0; /* 64-bit mode: REX prefix */ continue; #endif /* CHECKME: f2, f3 */ /* * pushf: NOTE! We should probably not let * the user see the TF bit being set. But * it's more pain than it's worth to avoid * it, and a debugger could emulate this * all in user space if it _really_ cares. */ case 0x9c: default: return 0; } } return 0; } /* * Enable single-stepping. Return nonzero if user mode is not using TF itself. */ static int enable_single_step(struct task_struct *child) { struct pt_regs *regs = task_pt_regs(child); unsigned long oflags; /* * If we stepped into a sysenter/syscall insn, it trapped in * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
* If user-mode had set TF itself, then it's still clear from * do_debug() and we need to set it again to restore the user * state so we don't wrongly set TIF_FORCED_TF below. * If enable_single_step() was used last and that is what * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are * already set and our bookkeeping is fine. */ if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP))) regs->flags |= X86_EFLAGS_TF; /* * Always set TIF_SINGLESTEP. This will also * cause us to set TF when returning to user mode. */ set_tsk_thread_flag(child, TIF_SINGLESTEP); /* * Ensure that a trap is triggered once stepping out of a system * call prior to executing any user instruction. */ set_task_syscall_work(child, SYSCALL_EXIT_TRAP); oflags = regs->flags; /* Set TF on the kernel stack.. */ regs->flags |= X86_EFLAGS_TF; /* * ..but if TF is changed by the instruction we will trace, * don't mark it as being "us" that set it, so that we * won't clear it by hand later. * * Note that if we don't actually execute the popf because * of a signal arriving right now or suchlike, we will lose * track of the fact that it really was "us" that set it. */ if (is_setting_trap_flag(child, regs)) { clear_tsk_thread_flag(child, TIF_FORCED_TF); return 0; } /* * If TF was already set, check whether it was us who set it. * If not, we should never attempt a block step. */ if (oflags & X86_EFLAGS_TF) return test_tsk_thread_flag(child, TIF_FORCED_TF); set_tsk_thread_flag(child, TIF_FORCED_TF); return 1; } void set_task_blockstep(struct task_struct *task, bool on) { unsigned long debugctl; /* * Ensure irq/preemption can't change debugctl in between. * Note also that both TIF_BLOCKSTEP and debugctl should * be changed atomically wrt preemption. * * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if * task is current or it can't be running, otherwise we can race * with __switch_to_xtra(). We rely on ptrace_freeze_traced(). */ local_irq_disable(); debugctl = get_debugctlmsr(); if (on) { debugctl |= DEBUGCTLMSR_BTF; set_tsk_thread_flag(task, TIF_BLOCKSTEP); } else { debugctl &= ~DEBUGCTLMSR_BTF; clear_tsk_thread_flag(task, TIF_BLOCKSTEP); } if (task == current) update_debugctlmsr(debugctl); local_irq_enable(); } /* * Enable single or block step. */ static void enable_step(struct task_struct *child, bool block) { /* * Make sure block stepping (BTF) is not enabled unless it should be. * Note that we don't try to worry about any is_setting_trap_flag() * instructions after the first when using block stepping. * So no one should try to use debugger block stepping in a program * that uses user-mode single stepping itself. */ if (enable_single_step(child) && block) set_task_blockstep(child, true); else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) set_task_blockstep(child, false); } void user_enable_single_step(struct task_struct *child) { enable_step(child, 0); } void user_enable_block_step(struct task_struct *child) { enable_step(child, 1); } void user_disable_single_step(struct task_struct *child) { /* * Make sure block stepping (BTF) is disabled. */ if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) set_task_blockstep(child, false); /* Always clear TIF_SINGLESTEP... */ clear_tsk_thread_flag(child, TIF_SINGLESTEP); clear_task_syscall_work(child, SYSCALL_EXIT_TRAP); /* But touch TF only if it was set by us.. */ if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF)) task_pt_regs(child)->flags &= ~X86_EFLAGS_TF; }
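/*
 * Hedged user-space sketch (not kernel code): PTRACE_SINGLESTEP is the
 * ptrace request that lands in user_enable_single_step() above, and
 * PTRACE_CONT ends up in user_disable_single_step(). The child below does
 * nothing useful; it only exists so the parent has something to step.
 */
#include <signal.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {
		/* Child: request tracing, then stop so the parent can take over. */
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);
		_exit(0);
	}

	waitpid(pid, NULL, 0);			/* wait for the SIGSTOP */

	for (int i = 0; i < 5; i++) {
		/* Each request sets TF for the child via enable_single_step(). */
		if (ptrace(PTRACE_SINGLESTEP, pid, NULL, NULL) == -1)
			break;
		waitpid(pid, NULL, 0);		/* child traps after one instruction */
	}

	ptrace(PTRACE_CONT, pid, NULL, NULL);	/* clears single-step state */
	waitpid(pid, NULL, 0);
	return 0;
}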
// SPDX-License-Identifier: GPL-2.0-only /* * * Author Karsten Keil <kkeil@novell.com> * * Copyright 2008 by Karsten Keil <kkeil@novell.com> */ #include <linux/slab.h> #include <linux/mISDNif.h> #include <linux/kthread.h> #include <linux/sched.h> #include <linux/sched/cputime.h> #include <linux/signal.h> #include "core.h" static u_int *debug; static inline void _queue_message(struct mISDNstack *st, struct sk_buff *skb) { struct mISDNhead *hh = mISDN_HEAD_P(skb); if (*debug & DEBUG_QUEUE_FUNC) printk(KERN_DEBUG "%s prim(%x) id(%x) %p\n", __func__, hh->prim, hh->id, skb); skb_queue_tail(&st->msgq, skb); if (likely(!test_bit(mISDN_STACK_STOPPED, &st->status))) { test_and_set_bit(mISDN_STACK_WORK, &st->status); wake_up_interruptible(&st->workq); } } static int mISDN_queue_message(struct mISDNchannel *ch, struct sk_buff *skb) { _queue_message(ch->st, skb); return 0; } static struct mISDNchannel * get_channel4id(struct mISDNstack *st, u_int id) { struct mISDNchannel *ch; mutex_lock(&st->lmutex); list_for_each_entry(ch,
&st->layer2, list) { if (id == ch->nr) goto unlock; } ch = NULL; unlock: mutex_unlock(&st->lmutex); return ch; } static void send_socklist(struct mISDN_sock_list *sl, struct sk_buff *skb) { struct sock *sk; struct sk_buff *cskb = NULL; read_lock(&sl->lock); sk_for_each(sk, &sl->head) { if (sk->sk_state != MISDN_BOUND) continue; if (!cskb) cskb = skb_copy(skb, GFP_ATOMIC); if (!cskb) { printk(KERN_WARNING "%s no skb\n", __func__); break; } if (!sock_queue_rcv_skb(sk, cskb)) cskb = NULL; } read_unlock(&sl->lock); dev_kfree_skb(cskb); } static void send_layer2(struct mISDNstack *st, struct sk_buff *skb) { struct sk_buff *cskb; struct mISDNhead *hh = mISDN_HEAD_P(skb); struct mISDNchannel *ch; int ret; if (!st) return; mutex_lock(&st->lmutex); if ((hh->id & MISDN_ID_ADDR_MASK) == MISDN_ID_ANY) { /* L2 for all */ list_for_each_entry(ch, &st->layer2, list) { if (list_is_last(&ch->list, &st->layer2)) { cskb = skb; skb = NULL; } else { cskb = skb_copy(skb, GFP_KERNEL); } if (cskb) { ret = ch->send(ch, cskb); if (ret) { if (*debug & DEBUG_SEND_ERR) printk(KERN_DEBUG "%s ch%d prim(%x) addr(%x)" " err %d\n", __func__, ch->nr, hh->prim, ch->addr, ret); dev_kfree_skb(cskb); } } else { printk(KERN_WARNING "%s ch%d addr %x no mem\n", __func__, ch->nr, ch->addr); goto out; } } } else { list_for_each_entry(ch, &st->layer2, list) { if ((hh->id & MISDN_ID_ADDR_MASK) == ch->addr) { ret = ch->send(ch, skb); if (!ret) skb = NULL; goto out; } } ret = st->dev->teimgr->ctrl(st->dev->teimgr, CHECK_DATA, skb); if (!ret) skb = NULL; else if (*debug & DEBUG_SEND_ERR) printk(KERN_DEBUG "%s mgr prim(%x) err %d\n", __func__, hh->prim, ret); } out: mutex_unlock(&st->lmutex); dev_kfree_skb(skb); } static inline int send_msg_to_layer(struct mISDNstack *st, struct sk_buff *skb) { struct mISDNhead *hh = mISDN_HEAD_P(skb); struct mISDNchannel *ch; int lm; lm = hh->prim & MISDN_LAYERMASK; if (*debug & DEBUG_QUEUE_FUNC) printk(KERN_DEBUG "%s prim(%x) id(%x) %p\n", __func__, hh->prim, hh->id, skb); if (lm == 0x1) { if (!hlist_empty(&st->l1sock.head)) { __net_timestamp(skb); send_socklist(&st->l1sock, skb); } return st->layer1->send(st->layer1, skb); } else if (lm == 0x2) { if (!hlist_empty(&st->l1sock.head)) send_socklist(&st->l1sock, skb); send_layer2(st, skb); return 0; } else if (lm == 0x4) { ch = get_channel4id(st, hh->id); if (ch) return ch->send(ch, skb); else printk(KERN_WARNING "%s: dev(%s) prim(%x) id(%x) no channel\n", __func__, dev_name(&st->dev->dev), hh->prim, hh->id); } else if (lm == 0x8) { WARN_ON(lm == 0x8); ch = get_channel4id(st, hh->id); if (ch) return ch->send(ch, skb); else printk(KERN_WARNING "%s: dev(%s) prim(%x) id(%x) no channel\n", __func__, dev_name(&st->dev->dev), hh->prim, hh->id); } else { /* broadcast not handled yet */ printk(KERN_WARNING "%s: dev(%s) prim %x not delivered\n", __func__, dev_name(&st->dev->dev), hh->prim); } return -ESRCH; } static void do_clear_stack(struct mISDNstack *st) { } static int mISDNStackd(void *data) { struct mISDNstack *st = data; #ifdef MISDN_MSG_STATS u64 utime, stime; #endif int err = 0; sigfillset(&current->blocked); if (*debug & DEBUG_MSG_THREAD) printk(KERN_DEBUG "mISDNStackd %s started\n", dev_name(&st->dev->dev)); if (st->notify != NULL) { complete(st->notify); st->notify = NULL; } for (;;) { struct sk_buff *skb; if (unlikely(test_bit(mISDN_STACK_STOPPED, &st->status))) { test_and_clear_bit(mISDN_STACK_WORK, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); } else test_and_set_bit(mISDN_STACK_RUNNING, &st->status); while 
(test_bit(mISDN_STACK_WORK, &st->status)) { skb = skb_dequeue(&st->msgq); if (!skb) { test_and_clear_bit(mISDN_STACK_WORK, &st->status); /* test if a race happens */ skb = skb_dequeue(&st->msgq); if (!skb) continue; test_and_set_bit(mISDN_STACK_WORK, &st->status); } #ifdef MISDN_MSG_STATS st->msg_cnt++; #endif err = send_msg_to_layer(st, skb); if (unlikely(err)) { if (*debug & DEBUG_SEND_ERR) printk(KERN_DEBUG "%s: %s prim(%x) id(%x) " "send call(%d)\n", __func__, dev_name(&st->dev->dev), mISDN_HEAD_PRIM(skb), mISDN_HEAD_ID(skb), err); dev_kfree_skb(skb); continue; } if (unlikely(test_bit(mISDN_STACK_STOPPED, &st->status))) { test_and_clear_bit(mISDN_STACK_WORK, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); break; } } if (test_bit(mISDN_STACK_CLEARING, &st->status)) { test_and_set_bit(mISDN_STACK_STOPPED, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); do_clear_stack(st); test_and_clear_bit(mISDN_STACK_CLEARING, &st->status); test_and_set_bit(mISDN_STACK_RESTART, &st->status); } if (test_and_clear_bit(mISDN_STACK_RESTART, &st->status)) { test_and_clear_bit(mISDN_STACK_STOPPED, &st->status); test_and_set_bit(mISDN_STACK_RUNNING, &st->status); if (!skb_queue_empty(&st->msgq)) test_and_set_bit(mISDN_STACK_WORK, &st->status); } if (test_bit(mISDN_STACK_ABORT, &st->status)) break; if (st->notify != NULL) { complete(st->notify); st->notify = NULL; } #ifdef MISDN_MSG_STATS st->sleep_cnt++; #endif test_and_clear_bit(mISDN_STACK_ACTIVE, &st->status); wait_event_interruptible(st->workq, (st->status & mISDN_STACK_ACTION_MASK)); if (*debug & DEBUG_MSG_THREAD) printk(KERN_DEBUG "%s: %s wake status %08lx\n", __func__, dev_name(&st->dev->dev), st->status); test_and_set_bit(mISDN_STACK_ACTIVE, &st->status); test_and_clear_bit(mISDN_STACK_WAKEUP, &st->status); if (test_bit(mISDN_STACK_STOPPED, &st->status)) { test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); #ifdef MISDN_MSG_STATS st->stopped_cnt++; #endif } } #ifdef MISDN_MSG_STATS printk(KERN_DEBUG "mISDNStackd daemon for %s proceed %d " "msg %d sleep %d stopped\n", dev_name(&st->dev->dev), st->msg_cnt, st->sleep_cnt, st->stopped_cnt); task_cputime(st->thread, &utime, &stime); printk(KERN_DEBUG "mISDNStackd daemon for %s utime(%llu) stime(%llu)\n", dev_name(&st->dev->dev), utime, stime); printk(KERN_DEBUG "mISDNStackd daemon for %s nvcsw(%ld) nivcsw(%ld)\n", dev_name(&st->dev->dev), st->thread->nvcsw, st->thread->nivcsw); printk(KERN_DEBUG "mISDNStackd daemon for %s killed now\n", dev_name(&st->dev->dev)); #endif test_and_set_bit(mISDN_STACK_KILLED, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); test_and_clear_bit(mISDN_STACK_ACTIVE, &st->status); test_and_clear_bit(mISDN_STACK_ABORT, &st->status); skb_queue_purge(&st->msgq); st->thread = NULL; if (st->notify != NULL) { complete(st->notify); st->notify = NULL; } return 0; } static int l1_receive(struct mISDNchannel *ch, struct sk_buff *skb) { if (!ch->st) return -ENODEV; __net_timestamp(skb); _queue_message(ch->st, skb); return 0; } void set_channel_address(struct mISDNchannel *ch, u_int sapi, u_int tei) { ch->addr = sapi | (tei << 8); } void __add_layer2(struct mISDNchannel *ch, struct mISDNstack *st) { list_add_tail(&ch->list, &st->layer2); } void add_layer2(struct mISDNchannel *ch, struct mISDNstack *st) { mutex_lock(&st->lmutex); __add_layer2(ch, st); mutex_unlock(&st->lmutex); } static int st_own_ctrl(struct mISDNchannel *ch, u_int cmd, void *arg) { if (!ch->st || !ch->st->layer1) return -EINVAL; return 
ch->st->layer1->ctrl(ch->st->layer1, cmd, arg); } int create_stack(struct mISDNdevice *dev) { struct mISDNstack *newst; int err; DECLARE_COMPLETION_ONSTACK(done); newst = kzalloc(sizeof(struct mISDNstack), GFP_KERNEL); if (!newst) { printk(KERN_ERR "kmalloc mISDN_stack failed\n"); return -ENOMEM; } newst->dev = dev; INIT_LIST_HEAD(&newst->layer2); INIT_HLIST_HEAD(&newst->l1sock.head); rwlock_init(&newst->l1sock.lock); init_waitqueue_head(&newst->workq); skb_queue_head_init(&newst->msgq); mutex_init(&newst->lmutex); dev->D.st = newst; err = create_teimanager(dev); if (err) { printk(KERN_ERR "kmalloc teimanager failed\n"); kfree(newst); return err; } dev->teimgr->peer = &newst->own; dev->teimgr->recv = mISDN_queue_message; dev->teimgr->st = newst; newst->layer1 = &dev->D; dev->D.recv = l1_receive; dev->D.peer = &newst->own; newst->own.st = newst; newst->own.ctrl = st_own_ctrl; newst->own.send = mISDN_queue_message; newst->own.recv = mISDN_queue_message; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s)\n", __func__, dev_name(&newst->dev->dev)); newst->notify = &done; newst->thread = kthread_run(mISDNStackd, (void *)newst, "mISDN_%s", dev_name(&newst->dev->dev)); if (IS_ERR(newst->thread)) { err = PTR_ERR(newst->thread); printk(KERN_ERR "mISDN:cannot create kernel thread for %s (%d)\n", dev_name(&newst->dev->dev), err); delete_teimanager(dev->teimgr); kfree(newst); } else wait_for_completion(&done); return err; } int connect_layer1(struct mISDNdevice *dev, struct mISDNchannel *ch, u_int protocol, struct sockaddr_mISDN *adr) { struct mISDN_sock *msk = container_of(ch, struct mISDN_sock, ch); struct channel_req rq; int err; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); switch (protocol) { case ISDN_P_NT_S0: case ISDN_P_NT_E1: case ISDN_P_TE_S0: case ISDN_P_TE_E1: ch->recv = mISDN_queue_message; ch->peer = &dev->D.st->own; ch->st = dev->D.st; rq.protocol = protocol; rq.adr.channel = adr->channel; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret %d (dev %d)\n", __func__, err, dev->id); if (err) return err; write_lock_bh(&dev->D.st->l1sock.lock); sk_add_node(&msk->sk, &dev->D.st->l1sock.head); write_unlock_bh(&dev->D.st->l1sock.lock); break; default: return -ENOPROTOOPT; } return 0; } int connect_Bstack(struct mISDNdevice *dev, struct mISDNchannel *ch, u_int protocol, struct sockaddr_mISDN *adr) { struct channel_req rq, rq2; int pmask, err; struct Bprotocol *bp; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); ch->st = dev->D.st; pmask = 1 << (protocol & ISDN_P_B_MASK); if (pmask & dev->Bprotocols) { rq.protocol = protocol; rq.adr = *adr; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); if (err) return err; ch->recv = rq.ch->send; ch->peer = rq.ch; rq.ch->recv = ch->send; rq.ch->peer = ch; rq.ch->st = dev->D.st; } else { bp = get_Bprotocol4mask(pmask); if (!bp) return -ENOPROTOOPT; rq2.protocol = protocol; rq2.adr = *adr; rq2.ch = ch; err = bp->create(&rq2); if (err) return err; ch->recv = rq2.ch->send; ch->peer = rq2.ch; rq2.ch->st = dev->D.st; rq.protocol = rq2.protocol; rq.adr = *adr; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); if (err) { rq2.ch->ctrl(rq2.ch, CLOSE_CHANNEL, NULL); return err; } rq2.ch->recv = rq.ch->send; rq2.ch->peer = rq.ch; rq.ch->recv = rq2.ch->send; rq.ch->peer = rq2.ch; rq.ch->st = 
dev->D.st; } ch->protocol = protocol; ch->nr = rq.ch->nr; return 0; } int create_l2entity(struct mISDNdevice *dev, struct mISDNchannel *ch, u_int protocol, struct sockaddr_mISDN *adr) { struct channel_req rq; int err; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); rq.protocol = ISDN_P_TE_S0; if (dev->Dprotocols & (1 << ISDN_P_TE_E1)) rq.protocol = ISDN_P_TE_E1; switch (protocol) { case ISDN_P_LAPD_NT: rq.protocol = ISDN_P_NT_S0; if (dev->Dprotocols & (1 << ISDN_P_NT_E1)) rq.protocol = ISDN_P_NT_E1; fallthrough; case ISDN_P_LAPD_TE: ch->recv = mISDN_queue_message; ch->peer = &dev->D.st->own; ch->st = dev->D.st; rq.adr.channel = 0; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret 1 %d\n", __func__, err); if (err) break; rq.protocol = protocol; rq.adr = *adr; rq.ch = ch; err = dev->teimgr->ctrl(dev->teimgr, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret 2 %d\n", __func__, err); if (!err) { if ((protocol == ISDN_P_LAPD_NT) && !rq.ch) break; add_layer2(rq.ch, dev->D.st); rq.ch->recv = mISDN_queue_message; rq.ch->peer = &dev->D.st->own; rq.ch->ctrl(rq.ch, OPEN_CHANNEL, NULL); /* can't fail */ } break; default: err = -EPROTONOSUPPORT; } return err; } void delete_channel(struct mISDNchannel *ch) { struct mISDN_sock *msk = container_of(ch, struct mISDN_sock, ch); struct mISDNchannel *pch; if (!ch->st) { printk(KERN_WARNING "%s: no stack\n", __func__); return; } if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s) protocol(%x)\n", __func__, dev_name(&ch->st->dev->dev), ch->protocol); if (ch->protocol >= ISDN_P_B_START) { if (ch->peer) { ch->peer->ctrl(ch->peer, CLOSE_CHANNEL, NULL); ch->peer = NULL; } return; } switch (ch->protocol) { case ISDN_P_NT_S0: case ISDN_P_TE_S0: case ISDN_P_NT_E1: case ISDN_P_TE_E1: write_lock_bh(&ch->st->l1sock.lock); sk_del_node_init(&msk->sk); write_unlock_bh(&ch->st->l1sock.lock); ch->st->dev->D.ctrl(&ch->st->dev->D, CLOSE_CHANNEL, NULL); break; case ISDN_P_LAPD_TE: pch = get_channel4id(ch->st, ch->nr); if (pch) { mutex_lock(&ch->st->lmutex); list_del(&pch->list); mutex_unlock(&ch->st->lmutex); pch->ctrl(pch, CLOSE_CHANNEL, NULL); pch = ch->st->dev->teimgr; pch->ctrl(pch, CLOSE_CHANNEL, NULL); } else printk(KERN_WARNING "%s: no l2 channel\n", __func__); break; case ISDN_P_LAPD_NT: pch = ch->st->dev->teimgr; if (pch) { pch->ctrl(pch, CLOSE_CHANNEL, NULL); } else printk(KERN_WARNING "%s: no l2 channel\n", __func__); break; default: break; } return; } void delete_stack(struct mISDNdevice *dev) { struct mISDNstack *st = dev->D.st; DECLARE_COMPLETION_ONSTACK(done); if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s)\n", __func__, dev_name(&st->dev->dev)); if (dev->teimgr) delete_teimanager(dev->teimgr); if (st->thread) { if (st->notify) { printk(KERN_WARNING "%s: notifier in use\n", __func__); complete(st->notify); } st->notify = &done; test_and_set_bit(mISDN_STACK_ABORT, &st->status); test_and_set_bit(mISDN_STACK_WAKEUP, &st->status); wake_up_interruptible(&st->workq); wait_for_completion(&done); } if (!list_empty(&st->layer2)) printk(KERN_WARNING "%s: layer2 list not empty\n", __func__); if (!hlist_empty(&st->l1sock.head)) printk(KERN_WARNING "%s: layer1 list not empty\n", __func__); kfree(st); } void mISDN_initstack(u_int *dp) { debug = dp; }
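/*
 * Hedged sketch of the worker pattern mISDNStackd() implements above: a
 * kthread sleeps on a waitqueue until a "work" bit is set, drains an skb
 * queue, and exits when an "abort" bit is raised. It deliberately omits the
 * stop/restart and statistics handling of the real thread; every demo_*
 * name is invented for illustration.
 */
#include <linux/bitops.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/skbuff.h>
#include <linux/wait.h>

#define DEMO_WORK	0
#define DEMO_ABORT	1

struct demo_stack {
	struct sk_buff_head	msgq;
	wait_queue_head_t	workq;
	unsigned long		status;
	struct task_struct	*thread;
};

static int demo_stackd(void *data)
{
	struct demo_stack *st = data;
	struct sk_buff *skb;

	for (;;) {
		while ((skb = skb_dequeue(&st->msgq)))
			kfree_skb(skb);	/* the real thread dispatches to a layer here */

		clear_bit(DEMO_WORK, &st->status);
		if (test_bit(DEMO_ABORT, &st->status))
			break;
		wait_event_interruptible(st->workq,
					 st->status & (BIT(DEMO_WORK) | BIT(DEMO_ABORT)));
	}
	skb_queue_purge(&st->msgq);
	return 0;
}

/* Producer side, mirroring _queue_message(): queue, flag work, wake thread. */
static void demo_queue(struct demo_stack *st, struct sk_buff *skb)
{
	skb_queue_tail(&st->msgq, skb);
	set_bit(DEMO_WORK, &st->status);
	wake_up_interruptible(&st->workq);
}

static int demo_start(struct demo_stack *st)
{
	skb_queue_head_init(&st->msgq);
	init_waitqueue_head(&st->workq);
	st->status = 0;
	st->thread = kthread_run(demo_stackd, st, "demo_stackd");
	return PTR_ERR_OR_ZERO(st->thread);
}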
// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: IP/IP protocol decoder. * * Authors: * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 * * Fixes: * Alan Cox : Merged and made usable non modular (its so tiny its silly as * a module taking up 2 pages). * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph) * to keep ip_forward happy. * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8). * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL * David Woodhouse : Perform some basic ICMP handling. * IPIP Routing without decapsulation. * Carlos Picoto : GRE over IP support * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c. * I do not want to merge them together.
*/ /* tunnel.c: an IP tunnel driver The purpose of this driver is to provide an IP tunnel through which you can tunnel network traffic transparently across subnets. This was written by looking at Nick Holloway's dummy driver Thanks for the great code! -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 Minor tweaks: Cleaned up the code a little and added some pre-1.3.0 tweaks. dev->hard_header/hard_header_len changed to use no headers. Comments/bracketing tweaked. Made the tunnels use dev->name not tunnel: when error reporting. Added tx_dropped stat -Alan Cox (alan@lxorguk.ukuu.org.uk) 21 March 95 Reworked: Changed to tunnel to destination gateway in addition to the tunnel's pointopoint address Almost completely rewritten Note: There is currently no firewall or ICMP handling done. -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96 */ /* Things I wish I had known when writing the tunnel driver: When the tunnel_xmit() function is called, the skb contains the packet to be sent (plus a great deal of extra info), and dev contains the tunnel device that _we_ are. When we are passed a packet, we are expected to fill in the source address with our source IP address. What is the proper way to allocate, copy and free a buffer? After you allocate it, it is a "0 length" chunk of memory starting at zero. If you want to add headers to the buffer later, you'll have to call "skb_reserve(skb, amount)" with the amount of memory you want reserved. Then, you call "skb_put(skb, amount)" with the amount of space you want in the buffer. skb_put() returns a pointer to the top (#0) of that buffer. skb->len is set to the amount of space you have "allocated" with skb_put(). You can then write up to skb->len bytes to that buffer. If you need more, you can call skb_put() again with the additional amount of space you need. You can find out how much more space you can allocate by calling "skb_tailroom(skb)". Now, to add header space, call "skb_push(skb, header_len)". This creates space at the beginning of the buffer and returns a pointer to this new space. If later you need to strip a header from a buffer, call "skb_pull(skb, header_len)". skb_headroom() will return how much space is left at the top of the buffer (before the main data). Remember, this headroom space must be reserved before the skb_put() function is called. */ /* This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c For comments look at net/ipv4/ip_gre.c --ANK */ #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/netfilter_ipv4.h> #include <linux/if_ether.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/ip_tunnels.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/dst_metadata.h> static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static unsigned int ipip_net_id __read_mostly; static int ipip_tunnel_init(struct net_device *dev); static struct rtnl_link_ops ipip_link_ops __read_mostly; static int ipip_err(struct sk_buff *skb, u32 info) { /* All the routers (except for Linux) return only * 8 bytes of packet payload. 
It means, that precise relaying of * ICMP in the real Internet is absolutely infeasible. */ struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); const struct iphdr *iph = (const struct iphdr *)skb->data; IP_TUNNEL_DECLARE_FLAGS(flags) = { }; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; int err = 0; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); t = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->daddr, iph->saddr, 0); if (!t) { err = -ENOENT; goto out; } switch (type) { case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: /* Impossible event. */ goto out; default: /* All others are translated to HOST_UNREACH. * rfc2003 contains "deep thoughts" about NET_UNREACH, * I believe they are just ether pollution. --ANK */ break; } break; case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) goto out; break; case ICMP_REDIRECT: break; default: goto out; } if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol); goto out; } if (type == ICMP_REDIRECT) { ipv4_redirect(skb, net, t->parms.link, iph->protocol); goto out; } if (t->parms.iph.daddr == 0) { err = -ENOENT; goto out; } if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; out: return err; } static const struct tnl_ptk_info ipip_tpi = { /* no tunnel info required for ipip. */ .proto = htons(ETH_P_IP), }; #if IS_ENABLED(CONFIG_MPLS) static const struct tnl_ptk_info mplsip_tpi = { /* no tunnel info required for mplsip. */ .proto = htons(ETH_P_MPLS_UC), }; #endif static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *tun_dst = NULL; struct ip_tunnel *tunnel; const struct iphdr *iph; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); iph = ip_hdr(skb); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, 0); if (tunnel) { const struct tnl_ptk_info *tpi; if (tunnel->parms.iph.protocol != ipproto && tunnel->parms.iph.protocol != 0) goto drop; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; #if IS_ENABLED(CONFIG_MPLS) if (ipproto == IPPROTO_MPLS) tpi = &mplsip_tpi; else #endif tpi = &ipip_tpi; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (tunnel->collect_md) { ip_tunnel_flags_zero(flags); tun_dst = ip_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) return 0; ip_tunnel_md_udp_encap(skb, &tun_dst->u.tun_info); } skb_reset_mac_header(skb); return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); } return -1; drop: kfree_skb(skb); return 0; } static int ipip_rcv(struct sk_buff *skb) { return ipip_tunnel_rcv(skb, IPPROTO_IPIP); } #if IS_ENABLED(CONFIG_MPLS) static int mplsip_rcv(struct sk_buff *skb) { return ipip_tunnel_rcv(skb, IPPROTO_MPLS); } #endif /* * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. 
*/ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; u8 ipproto; if (!pskb_inet_may_pull(skb)) goto tx_error; switch (skb->protocol) { case htons(ETH_P_IP): ipproto = IPPROTO_IPIP; break; #if IS_ENABLED(CONFIG_MPLS) case htons(ETH_P_MPLS_UC): ipproto = IPPROTO_MPLS; break; #endif default: goto tx_error; } if (tiph->protocol != ipproto && tiph->protocol != 0) goto tx_error; if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) goto tx_error; skb_set_inner_ipproto(skb, ipproto); if (tunnel->collect_md) ip_md_tunnel_xmit(skb, dev, ipproto, 0); else ip_tunnel_xmit(skb, dev, tiph, ipproto); return NETDEV_TX_OK; tx_error: kfree_skb(skb); DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto) { switch (ipproto) { case 0: case IPPROTO_IPIP: #if IS_ENABLED(CONFIG_MPLS) case IPPROTO_MPLS: #endif return true; } return false; } static int ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || !ipip_tunnel_ioctl_verify_protocol(p->iph.protocol) || p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF))) return -EINVAL; } p->i_key = p->o_key = 0; ip_tunnel_flags_zero(p->i_flags); ip_tunnel_flags_zero(p->o_flags); return ip_tunnel_ctl(dev, p, cmd); } static int ipip_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path) { struct ip_tunnel *tunnel = netdev_priv(ctx->dev); const struct iphdr *tiph = &tunnel->parms.iph; struct rtable *rt; rt = ip_route_output(dev_net(ctx->dev), tiph->daddr, 0, 0, 0, RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) return PTR_ERR(rt); path->type = DEV_PATH_TUN; path->tun.src_v4.s_addr = tiph->saddr; path->tun.dst_v4.s_addr = tiph->daddr; path->tun.l3_proto = IPPROTO_IPIP; path->dev = ctx->dev; ctx->dev = rt->dst.dev; ip_rt_put(rt); return 0; } static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = ipip_tunnel_xmit, .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip_tunnel_ctl, .ndo_fill_forward_path = ipip_fill_forward_path, }; #define IPIP_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static void ipip_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipip_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_TUNNEL; dev->flags = IFF_NOARP; dev->addr_len = 4; dev->lltx = true; netif_keep_dst(dev); dev->features |= IPIP_FEATURES; dev->hw_features |= IPIP_FEATURES; ip_tunnel_setup(dev, ipip_net_id); } static int ipip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); __dev_addr_set(dev, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); tunnel->tun_hlen = 0; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; return ip_tunnel_init(dev); } static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u8 proto; if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (proto != IPPROTO_IPIP && proto != IPPROTO_MPLS && proto != 0) return -EINVAL; return 0; } static void ipip_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm_kern 
*parms, bool *collect_md, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); parms->iph.version = 4; parms->iph.protocol = IPPROTO_IPIP; parms->iph.ihl = 5; *collect_md = false; if (!data) return; ip_tunnel_netlink_parms(data, parms); if (data[IFLA_IPTUN_COLLECT_METADATA]) *collect_md = true; if (data[IFLA_IPTUN_FWMARK]) *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } static int ipip_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } ipip_netlink_parms(data, &p, &t->collect_md, &fwmark); return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p, fwmark); } static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_encap ipencap; struct ip_tunnel_parm_kern p; bool collect_md; __u32 fwmark = t->fwmark; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } ipip_netlink_parms(data, &p, &collect_md, &fwmark); if (collect_md) return -EINVAL; if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) return -EINVAL; return ip_tunnel_changelink(dev, tb, &p, fwmark); } static size_t ipip_get_size(const struct net_device *dev) { return /* IFLA_IPTUN_LINK */ nla_total_size(4) + /* IFLA_IPTUN_LOCAL */ nla_total_size(4) + /* IFLA_IPTUN_REMOTE */ nla_total_size(4) + /* IFLA_IPTUN_TTL */ nla_total_size(1) + /* IFLA_IPTUN_TOS */ nla_total_size(1) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + /* IFLA_IPTUN_PMTUDISC */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + 0; } static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_parm_kern *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF))) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; if (tunnel->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 }, [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 }, 
[IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ipip_link_ops __read_mostly = { .kind = "ipip", .maxtype = IFLA_IPTUN_MAX, .policy = ipip_policy, .priv_size = sizeof(struct ip_tunnel), .setup = ipip_tunnel_setup, .validate = ipip_tunnel_validate, .newlink = ipip_newlink, .changelink = ipip_changelink, .dellink = ip_tunnel_dellink, .get_size = ipip_get_size, .fill_info = ipip_fill_info, .get_link_net = ip_tunnel_get_link_net, }; static struct xfrm_tunnel ipip_handler __read_mostly = { .handler = ipip_rcv, .err_handler = ipip_err, .priority = 1, }; #if IS_ENABLED(CONFIG_MPLS) static struct xfrm_tunnel mplsip_handler __read_mostly = { .handler = mplsip_rcv, .err_handler = ipip_err, .priority = 1, }; #endif static int __net_init ipip_init_net(struct net *net) { return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } static void __net_exit ipip_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { ip_tunnel_delete_net(net, ipip_net_id, &ipip_link_ops, dev_to_kill); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, .exit_rtnl = ipip_exit_rtnl, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; static int __init ipip_init(void) { int err; pr_info("ipip: IPv4 and MPLS over IPv4 tunneling driver\n"); err = register_pernet_device(&ipip_net_ops); if (err < 0) return err; err = xfrm4_tunnel_register(&ipip_handler, AF_INET); if (err < 0) { pr_info("%s: can't register tunnel\n", __func__); goto xfrm_tunnel_ipip_failed; } #if IS_ENABLED(CONFIG_MPLS) err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS); if (err < 0) { pr_info("%s: can't register tunnel\n", __func__); goto xfrm_tunnel_mplsip_failed; } #endif err = rtnl_link_register(&ipip_link_ops); if (err < 0) goto rtnl_link_failed; out: return err; rtnl_link_failed: #if IS_ENABLED(CONFIG_MPLS) xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); xfrm_tunnel_mplsip_failed: #endif xfrm4_tunnel_deregister(&ipip_handler, AF_INET); xfrm_tunnel_ipip_failed: unregister_pernet_device(&ipip_net_ops); goto out; } static void __exit ipip_fini(void) { rtnl_link_unregister(&ipip_link_ops); if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) pr_info("%s: can't deregister tunnel\n", __func__); #if IS_ENABLED(CONFIG_MPLS) if (xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS)) pr_info("%s: can't deregister tunnel\n", __func__); #endif unregister_pernet_device(&ipip_net_ops); } module_init(ipip_init); module_exit(ipip_fini); MODULE_DESCRIPTION("IP/IP protocol decoder library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ipip"); MODULE_ALIAS_NETDEV("tunl0");
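/*
 * Hedged sketch of the skb buffer discipline described in the long comment
 * near the top of this file: reserve headroom first, skb_put() the payload,
 * then skb_push() a header in front of it. demo_build_skb() is an invented
 * helper, not part of ipip.c.
 */
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/skbuff.h>
#include <linux/string.h>

static struct sk_buff *demo_build_skb(const void *payload, unsigned int len)
{
	struct sk_buff *skb;
	struct iphdr *iph;

	skb = alloc_skb(sizeof(struct iphdr) + len, GFP_ATOMIC);
	if (!skb)
		return NULL;

	/* Headroom must be reserved before any skb_put(). */
	skb_reserve(skb, sizeof(struct iphdr));

	/* Append the payload; skb->len grows by 'len'. */
	skb_put_data(skb, payload, len);

	/* Prepend the outer header into the reserved headroom. */
	iph = skb_push(skb, sizeof(struct iphdr));
	memset(iph, 0, sizeof(*iph));
	iph->version	= 4;
	iph->ihl	= 5;
	iph->protocol	= IPPROTO_IPIP;

	return skb;
}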
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM pagemap #if !defined(_TRACE_PAGEMAP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PAGEMAP_H #include <linux/tracepoint.h> #include <linux/mm.h> #define PAGEMAP_MAPPED 0x0001u #define PAGEMAP_ANONYMOUS 0x0002u #define PAGEMAP_FILE 0x0004u #define PAGEMAP_SWAPCACHE 0x0008u #define PAGEMAP_SWAPBACKED 0x0010u #define PAGEMAP_MAPPEDDISK 0x0020u #define PAGEMAP_BUFFERS 0x0040u #define trace_pagemap_flags(folio) ( \ (folio_test_anon(folio) ? PAGEMAP_ANONYMOUS : PAGEMAP_FILE) | \ (folio_mapped(folio) ? PAGEMAP_MAPPED : 0) | \ (folio_test_swapcache(folio) ? PAGEMAP_SWAPCACHE : 0) | \ (folio_test_swapbacked(folio) ? PAGEMAP_SWAPBACKED : 0) | \ (folio_test_mappedtodisk(folio) ? PAGEMAP_MAPPEDDISK : 0) | \ (folio_test_private(folio) ? PAGEMAP_BUFFERS : 0) \ ) TRACE_EVENT(mm_lru_insertion, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(struct folio *, folio ) __field(unsigned long, pfn ) __field(enum lru_list, lru ) __field(unsigned long, flags ) ), TP_fast_assign( __entry->folio = folio; __entry->pfn = folio_pfn(folio); __entry->lru = folio_lru_list(folio); __entry->flags = trace_pagemap_flags(folio); ), /* Flag format is based on page-types.c formatting for pagemap */ TP_printk("folio=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s", __entry->folio, __entry->pfn, __entry->lru, __entry->flags & PAGEMAP_MAPPED ? "M" : " ", __entry->flags & PAGEMAP_ANONYMOUS ? "a" : "f", __entry->flags & PAGEMAP_SWAPCACHE ? "s" : " ", __entry->flags & PAGEMAP_SWAPBACKED ? "b" : " ", __entry->flags & PAGEMAP_MAPPEDDISK ? "d" : " ", __entry->flags & PAGEMAP_BUFFERS ? "B" : " ") ); TRACE_EVENT(mm_lru_activate, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(struct folio *, folio ) __field(unsigned long, pfn ) ), TP_fast_assign( __entry->folio = folio; __entry->pfn = folio_pfn(folio); ), TP_printk("folio=%p pfn=0x%lx", __entry->folio, __entry->pfn) ); #endif /* _TRACE_PAGEMAP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
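/*
 * Hedged usage note: TRACE_EVENT(mm_lru_insertion, ...) above generates a
 * trace_mm_lru_insertion() call site helper. A caller in mm code would emit
 * the event roughly like this (the surrounding function is invented):
 */
#include <linux/mm.h>
#include <trace/events/pagemap.h>

static void demo_note_lru_insertion(struct folio *folio)
{
	/* Expands to a static-branch-guarded hook; near-zero cost while the
	 * tracepoint is disabled. */
	trace_mm_lru_insertion(folio);
}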
/* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* Bluetooth HCI sockets. */ #include <linux/compat.h> #include <linux/export.h> #include <linux/utsname.h> #include <linux/sched.h> #include <linux/unaligned.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/hci_mon.h> #include <net/bluetooth/mgmt.h> #include "mgmt_util.h" static LIST_HEAD(mgmt_chan_list); static DEFINE_MUTEX(mgmt_chan_list_lock); static DEFINE_IDA(sock_cookie_ida); static atomic_t monitor_promisc = ATOMIC_INIT(0); /* ----- HCI socket interface ----- */ /* Socket info */ #define hci_pi(sk) ((struct hci_pinfo *) sk) struct hci_pinfo { struct bt_sock bt; struct hci_dev *hdev; struct hci_filter filter; __u8 cmsg_mask; unsigned short channel; unsigned long flags; __u32 cookie; char comm[TASK_COMM_LEN]; __u16 mtu; }; static struct hci_dev *hci_hdev_from_sock(struct sock *sk) { struct hci_dev *hdev = hci_pi(sk)->hdev; if (!hdev) return ERR_PTR(-EBADFD); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) return ERR_PTR(-EPIPE); return hdev; } void hci_sock_set_flag(struct sock *sk, int nr) { set_bit(nr, &hci_pi(sk)->flags); } void hci_sock_clear_flag(struct sock *sk, int nr) { clear_bit(nr, &hci_pi(sk)->flags); } int hci_sock_test_flag(struct sock *sk, int nr) { return test_bit(nr, &hci_pi(sk)->flags); } unsigned short hci_sock_get_channel(struct sock *sk) { return hci_pi(sk)->channel; } u32 hci_sock_get_cookie(struct sock *sk) { return hci_pi(sk)->cookie; } static bool hci_sock_gen_cookie(struct sock *sk) { int id = hci_pi(sk)->cookie; if (!id) { id = ida_alloc_min(&sock_cookie_ida, 1, GFP_KERNEL); if (id < 0) id = 0xffffffff; hci_pi(sk)->cookie = id; get_task_comm(hci_pi(sk)->comm, current); return true; } return false; } static void hci_sock_free_cookie(struct sock *sk) { int id = hci_pi(sk)->cookie; if (id) { hci_pi(sk)->cookie = 0; ida_free(&sock_cookie_ida, id); } } static inline int hci_test_bit(int nr, const void *addr) { return *((const __u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31)); } /* Security filter */ #define HCI_SFLT_MAX_OGF 5 struct hci_sec_filter { __u32 type_mask; __u32 event_mask[2]; __u32 ocf_mask[HCI_SFLT_MAX_OGF + 1][4]; }; static const struct hci_sec_filter hci_sec_filter = { /* Packet types */ 0x10, /* Events */ { 0x1000d9fe, 0x0000b00c }, /* Commands */ { { 0x0 }, /* OGF_LINK_CTL */ { 0xbe000006, 0x00000001, 0x00000000, 0x00 }, /* OGF_LINK_POLICY */ { 0x00005200, 0x00000000, 0x00000000, 0x00 }, /* OGF_HOST_CTL */ { 0xaab00200, 0x2b402aaa, 0x05220154, 0x00 }, /* OGF_INFO_PARAM */ { 0x000002be, 0x00000000, 0x00000000, 0x00 }, /* OGF_STATUS_PARAM */ { 0x000000ea, 0x00000000, 0x00000000, 0x00 } } }; static struct bt_sock_list hci_sk_list = { .lock = __RW_LOCK_UNLOCKED(hci_sk_list.lock) }; static bool is_filtered_packet(struct sock *sk, struct sk_buff *skb) { struct hci_filter *flt; int flt_type, flt_event; /* Apply filter */ flt = &hci_pi(sk)->filter; flt_type 
= hci_skb_pkt_type(skb) & HCI_FLT_TYPE_BITS; if (!test_bit(flt_type, &flt->type_mask)) return true; /* Extra filter for event packets only */ if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT) return false; flt_event = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS); if (!hci_test_bit(flt_event, &flt->event_mask)) return true; /* Check filter only when opcode is set */ if (!flt->opcode) return false; if (flt_event == HCI_EV_CMD_COMPLETE && flt->opcode != get_unaligned((__le16 *)(skb->data + 3))) return true; if (flt_event == HCI_EV_CMD_STATUS && flt->opcode != get_unaligned((__le16 *)(skb->data + 4))) return true; return false; } /* Send frame to RAW socket */ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) { struct sock *sk; struct sk_buff *skb_copy = NULL; BT_DBG("hdev %p len %d", hdev, skb->len); read_lock(&hci_sk_list.lock); sk_for_each(sk, &hci_sk_list.head) { struct sk_buff *nskb; if (sk->sk_state != BT_BOUND || hci_pi(sk)->hdev != hdev) continue; /* Don't send frame to the socket it came from */ if (skb->sk == sk) continue; if (hci_pi(sk)->channel == HCI_CHANNEL_RAW) { if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT && hci_skb_pkt_type(skb) != HCI_EVENT_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) continue; if (is_filtered_packet(sk, skb)) continue; } else if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { if (!bt_cb(skb)->incoming) continue; if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && hci_skb_pkt_type(skb) != HCI_ISODATA_PKT && hci_skb_pkt_type(skb) != HCI_DRV_PKT) continue; } else { /* Don't send frame to other channel types */ continue; } if (!skb_copy) { /* Create a private copy with headroom */ skb_copy = __pskb_copy_fclone(skb, 1, GFP_ATOMIC, true); if (!skb_copy) continue; /* Put type byte before the data */ memcpy(skb_push(skb_copy, 1), &hci_skb_pkt_type(skb), 1); } nskb = skb_clone(skb_copy, GFP_ATOMIC); if (!nskb) continue; if (sock_queue_rcv_skb(sk, nskb)) kfree_skb(nskb); } read_unlock(&hci_sk_list.lock); kfree_skb(skb_copy); } static void hci_sock_copy_creds(struct sock *sk, struct sk_buff *skb) { struct scm_creds *creds; if (!sk || WARN_ON(!skb)) return; creds = &bt_cb(skb)->creds; /* Check if peer credentials is set */ if (!sk->sk_peer_pid) { /* Check if parent peer credentials is set */ if (bt_sk(sk)->parent && bt_sk(sk)->parent->sk_peer_pid) sk = bt_sk(sk)->parent; else return; } /* Check if scm_creds already set */ if (creds->pid == pid_vnr(sk->sk_peer_pid)) return; memset(creds, 0, sizeof(*creds)); creds->pid = pid_vnr(sk->sk_peer_pid); if (sk->sk_peer_cred) { creds->uid = sk->sk_peer_cred->uid; creds->gid = sk->sk_peer_cred->gid; } } static struct sk_buff *hci_skb_clone(struct sk_buff *skb) { struct sk_buff *nskb; if (!skb) return NULL; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return NULL; hci_sock_copy_creds(skb->sk, nskb); return nskb; } /* Send frame to sockets with specific channel */ static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb, int flag, struct sock *skip_sk) { struct sock *sk; BT_DBG("channel %u len %d", channel, skb->len); sk_for_each(sk, &hci_sk_list.head) { struct sk_buff *nskb; /* Ignore socket without the flag set */ if (!hci_sock_test_flag(sk, flag)) continue; /* Skip the original socket */ if (sk == skip_sk) continue; if (sk->sk_state != BT_BOUND) continue; if (hci_pi(sk)->channel != channel) continue; nskb = hci_skb_clone(skb); if 
(!nskb) continue; if (sock_queue_rcv_skb(sk, nskb)) kfree_skb(nskb); } } void hci_send_to_channel(unsigned short channel, struct sk_buff *skb, int flag, struct sock *skip_sk) { read_lock(&hci_sk_list.lock); __hci_send_to_channel(channel, skb, flag, skip_sk); read_unlock(&hci_sk_list.lock); } /* Send frame to monitor socket */ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) { struct sk_buff *skb_copy = NULL; struct hci_mon_hdr *hdr; __le16 opcode; if (!atomic_read(&monitor_promisc)) return; BT_DBG("hdev %p len %d", hdev, skb->len); switch (hci_skb_pkt_type(skb)) { case HCI_COMMAND_PKT: opcode = cpu_to_le16(HCI_MON_COMMAND_PKT); break; case HCI_EVENT_PKT: opcode = cpu_to_le16(HCI_MON_EVENT_PKT); break; case HCI_ACLDATA_PKT: if (bt_cb(skb)->incoming) opcode = cpu_to_le16(HCI_MON_ACL_RX_PKT); else opcode = cpu_to_le16(HCI_MON_ACL_TX_PKT); break; case HCI_SCODATA_PKT: if (bt_cb(skb)->incoming) opcode = cpu_to_le16(HCI_MON_SCO_RX_PKT); else opcode = cpu_to_le16(HCI_MON_SCO_TX_PKT); break; case HCI_ISODATA_PKT: if (bt_cb(skb)->incoming) opcode = cpu_to_le16(HCI_MON_ISO_RX_PKT); else opcode = cpu_to_le16(HCI_MON_ISO_TX_PKT); break; case HCI_DRV_PKT: if (bt_cb(skb)->incoming) opcode = cpu_to_le16(HCI_MON_DRV_RX_PKT); else opcode = cpu_to_le16(HCI_MON_DRV_TX_PKT); break; case HCI_DIAG_PKT: opcode = cpu_to_le16(HCI_MON_VENDOR_DIAG); break; default: return; } /* Create a private copy with headroom */ skb_copy = __pskb_copy_fclone(skb, HCI_MON_HDR_SIZE, GFP_ATOMIC, true); if (!skb_copy) return; hci_sock_copy_creds(skb->sk, skb_copy); /* Put header before the data */ hdr = skb_push(skb_copy, HCI_MON_HDR_SIZE); hdr->opcode = opcode; hdr->index = cpu_to_le16(hdev->id); hdr->len = cpu_to_le16(skb->len); hci_send_to_channel(HCI_CHANNEL_MONITOR, skb_copy, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb_copy); } void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event, void *data, u16 data_len, ktime_t tstamp, int flag, struct sock *skip_sk) { struct sock *sk; __le16 index; if (hdev) index = cpu_to_le16(hdev->id); else index = cpu_to_le16(MGMT_INDEX_NONE); read_lock(&hci_sk_list.lock); sk_for_each(sk, &hci_sk_list.head) { struct hci_mon_hdr *hdr; struct sk_buff *skb; if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL) continue; /* Ignore socket without the flag set */ if (!hci_sock_test_flag(sk, flag)) continue; /* Skip the original socket */ if (sk == skip_sk) continue; skb = bt_skb_alloc(6 + data_len, GFP_ATOMIC); if (!skb) continue; put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); put_unaligned_le16(event, skb_put(skb, 2)); if (data) skb_put_data(skb, data, data_len); skb->tstamp = tstamp; hdr = skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT); hdr->index = index; hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); __hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } read_unlock(&hci_sk_list.lock); } static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) { struct hci_mon_hdr *hdr; struct hci_mon_new_index *ni; struct hci_mon_index_info *ii; struct sk_buff *skb; __le16 opcode; switch (event) { case HCI_DEV_REG: skb = bt_skb_alloc(HCI_MON_NEW_INDEX_SIZE, GFP_ATOMIC); if (!skb) return NULL; ni = skb_put(skb, HCI_MON_NEW_INDEX_SIZE); ni->type = 0x00; /* Old hdev->dev_type */ ni->bus = hdev->bus; bacpy(&ni->bdaddr, &hdev->bdaddr); memcpy_and_pad(ni->name, sizeof(ni->name), hdev->name, strnlen(hdev->name, sizeof(ni->name)), '\0'); opcode = cpu_to_le16(HCI_MON_NEW_INDEX); break; case HCI_DEV_UNREG: 
skb = bt_skb_alloc(0, GFP_ATOMIC); if (!skb) return NULL; opcode = cpu_to_le16(HCI_MON_DEL_INDEX); break; case HCI_DEV_SETUP: if (hdev->manufacturer == 0xffff) return NULL; fallthrough; case HCI_DEV_UP: skb = bt_skb_alloc(HCI_MON_INDEX_INFO_SIZE, GFP_ATOMIC); if (!skb) return NULL; ii = skb_put(skb, HCI_MON_INDEX_INFO_SIZE); bacpy(&ii->bdaddr, &hdev->bdaddr); ii->manufacturer = cpu_to_le16(hdev->manufacturer); opcode = cpu_to_le16(HCI_MON_INDEX_INFO); break; case HCI_DEV_OPEN: skb = bt_skb_alloc(0, GFP_ATOMIC); if (!skb) return NULL; opcode = cpu_to_le16(HCI_MON_OPEN_INDEX); break; case HCI_DEV_CLOSE: skb = bt_skb_alloc(0, GFP_ATOMIC); if (!skb) return NULL; opcode = cpu_to_le16(HCI_MON_CLOSE_INDEX); break; default: return NULL; } __net_timestamp(skb); hdr = skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = opcode; hdr->index = cpu_to_le16(hdev->id); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; } static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) { struct hci_mon_hdr *hdr; struct sk_buff *skb; u16 format; u8 ver[3]; u32 flags; /* No message needed when cookie is not present */ if (!hci_pi(sk)->cookie) return NULL; switch (hci_pi(sk)->channel) { case HCI_CHANNEL_RAW: format = 0x0000; ver[0] = BT_SUBSYS_VERSION; put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1); break; case HCI_CHANNEL_USER: format = 0x0001; ver[0] = BT_SUBSYS_VERSION; put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1); break; case HCI_CHANNEL_CONTROL: format = 0x0002; mgmt_fill_version_info(ver); break; default: /* No message for unsupported format */ return NULL; } skb = bt_skb_alloc(14 + TASK_COMM_LEN, GFP_ATOMIC); if (!skb) return NULL; hci_sock_copy_creds(sk, skb); flags = hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) ? 0x1 : 0x0; put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); put_unaligned_le16(format, skb_put(skb, 2)); skb_put_data(skb, ver, sizeof(ver)); put_unaligned_le32(flags, skb_put(skb, 4)); skb_put_u8(skb, TASK_COMM_LEN); skb_put_data(skb, hci_pi(sk)->comm, TASK_COMM_LEN); __net_timestamp(skb); hdr = skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_OPEN); if (hci_pi(sk)->hdev) hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id); else hdr->index = cpu_to_le16(HCI_DEV_NONE); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; } static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) { struct hci_mon_hdr *hdr; struct sk_buff *skb; /* No message needed when cookie is not present */ if (!hci_pi(sk)->cookie) return NULL; switch (hci_pi(sk)->channel) { case HCI_CHANNEL_RAW: case HCI_CHANNEL_USER: case HCI_CHANNEL_CONTROL: break; default: /* No message for unsupported format */ return NULL; } skb = bt_skb_alloc(4, GFP_ATOMIC); if (!skb) return NULL; hci_sock_copy_creds(sk, skb); put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); __net_timestamp(skb); hdr = skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_CLOSE); if (hci_pi(sk)->hdev) hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id); else hdr->index = cpu_to_le16(HCI_DEV_NONE); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; } static struct sk_buff *create_monitor_ctrl_command(struct sock *sk, u16 index, u16 opcode, u16 len, const void *buf) { struct hci_mon_hdr *hdr; struct sk_buff *skb; skb = bt_skb_alloc(6 + len, GFP_ATOMIC); if (!skb) return NULL; hci_sock_copy_creds(sk, skb); put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); put_unaligned_le16(opcode, skb_put(skb, 2)); if (buf) skb_put_data(skb, buf, len); __net_timestamp(skb); hdr = 
skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_COMMAND); hdr->index = cpu_to_le16(index); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; } static void __printf(2, 3) send_monitor_note(struct sock *sk, const char *fmt, ...) { size_t len; struct hci_mon_hdr *hdr; struct sk_buff *skb; va_list args; va_start(args, fmt); len = vsnprintf(NULL, 0, fmt, args); va_end(args); skb = bt_skb_alloc(len + 1, GFP_ATOMIC); if (!skb) return; hci_sock_copy_creds(sk, skb); va_start(args, fmt); vsprintf(skb_put(skb, len), fmt, args); *(u8 *)skb_put(skb, 1) = 0; va_end(args); __net_timestamp(skb); hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_SYSTEM_NOTE); hdr->index = cpu_to_le16(HCI_DEV_NONE); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); if (sock_queue_rcv_skb(sk, skb)) kfree_skb(skb); } static void send_monitor_replay(struct sock *sk) { struct hci_dev *hdev; read_lock(&hci_dev_list_lock); list_for_each_entry(hdev, &hci_dev_list, list) { struct sk_buff *skb; skb = create_monitor_event(hdev, HCI_DEV_REG); if (!skb) continue; if (sock_queue_rcv_skb(sk, skb)) kfree_skb(skb); if (!test_bit(HCI_RUNNING, &hdev->flags)) continue; skb = create_monitor_event(hdev, HCI_DEV_OPEN); if (!skb) continue; if (sock_queue_rcv_skb(sk, skb)) kfree_skb(skb); if (test_bit(HCI_UP, &hdev->flags)) skb = create_monitor_event(hdev, HCI_DEV_UP); else if (hci_dev_test_flag(hdev, HCI_SETUP)) skb = create_monitor_event(hdev, HCI_DEV_SETUP); else skb = NULL; if (skb) { if (sock_queue_rcv_skb(sk, skb)) kfree_skb(skb); } } read_unlock(&hci_dev_list_lock); } static void send_monitor_control_replay(struct sock *mon_sk) { struct sock *sk; read_lock(&hci_sk_list.lock); sk_for_each(sk, &hci_sk_list.head) { struct sk_buff *skb; skb = create_monitor_ctrl_open(sk); if (!skb) continue; if (sock_queue_rcv_skb(mon_sk, skb)) kfree_skb(skb); } read_unlock(&hci_sk_list.lock); } /* Generate internal stack event */ static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) { struct hci_event_hdr *hdr; struct hci_ev_stack_internal *ev; struct sk_buff *skb; skb = bt_skb_alloc(HCI_EVENT_HDR_SIZE + sizeof(*ev) + dlen, GFP_ATOMIC); if (!skb) return; hdr = skb_put(skb, HCI_EVENT_HDR_SIZE); hdr->evt = HCI_EV_STACK_INTERNAL; hdr->plen = sizeof(*ev) + dlen; ev = skb_put(skb, sizeof(*ev) + dlen); ev->type = type; memcpy(ev->data, data, dlen); bt_cb(skb)->incoming = 1; __net_timestamp(skb); hci_skb_pkt_type(skb) = HCI_EVENT_PKT; hci_send_to_sock(hdev, skb); kfree_skb(skb); } void hci_sock_dev_event(struct hci_dev *hdev, int event) { BT_DBG("hdev %s event %d", hdev->name, event); if (atomic_read(&monitor_promisc)) { struct sk_buff *skb; /* Send event to monitor */ skb = create_monitor_event(hdev, event); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } } if (event <= HCI_DEV_DOWN) { struct hci_ev_si_device ev; /* Send event to sockets */ ev.event = event; ev.dev_id = hdev->id; hci_si_event(NULL, HCI_EV_SI_DEVICE, sizeof(ev), &ev); } if (event == HCI_DEV_UNREG) { struct sock *sk; /* Wake up sockets using this dead device */ read_lock(&hci_sk_list.lock); sk_for_each(sk, &hci_sk_list.head) { if (hci_pi(sk)->hdev == hdev) { sk->sk_err = EPIPE; sk->sk_state_change(sk); } } read_unlock(&hci_sk_list.lock); } } static struct hci_mgmt_chan *__hci_mgmt_chan_find(unsigned short channel) { struct hci_mgmt_chan *c; list_for_each_entry(c, &mgmt_chan_list, list) { if (c->channel == channel) return c; } return NULL; } static struct 
hci_mgmt_chan *hci_mgmt_chan_find(unsigned short channel) { struct hci_mgmt_chan *c; mutex_lock(&mgmt_chan_list_lock); c = __hci_mgmt_chan_find(channel); mutex_unlock(&mgmt_chan_list_lock); return c; } int hci_mgmt_chan_register(struct hci_mgmt_chan *c) { if (c->channel < HCI_CHANNEL_CONTROL) return -EINVAL; mutex_lock(&mgmt_chan_list_lock); if (__hci_mgmt_chan_find(c->channel)) { mutex_unlock(&mgmt_chan_list_lock); return -EALREADY; } list_add_tail(&c->list, &mgmt_chan_list); mutex_unlock(&mgmt_chan_list_lock); return 0; } EXPORT_SYMBOL(hci_mgmt_chan_register); void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c) { mutex_lock(&mgmt_chan_list_lock); list_del(&c->list); mutex_unlock(&mgmt_chan_list_lock); } EXPORT_SYMBOL(hci_mgmt_chan_unregister); static int hci_sock_release(struct socket *sock) { struct sock *sk = sock->sk; struct hci_dev *hdev; struct sk_buff *skb; BT_DBG("sock %p sk %p", sock, sk); if (!sk) return 0; lock_sock(sk); switch (hci_pi(sk)->channel) { case HCI_CHANNEL_MONITOR: atomic_dec(&monitor_promisc); break; case HCI_CHANNEL_RAW: case HCI_CHANNEL_USER: case HCI_CHANNEL_CONTROL: /* Send event to monitor */ skb = create_monitor_ctrl_close(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } hci_sock_free_cookie(sk); break; } bt_sock_unlink(&hci_sk_list, sk); hdev = hci_pi(sk)->hdev; if (hdev) { if (hci_pi(sk)->channel == HCI_CHANNEL_USER && !hci_dev_test_flag(hdev, HCI_UNREGISTER)) { /* When releasing a user channel exclusive access, * call hci_dev_do_close directly instead of calling * hci_dev_close to ensure the exclusive access will * be released and the controller brought back down. * * The checking of HCI_AUTO_OFF is not needed in this * case since it will have been cleared already when * opening the user channel. * * Make sure to also check that we haven't already * unregistered since all the cleanup will have already * been complete and hdev will get released when we put * below. 
*/ hci_dev_do_close(hdev); hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); mgmt_index_added(hdev); } atomic_dec(&hdev->promisc); hci_dev_put(hdev); } sock_orphan(sk); release_sock(sk); sock_put(sk); return 0; } static int hci_sock_reject_list_add(struct hci_dev *hdev, void __user *arg) { bdaddr_t bdaddr; int err; if (copy_from_user(&bdaddr, arg, sizeof(bdaddr))) return -EFAULT; hci_dev_lock(hdev); err = hci_bdaddr_list_add(&hdev->reject_list, &bdaddr, BDADDR_BREDR); hci_dev_unlock(hdev); return err; } static int hci_sock_reject_list_del(struct hci_dev *hdev, void __user *arg) { bdaddr_t bdaddr; int err; if (copy_from_user(&bdaddr, arg, sizeof(bdaddr))) return -EFAULT; hci_dev_lock(hdev); err = hci_bdaddr_list_del(&hdev->reject_list, &bdaddr, BDADDR_BREDR); hci_dev_unlock(hdev); return err; } /* Ioctls that require bound socket */ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) { struct hci_dev *hdev = hci_hdev_from_sock(sk); if (IS_ERR(hdev)) return PTR_ERR(hdev); if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return -EBUSY; if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return -EOPNOTSUPP; switch (cmd) { case HCISETRAW: if (!capable(CAP_NET_ADMIN)) return -EPERM; return -EOPNOTSUPP; case HCIGETCONNINFO: return hci_get_conn_info(hdev, (void __user *)arg); case HCIGETAUTHINFO: return hci_get_auth_info(hdev, (void __user *)arg); case HCIBLOCKADDR: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_sock_reject_list_add(hdev, (void __user *)arg); case HCIUNBLOCKADDR: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_sock_reject_list_del(hdev, (void __user *)arg); } return -ENOIOCTLCMD; } static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct sock *sk = sock->sk; int err; BT_DBG("cmd %x arg %lx", cmd, arg); /* Make sure the cmd is valid before doing anything */ switch (cmd) { case HCIGETDEVLIST: case HCIGETDEVINFO: case HCIGETCONNLIST: case HCIDEVUP: case HCIDEVDOWN: case HCIDEVRESET: case HCIDEVRESTAT: case HCISETSCAN: case HCISETAUTH: case HCISETENCRYPT: case HCISETPTYPE: case HCISETLINKPOL: case HCISETLINKMODE: case HCISETACLMTU: case HCISETSCOMTU: case HCIINQUIRY: case HCISETRAW: case HCIGETCONNINFO: case HCIGETAUTHINFO: case HCIBLOCKADDR: case HCIUNBLOCKADDR: break; default: return -ENOIOCTLCMD; } lock_sock(sk); if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) { err = -EBADFD; goto done; } /* When calling an ioctl on an unbound raw socket, then ensure * that the monitor gets informed. Ensure that the resulting event * is only send once by checking if the cookie exists or not. The * socket cookie will be only ever generated once for the lifetime * of a given socket. */ if (hci_sock_gen_cookie(sk)) { struct sk_buff *skb; /* Perform careful checks before setting the HCI_SOCK_TRUSTED * flag. Make sure that not only the current task but also * the socket opener has the required capability, since * privileged programs can be tricked into making ioctl calls * on HCI sockets, and the socket should not be marked as * trusted simply because the ioctl caller is privileged. 
*/ if (sk_capable(sk, CAP_NET_ADMIN)) hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); /* Send event to monitor */ skb = create_monitor_ctrl_open(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } } release_sock(sk); switch (cmd) { case HCIGETDEVLIST: return hci_get_dev_list(argp); case HCIGETDEVINFO: return hci_get_dev_info(argp); case HCIGETCONNLIST: return hci_get_conn_list(argp); case HCIDEVUP: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_dev_open(arg); case HCIDEVDOWN: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_dev_close(arg); case HCIDEVRESET: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_dev_reset(arg); case HCIDEVRESTAT: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_dev_reset_stat(arg); case HCISETSCAN: case HCISETAUTH: case HCISETENCRYPT: case HCISETPTYPE: case HCISETLINKPOL: case HCISETLINKMODE: case HCISETACLMTU: case HCISETSCOMTU: if (!capable(CAP_NET_ADMIN)) return -EPERM; return hci_dev_cmd(cmd, argp); case HCIINQUIRY: return hci_inquiry(argp); } lock_sock(sk); err = hci_sock_bound_ioctl(sk, cmd, arg); done: release_sock(sk); return err; } #ifdef CONFIG_COMPAT static int hci_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { switch (cmd) { case HCIDEVUP: case HCIDEVDOWN: case HCIDEVRESET: case HCIDEVRESTAT: return hci_sock_ioctl(sock, cmd, arg); } return hci_sock_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); } #endif static int hci_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_hci haddr; struct sock *sk = sock->sk; struct hci_dev *hdev = NULL; struct sk_buff *skb; int len, err = 0; BT_DBG("sock %p sk %p", sock, sk); if (!addr) return -EINVAL; memset(&haddr, 0, sizeof(haddr)); len = min_t(unsigned int, sizeof(haddr), addr_len); memcpy(&haddr, addr, len); if (haddr.hci_family != AF_BLUETOOTH) return -EINVAL; lock_sock(sk); /* Allow detaching from dead device and attaching to alive device, if * the caller wants to re-bind (instead of close) this socket in * response to hci_sock_dev_event(HCI_DEV_UNREG) notification. */ hdev = hci_pi(sk)->hdev; if (hdev && hci_dev_test_flag(hdev, HCI_UNREGISTER)) { hci_pi(sk)->hdev = NULL; sk->sk_state = BT_OPEN; hci_dev_put(hdev); } hdev = NULL; if (sk->sk_state == BT_BOUND) { err = -EALREADY; goto done; } switch (haddr.hci_channel) { case HCI_CHANNEL_RAW: if (hci_pi(sk)->hdev) { err = -EALREADY; goto done; } if (haddr.hci_dev != HCI_DEV_NONE) { hdev = hci_dev_get(haddr.hci_dev); if (!hdev) { err = -ENODEV; goto done; } atomic_inc(&hdev->promisc); } hci_pi(sk)->channel = haddr.hci_channel; if (!hci_sock_gen_cookie(sk)) { /* In the case when a cookie has already been assigned, * then there has been already an ioctl issued against * an unbound socket and with that triggered an open * notification. Send a close notification first to * allow the state transition to bounded. 
*/ skb = create_monitor_ctrl_close(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } } if (capable(CAP_NET_ADMIN)) hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); hci_pi(sk)->hdev = hdev; /* Send event to monitor */ skb = create_monitor_ctrl_open(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } break; case HCI_CHANNEL_USER: if (hci_pi(sk)->hdev) { err = -EALREADY; goto done; } if (haddr.hci_dev == HCI_DEV_NONE) { err = -EINVAL; goto done; } if (!capable(CAP_NET_ADMIN)) { err = -EPERM; goto done; } hdev = hci_dev_get(haddr.hci_dev); if (!hdev) { err = -ENODEV; goto done; } if (test_bit(HCI_INIT, &hdev->flags) || hci_dev_test_flag(hdev, HCI_SETUP) || hci_dev_test_flag(hdev, HCI_CONFIG) || (!hci_dev_test_flag(hdev, HCI_AUTO_OFF) && test_bit(HCI_UP, &hdev->flags))) { err = -EBUSY; hci_dev_put(hdev); goto done; } if (hci_dev_test_and_set_flag(hdev, HCI_USER_CHANNEL)) { err = -EUSERS; hci_dev_put(hdev); goto done; } hci_dev_lock(hdev); mgmt_index_removed(hdev); hci_dev_unlock(hdev); err = hci_dev_open(hdev->id); if (err) { if (err == -EALREADY) { /* In case the transport is already up and * running, clear the error here. * * This can happen when opening a user * channel and HCI_AUTO_OFF grace period * is still active. */ err = 0; } else { hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); mgmt_index_added(hdev); hci_dev_put(hdev); goto done; } } hci_pi(sk)->channel = haddr.hci_channel; if (!hci_sock_gen_cookie(sk)) { /* In the case when a cookie has already been assigned, * this socket will transition from a raw socket into * a user channel socket. For a clean transition, send * the close notification first. */ skb = create_monitor_ctrl_close(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } } /* The user channel is restricted to CAP_NET_ADMIN * capabilities and with that implicitly trusted. */ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); hci_pi(sk)->hdev = hdev; /* Send event to monitor */ skb = create_monitor_ctrl_open(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } atomic_inc(&hdev->promisc); break; case HCI_CHANNEL_MONITOR: if (haddr.hci_dev != HCI_DEV_NONE) { err = -EINVAL; goto done; } if (!capable(CAP_NET_RAW)) { err = -EPERM; goto done; } hci_pi(sk)->channel = haddr.hci_channel; /* The monitor interface is restricted to CAP_NET_RAW * capabilities and with that implicitly trusted. */ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); send_monitor_note(sk, "Linux version %s (%s)", init_utsname()->release, init_utsname()->machine); send_monitor_note(sk, "Bluetooth subsystem version %u.%u", BT_SUBSYS_VERSION, BT_SUBSYS_REVISION); send_monitor_replay(sk); send_monitor_control_replay(sk); atomic_inc(&monitor_promisc); break; case HCI_CHANNEL_LOGGING: if (haddr.hci_dev != HCI_DEV_NONE) { err = -EINVAL; goto done; } if (!capable(CAP_NET_ADMIN)) { err = -EPERM; goto done; } hci_pi(sk)->channel = haddr.hci_channel; break; default: if (!hci_mgmt_chan_find(haddr.hci_channel)) { err = -EINVAL; goto done; } if (haddr.hci_dev != HCI_DEV_NONE) { err = -EINVAL; goto done; } /* Users with CAP_NET_ADMIN capabilities are allowed * access to all management commands and events. For * untrusted users the interface is restricted and * also only untrusted events are sent. 
*/ if (capable(CAP_NET_ADMIN)) hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); hci_pi(sk)->channel = haddr.hci_channel; /* At the moment the index and unconfigured index events * are enabled unconditionally. Setting them on each * socket when binding keeps this functionality. They * however might be cleared later and then sending of these * events will be disabled, but that is then intentional. * * This also enables generic events that are safe to be * received by untrusted users. Example for such events * are changes to settings, class of device, name etc. */ if (hci_pi(sk)->channel == HCI_CHANNEL_CONTROL) { if (!hci_sock_gen_cookie(sk)) { /* In the case when a cookie has already been * assigned, this socket will transition from * a raw socket into a control socket. To * allow for a clean transition, send the * close notification first. */ skb = create_monitor_ctrl_close(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } } /* Send event to monitor */ skb = create_monitor_ctrl_open(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_OPTION_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_SETTING_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS); } break; } /* Default MTU to HCI_MAX_FRAME_SIZE if not set */ if (!hci_pi(sk)->mtu) hci_pi(sk)->mtu = HCI_MAX_FRAME_SIZE; sk->sk_state = BT_BOUND; done: release_sock(sk); return err; } static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_hci *haddr = (struct sockaddr_hci *)addr; struct sock *sk = sock->sk; struct hci_dev *hdev; int err = 0; BT_DBG("sock %p sk %p", sock, sk); if (peer) return -EOPNOTSUPP; lock_sock(sk); hdev = hci_hdev_from_sock(sk); if (IS_ERR(hdev)) { err = PTR_ERR(hdev); goto done; } haddr->hci_family = AF_BLUETOOTH; haddr->hci_dev = hdev->id; haddr->hci_channel= hci_pi(sk)->channel; err = sizeof(*haddr); done: release_sock(sk); return err; } static void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { __u8 mask = hci_pi(sk)->cmsg_mask; if (mask & HCI_CMSG_DIR) { int incoming = bt_cb(skb)->incoming; put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(incoming), &incoming); } if (mask & HCI_CMSG_TSTAMP) { #ifdef CONFIG_COMPAT struct old_timeval32 ctv; #endif struct __kernel_old_timeval tv; void *data; int len; skb_get_timestamp(skb, &tv); data = &tv; len = sizeof(tv); #ifdef CONFIG_COMPAT if (!COMPAT_USE_64BIT_TIME && (msg->msg_flags & MSG_CMSG_COMPAT)) { ctv.tv_sec = tv.tv_sec; ctv.tv_usec = tv.tv_usec; data = &ctv; len = sizeof(ctv); } #endif put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, len, data); } } static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct scm_cookie scm; struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; unsigned int skblen; BT_DBG("sock %p, sk %p", sock, sk); if (flags & MSG_OOB) return -EOPNOTSUPP; if (hci_pi(sk)->channel == HCI_CHANNEL_LOGGING) return -EOPNOTSUPP; if (sk->sk_state == BT_CLOSED) return 0; skb = skb_recv_datagram(sk, flags, &err); if (!skb) return err; skblen = skb->len; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } skb_reset_transport_header(skb); err = skb_copy_datagram_msg(skb, 0, msg, copied); switch (hci_pi(sk)->channel) { case HCI_CHANNEL_RAW: 
hci_sock_cmsg(sk, msg, skb); break; case HCI_CHANNEL_USER: case HCI_CHANNEL_MONITOR: sock_recv_timestamp(msg, sk, skb); break; default: if (hci_mgmt_chan_find(hci_pi(sk)->channel)) sock_recv_timestamp(msg, sk, skb); break; } memset(&scm, 0, sizeof(scm)); scm.creds = bt_cb(skb)->creds; skb_free_datagram(sk, skb); if (flags & MSG_TRUNC) copied = skblen; scm_recv(sock, msg, &scm, flags); return err ? : copied; } static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk, struct sk_buff *skb) { u8 *cp; struct mgmt_hdr *hdr; u16 opcode, index, len; struct hci_dev *hdev = NULL; const struct hci_mgmt_handler *handler; bool var_len, no_hdev; int err; BT_DBG("got %d bytes", skb->len); if (skb->len < sizeof(*hdr)) return -EINVAL; hdr = (void *)skb->data; opcode = __le16_to_cpu(hdr->opcode); index = __le16_to_cpu(hdr->index); len = __le16_to_cpu(hdr->len); if (len != skb->len - sizeof(*hdr)) { err = -EINVAL; goto done; } if (chan->channel == HCI_CHANNEL_CONTROL) { struct sk_buff *cmd; /* Send event to monitor */ cmd = create_monitor_ctrl_command(sk, index, opcode, len, skb->data + sizeof(*hdr)); if (cmd) { hci_send_to_channel(HCI_CHANNEL_MONITOR, cmd, HCI_SOCK_TRUSTED, NULL); kfree_skb(cmd); } } if (opcode >= chan->handler_count || chan->handlers[opcode].func == NULL) { BT_DBG("Unknown op %u", opcode); err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_UNKNOWN_COMMAND); goto done; } handler = &chan->handlers[opcode]; if (!hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) && !(handler->flags & HCI_MGMT_UNTRUSTED)) { err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_PERMISSION_DENIED); goto done; } if (index != MGMT_INDEX_NONE) { hdev = hci_dev_get(index); if (!hdev) { err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_INVALID_INDEX); goto done; } if (hci_dev_test_flag(hdev, HCI_SETUP) || hci_dev_test_flag(hdev, HCI_CONFIG) || hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_INVALID_INDEX); goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !(handler->flags & HCI_MGMT_UNCONFIGURED)) { err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_INVALID_INDEX); goto done; } } if (!(handler->flags & HCI_MGMT_HDEV_OPTIONAL)) { no_hdev = (handler->flags & HCI_MGMT_NO_HDEV); if (no_hdev != !hdev) { err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_INVALID_INDEX); goto done; } } var_len = (handler->flags & HCI_MGMT_VAR_LEN); if ((var_len && len < handler->data_len) || (!var_len && len != handler->data_len)) { err = mgmt_cmd_status(sk, index, opcode, MGMT_STATUS_INVALID_PARAMS); goto done; } if (hdev && chan->hdev_init) chan->hdev_init(sk, hdev); cp = skb->data + sizeof(*hdr); err = handler->func(sk, hdev, cp, len); if (err < 0) goto done; err = skb->len; done: if (hdev) hci_dev_put(hdev); return err; } static int hci_logging_frame(struct sock *sk, struct sk_buff *skb, unsigned int flags) { struct hci_mon_hdr *hdr; struct hci_dev *hdev; u16 index; int err; /* The logging frame consists at minimum of the standard header, * the priority byte, the ident length byte and at least one string * terminator NUL byte. Anything shorter are invalid packets. */ if (skb->len < sizeof(*hdr) + 3) return -EINVAL; hdr = (void *)skb->data; if (__le16_to_cpu(hdr->len) != skb->len - sizeof(*hdr)) return -EINVAL; if (__le16_to_cpu(hdr->opcode) == 0x0000) { __u8 priority = skb->data[sizeof(*hdr)]; __u8 ident_len = skb->data[sizeof(*hdr) + 1]; /* Only the priorities 0-7 are valid and with that any other * value results in an invalid packet. 
* * The priority byte is followed by an ident length byte and * the NUL terminated ident string. Check that the ident * length is not overflowing the packet and also that the * ident string itself is NUL terminated. In case the ident * length is zero, the length value actually doubles as NUL * terminator identifier. * * The message follows the ident string (if present) and * must be NUL terminated. Otherwise it is not a valid packet. */ if (priority > 7 || skb->data[skb->len - 1] != 0x00 || ident_len > skb->len - sizeof(*hdr) - 3 || skb->data[sizeof(*hdr) + ident_len + 1] != 0x00) return -EINVAL; } else { return -EINVAL; } index = __le16_to_cpu(hdr->index); if (index != MGMT_INDEX_NONE) { hdev = hci_dev_get(index); if (!hdev) return -ENODEV; } else { hdev = NULL; } hdr->opcode = cpu_to_le16(HCI_MON_USER_LOGGING); hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); err = skb->len; if (hdev) hci_dev_put(hdev); return err; } static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct hci_mgmt_chan *chan; struct hci_dev *hdev; struct sk_buff *skb; int err; const unsigned int flags = msg->msg_flags; BT_DBG("sock %p sk %p", sock, sk); if (flags & MSG_OOB) return -EOPNOTSUPP; if (flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL | MSG_ERRQUEUE | MSG_CMSG_COMPAT)) return -EINVAL; if (len < 4 || len > hci_pi(sk)->mtu) return -EINVAL; skb = bt_skb_sendmsg(sk, msg, len, len, 0, 0); if (IS_ERR(skb)) return PTR_ERR(skb); lock_sock(sk); switch (hci_pi(sk)->channel) { case HCI_CHANNEL_RAW: case HCI_CHANNEL_USER: break; case HCI_CHANNEL_MONITOR: err = -EOPNOTSUPP; goto drop; case HCI_CHANNEL_LOGGING: err = hci_logging_frame(sk, skb, flags); goto drop; default: mutex_lock(&mgmt_chan_list_lock); chan = __hci_mgmt_chan_find(hci_pi(sk)->channel); if (chan) err = hci_mgmt_cmd(chan, sk, skb); else err = -EINVAL; mutex_unlock(&mgmt_chan_list_lock); goto drop; } hdev = hci_hdev_from_sock(sk); if (IS_ERR(hdev)) { err = PTR_ERR(hdev); goto drop; } if (!test_bit(HCI_UP, &hdev->flags)) { err = -ENETDOWN; goto drop; } hci_skb_pkt_type(skb) = skb->data[0]; skb_pull(skb, 1); if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { /* No permission check is needed for user channel * since that gets enforced when binding the socket. * * However check that the packet type is valid. */ if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && hci_skb_pkt_type(skb) != HCI_ISODATA_PKT && hci_skb_pkt_type(skb) != HCI_DRV_PKT) { err = -EINVAL; goto drop; } skb_queue_tail(&hdev->raw_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } else if (hci_skb_pkt_type(skb) == HCI_COMMAND_PKT) { u16 opcode = get_unaligned_le16(skb->data); u16 ogf = hci_opcode_ogf(opcode); u16 ocf = hci_opcode_ocf(opcode); if (((ogf > HCI_SFLT_MAX_OGF) || !hci_test_bit(ocf & HCI_FLT_OCF_BITS, &hci_sec_filter.ocf_mask[ogf])) && !capable(CAP_NET_RAW)) { err = -EPERM; goto drop; } /* Since the opcode has already been extracted here, store * a copy of the value for later use by the drivers. */ hci_skb_opcode(skb) = opcode; if (ogf == 0x3f) { skb_queue_tail(&hdev->raw_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } else { /* Stand-alone HCI commands must be flagged as * single-command requests. 
*/ bt_cb(skb)->hci.req_flags |= HCI_REQ_START; skb_queue_tail(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); } } else { if (!capable(CAP_NET_RAW)) { err = -EPERM; goto drop; } if (hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) { err = -EINVAL; goto drop; } skb_queue_tail(&hdev->raw_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } err = len; done: release_sock(sk); return err; drop: kfree_skb(skb); goto done; } static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct hci_ufilter uf = { .opcode = 0 }; struct sock *sk = sock->sk; int err = 0, opt = 0; BT_DBG("sk %p, opt %d", sk, optname); lock_sock(sk); if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) { err = -EBADFD; goto done; } switch (optname) { case HCI_DATA_DIR: err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) hci_pi(sk)->cmsg_mask |= HCI_CMSG_DIR; else hci_pi(sk)->cmsg_mask &= ~HCI_CMSG_DIR; break; case HCI_TIME_STAMP: err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) hci_pi(sk)->cmsg_mask |= HCI_CMSG_TSTAMP; else hci_pi(sk)->cmsg_mask &= ~HCI_CMSG_TSTAMP; break; case HCI_FILTER: { struct hci_filter *f = &hci_pi(sk)->filter; uf.type_mask = f->type_mask; uf.opcode = f->opcode; uf.event_mask[0] = *((u32 *) f->event_mask + 0); uf.event_mask[1] = *((u32 *) f->event_mask + 1); } err = copy_safe_from_sockptr(&uf, sizeof(uf), optval, optlen); if (err) break; if (!capable(CAP_NET_RAW)) { uf.type_mask &= hci_sec_filter.type_mask; uf.event_mask[0] &= *((u32 *) hci_sec_filter.event_mask + 0); uf.event_mask[1] &= *((u32 *) hci_sec_filter.event_mask + 1); } { struct hci_filter *f = &hci_pi(sk)->filter; f->type_mask = uf.type_mask; f->opcode = uf.opcode; *((u32 *) f->event_mask + 0) = uf.event_mask[0]; *((u32 *) f->event_mask + 1) = uf.event_mask[1]; } break; default: err = -ENOPROTOOPT; break; } done: release_sock(sk); return err; } static int hci_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0; u16 opt; BT_DBG("sk %p, opt %d", sk, optname); if (level == SOL_HCI) return hci_sock_setsockopt_old(sock, level, optname, optval, optlen); if (level != SOL_BLUETOOTH) return -ENOPROTOOPT; lock_sock(sk); switch (optname) { case BT_SNDMTU: case BT_RCVMTU: switch (hci_pi(sk)->channel) { /* Don't allow changing MTU for channels that are meant for HCI * traffic only. 
*/ case HCI_CHANNEL_RAW: case HCI_CHANNEL_USER: err = -ENOPROTOOPT; goto done; } err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; hci_pi(sk)->mtu = opt; break; default: err = -ENOPROTOOPT; break; } done: release_sock(sk); return err; } static int hci_sock_getsockopt_old(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct hci_ufilter uf; struct sock *sk = sock->sk; int len, opt, err = 0; BT_DBG("sk %p, opt %d", sk, optname); if (get_user(len, optlen)) return -EFAULT; lock_sock(sk); if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) { err = -EBADFD; goto done; } switch (optname) { case HCI_DATA_DIR: if (hci_pi(sk)->cmsg_mask & HCI_CMSG_DIR) opt = 1; else opt = 0; if (put_user(opt, optval)) err = -EFAULT; break; case HCI_TIME_STAMP: if (hci_pi(sk)->cmsg_mask & HCI_CMSG_TSTAMP) opt = 1; else opt = 0; if (put_user(opt, optval)) err = -EFAULT; break; case HCI_FILTER: { struct hci_filter *f = &hci_pi(sk)->filter; memset(&uf, 0, sizeof(uf)); uf.type_mask = f->type_mask; uf.opcode = f->opcode; uf.event_mask[0] = *((u32 *) f->event_mask + 0); uf.event_mask[1] = *((u32 *) f->event_mask + 1); } len = min_t(unsigned int, len, sizeof(uf)); if (copy_to_user(optval, &uf, len)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } done: release_sock(sk); return err; } static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sk %p, opt %d", sk, optname); if (level == SOL_HCI) return hci_sock_getsockopt_old(sock, level, optname, optval, optlen); if (level != SOL_BLUETOOTH) return -ENOPROTOOPT; lock_sock(sk); switch (optname) { case BT_SNDMTU: case BT_RCVMTU: if (put_user(hci_pi(sk)->mtu, (u16 __user *)optval)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static void hci_sock_destruct(struct sock *sk) { mgmt_cleanup(sk); skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); } static const struct proto_ops hci_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = hci_sock_release, .bind = hci_sock_bind, .getname = hci_sock_getname, .sendmsg = hci_sock_sendmsg, .recvmsg = hci_sock_recvmsg, .ioctl = hci_sock_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = hci_sock_compat_ioctl, #endif .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = hci_sock_setsockopt, .getsockopt = hci_sock_getsockopt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static struct proto hci_sk_proto = { .name = "HCI", .owner = THIS_MODULE, .obj_size = sizeof(struct hci_pinfo) }; static int hci_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; BT_DBG("sock %p", sock); if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; sock->ops = &hci_sock_ops; sk = bt_sock_alloc(net, sock, &hci_sk_proto, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; sock->state = SS_UNCONNECTED; sk->sk_destruct = hci_sock_destruct; bt_sock_link(&hci_sk_list, sk); return 0; } static const struct net_proto_family hci_sock_family_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .create = hci_sock_create, }; int __init hci_sock_init(void) { int err; BUILD_BUG_ON(sizeof(struct sockaddr_hci) > sizeof(struct sockaddr)); err = proto_register(&hci_sk_proto, 0); if (err < 0) return err; err = bt_sock_register(BTPROTO_HCI, &hci_sock_family_ops); if (err < 0) { 
BT_ERR("HCI socket registration failed"); goto error; } err = bt_procfs_init(&init_net, "hci", &hci_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create HCI proc file"); bt_sock_unregister(BTPROTO_HCI); goto error; } BT_INFO("HCI socket layer initialized"); return 0; error: proto_unregister(&hci_sk_proto); return err; } void hci_sock_cleanup(void) { bt_procfs_cleanup(&init_net, "hci"); bt_sock_unregister(BTPROTO_HCI); proto_unregister(&hci_sk_proto); }
// SPDX-License-Identifier: GPL-2.0
#include <linux/proc_fs.h>
#include <linux/nsproxy.h>
#include <linux/ptrace.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/utsname.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include "internal.h"

static const struct proc_ns_operations *const ns_entries[] = {
#ifdef CONFIG_NET_NS
	&netns_operations,
#endif
#ifdef CONFIG_UTS_NS
	&utsns_operations,
#endif
#ifdef CONFIG_IPC_NS
	&ipcns_operations,
#endif
#ifdef CONFIG_PID_NS
	&pidns_operations,
	&pidns_for_children_operations,
#endif
#ifdef CONFIG_USER_NS
	&userns_operations,
#endif
	&mntns_operations,
#ifdef CONFIG_CGROUPS
	&cgroupns_operations,
#endif
#ifdef CONFIG_TIME_NS
	&timens_operations,
	&timens_for_children_operations,
#endif
};

static const char *proc_ns_get_link(struct dentry *dentry,
				    struct inode *inode,
				    struct delayed_call *done)
{
	const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
	struct task_struct *task;
	struct path ns_path;
	int error = -EACCES;

	if (!dentry)
		return ERR_PTR(-ECHILD);

	task = get_proc_task(inode);
	if (!task)
		return ERR_PTR(-EACCES);

	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
		goto out;

	error = ns_get_path(&ns_path, task, ns_ops);
	if (error)
		goto out;

	error = nd_jump_link(&ns_path);
out:
	put_task_struct(task);
	return ERR_PTR(error);
}

static int proc_ns_readlink(struct dentry *dentry, char __user *buffer,
			    int buflen)
{
	struct inode *inode = d_inode(dentry);
	const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
	struct task_struct *task;
	char name[50];
	int res = -EACCES;

	task = get_proc_task(inode);
	if (!task)
		return res;

	if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
		res = ns_get_name(name, sizeof(name), task, ns_ops);
		if (res >= 0)
			res = readlink_copy(buffer, buflen, name, strlen(name));
	}
	put_task_struct(task);
	return res;
}

static const struct inode_operations proc_ns_link_inode_operations = {
	.readlink	= proc_ns_readlink,
	.get_link	= proc_ns_get_link,
	.setattr	= proc_setattr,
};

static struct dentry *proc_ns_instantiate(struct dentry *dentry,
	struct task_struct *task, const void *ptr)
{
	const struct proc_ns_operations *ns_ops = ptr;
	struct inode *inode;
	struct proc_inode *ei;

	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO);
	if (!inode)
		return ERR_PTR(-ENOENT);

	ei = PROC_I(inode);
	inode->i_op = &proc_ns_link_inode_operations;
	ei->ns_ops = ns_ops;
	pid_update_inode(task, inode);

	return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}

static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
{
	struct task_struct *task = get_proc_task(file_inode(file));
	const struct proc_ns_operations *const *entry, *const *last;

	if (!task)
		return -ENOENT;

	if (!dir_emit_dots(file, ctx))
		goto out;
	if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries))
		goto out;
	entry = ns_entries + (ctx->pos - 2);
	last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
	while (entry <= last) {
		const struct proc_ns_operations *ops = *entry;

		if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name),
				     proc_ns_instantiate, task, ops))
			break;
		ctx->pos++;
		entry++;
	}
out:
	put_task_struct(task);
	return 0;
}

const struct file_operations proc_ns_dir_operations = {
	.read		= generic_read_dir,
	.iterate_shared	= proc_ns_dir_readdir,
	.llseek		= generic_file_llseek,
};

static struct dentry *proc_ns_dir_lookup(struct inode *dir,
				struct dentry *dentry, unsigned int flags)
{
	struct task_struct *task = get_proc_task(dir);
	const struct proc_ns_operations *const *entry, *const *last;
	unsigned int len = dentry->d_name.len;
	struct dentry *res = ERR_PTR(-ENOENT);

	if (!task)
		goto out_no_task;

	last = &ns_entries[ARRAY_SIZE(ns_entries)];
	for (entry = ns_entries; entry < last; entry++) {
		if (strlen((*entry)->name) != len)
			continue;
		if (!memcmp(dentry->d_name.name, (*entry)->name, len))
			break;
	}
	if (entry == last)
		goto out;

	res = proc_ns_instantiate(dentry, task, *entry);
out:
	put_task_struct(task);
out_no_task:
	return res;
}

const struct inode_operations proc_ns_dir_inode_operations = {
	.lookup		= proc_ns_dir_lookup,
	.getattr	= pid_getattr,
	.setattr	= proc_setattr,
};
/*
 * net/tipc/msg.h: Include file for TIPC message header routines
 *
 * Copyright (c) 2000-2007, 2014-2017 Ericsson AB
 * Copyright (c) 2005-2008, 2010-2011, Wind River Systems
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * Alternatively, this software may be distributed under the terms of the
 * GNU General Public License ("GPL") version 2 as published by the Free
 * Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TIPC_MSG_H #define _TIPC_MSG_H #include <linux/tipc.h> #include "core.h" /* * Constants and routines used to read and write TIPC payload message headers * * Note: Some items are also used with TIPC internal message headers */ #define TIPC_VERSION 2 struct plist; /* * Payload message users are defined in TIPC's public API: * - TIPC_LOW_IMPORTANCE * - TIPC_MEDIUM_IMPORTANCE * - TIPC_HIGH_IMPORTANCE * - TIPC_CRITICAL_IMPORTANCE */ #define TIPC_SYSTEM_IMPORTANCE 4 /* * Payload message types */ #define TIPC_CONN_MSG 0 #define TIPC_MCAST_MSG 1 #define TIPC_NAMED_MSG 2 #define TIPC_DIRECT_MSG 3 #define TIPC_GRP_MEMBER_EVT 4 #define TIPC_GRP_BCAST_MSG 5 #define TIPC_GRP_MCAST_MSG 6 #define TIPC_GRP_UCAST_MSG 7 /* * Internal message users */ #define BCAST_PROTOCOL 5 #define MSG_BUNDLER 6 #define LINK_PROTOCOL 7 #define CONN_MANAGER 8 #define GROUP_PROTOCOL 9 #define TUNNEL_PROTOCOL 10 #define NAME_DISTRIBUTOR 11 #define MSG_FRAGMENTER 12 #define LINK_CONFIG 13 #define MSG_CRYPTO 14 #define SOCK_WAKEUP 14 /* pseudo user */ #define TOP_SRV 15 /* pseudo user */ /* * Message header sizes */ #define SHORT_H_SIZE 24 /* In-cluster basic payload message */ #define BASIC_H_SIZE 32 /* Basic payload message */ #define NAMED_H_SIZE 40 /* Named payload message */ #define MCAST_H_SIZE 44 /* Multicast payload message */ #define GROUP_H_SIZE 44 /* Group payload message */ #define INT_H_SIZE 40 /* Internal messages */ #define MIN_H_SIZE 24 /* Smallest legal TIPC header size */ #define MAX_H_SIZE 60 /* Largest possible TIPC header size */ #define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE) #define TIPC_MEDIA_INFO_OFFSET 5 extern const int one_page_mtu; struct tipc_skb_cb { union { struct { struct sk_buff *tail; unsigned long nxt_retr; unsigned long retr_stamp; u32 bytes_read; u32 orig_member; u16 chain_imp; u16 ackers; u16 retr_cnt; } __packed; #ifdef CONFIG_TIPC_CRYPTO struct { struct tipc_crypto *rx; struct tipc_aead *last; u8 recurs; } tx_clone_ctx __packed; #endif } __packed; union { struct { u8 validated:1; #ifdef CONFIG_TIPC_CRYPTO u8 encrypted:1; u8 decrypted:1; #define SKB_PROBING 1 #define SKB_GRACING 2 u8 xmit_type:2; u8 tx_clone_deferred:1; #endif }; u8 flags; }; u8 reserved; #ifdef CONFIG_TIPC_CRYPTO void *crypto_ctx; #endif } __packed; #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) struct tipc_msg { __be32 hdr[15]; }; /* struct tipc_gap_ack - TIPC Gap ACK block * @ack: seqno of the last consecutive packet in link deferdq * @gap: number of gap packets since the last ack * * E.g: * link deferdq: 1 2 3 4 10 11 13 14 15 20 * --> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0> */ struct tipc_gap_ack { __be16 ack; __be16 gap; }; /* struct tipc_gap_ack_blks * @len: actual length of the record * @ugack_cnt: number of Gap ACK blocks for unicast (following the broadcast * ones) * @start_index: starting index for "valid" broadcast Gap ACK blocks * @bgack_cnt: number of Gap ACK blocks for broadcast in the record * @gacks: array of Gap ACK blocks * * 31 16 
15 0 * +-------------+-------------+-------------+-------------+ * | bgack_cnt | ugack_cnt | len | * +-------------+-------------+-------------+-------------+ - * | gap | ack | | * +-------------+-------------+-------------+-------------+ > bc gacks * : : : | * +-------------+-------------+-------------+-------------+ - * | gap | ack | | * +-------------+-------------+-------------+-------------+ > uc gacks * : : : | * +-------------+-------------+-------------+-------------+ - */ struct tipc_gap_ack_blks { __be16 len; union { u8 ugack_cnt; u8 start_index; }; u8 bgack_cnt; struct tipc_gap_ack gacks[]; }; #define MAX_GAP_ACK_BLKS 128 #define MAX_GAP_ACK_BLKS_SZ (sizeof(struct tipc_gap_ack_blks) + \ sizeof(struct tipc_gap_ack) * MAX_GAP_ACK_BLKS) static inline struct tipc_msg *buf_msg(struct sk_buff *skb) { return (struct tipc_msg *)skb->data; } static inline u32 msg_word(struct tipc_msg *m, u32 pos) { return ntohl(m->hdr[pos]); } static inline void msg_set_word(struct tipc_msg *m, u32 w, u32 val) { m->hdr[w] = htonl(val); } static inline u32 msg_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask) { return (msg_word(m, w) >> pos) & mask; } static inline void msg_set_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask, u32 val) { val = (val & mask) << pos; mask = mask << pos; m->hdr[w] &= ~htonl(mask); m->hdr[w] |= htonl(val); } /* * Word 0 */ static inline u32 msg_version(struct tipc_msg *m) { return msg_bits(m, 0, 29, 7); } static inline void msg_set_version(struct tipc_msg *m) { msg_set_bits(m, 0, 29, 7, TIPC_VERSION); } static inline u32 msg_user(struct tipc_msg *m) { return msg_bits(m, 0, 25, 0xf); } static inline u32 msg_isdata(struct tipc_msg *m) { return msg_user(m) <= TIPC_CRITICAL_IMPORTANCE; } static inline void msg_set_user(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 25, 0xf, n); } static inline u32 msg_hdr_sz(struct tipc_msg *m) { return msg_bits(m, 0, 21, 0xf) << 2; } static inline void msg_set_hdr_sz(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 21, 0xf, n>>2); } static inline u32 msg_size(struct tipc_msg *m) { return msg_bits(m, 0, 0, 0x1ffff); } static inline u32 msg_blocks(struct tipc_msg *m) { return (msg_size(m) / 1024) + 1; } static inline u32 msg_data_sz(struct tipc_msg *m) { return msg_size(m) - msg_hdr_sz(m); } static inline int msg_non_seq(struct tipc_msg *m) { return msg_bits(m, 0, 20, 1); } static inline void msg_set_non_seq(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 20, 1, n); } static inline int msg_is_syn(struct tipc_msg *m) { return msg_bits(m, 0, 17, 1); } static inline void msg_set_syn(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 17, 1, d); } static inline int msg_dest_droppable(struct tipc_msg *m) { return msg_bits(m, 0, 19, 1); } static inline void msg_set_dest_droppable(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 19, 1, d); } static inline int msg_is_keepalive(struct tipc_msg *m) { return msg_bits(m, 0, 19, 1); } static inline void msg_set_is_keepalive(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 19, 1, d); } static inline int msg_src_droppable(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 18, 1, d); } static inline int msg_ack_required(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_ack_required(struct tipc_msg *m) { msg_set_bits(m, 0, 18, 1, 1); } static inline int msg_nagle_ack(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_nagle_ack(struct tipc_msg *m) { msg_set_bits(m, 0, 
18, 1, 1); } static inline bool msg_is_rcast(struct tipc_msg *m) { return msg_bits(m, 0, 18, 0x1); } static inline void msg_set_is_rcast(struct tipc_msg *m, bool d) { msg_set_bits(m, 0, 18, 0x1, d); } static inline void msg_set_size(struct tipc_msg *m, u32 sz) { m->hdr[0] = htonl((msg_word(m, 0) & ~0x1ffff) | sz); } static inline unchar *msg_data(struct tipc_msg *m) { return ((unchar *)m) + msg_hdr_sz(m); } static inline struct tipc_msg *msg_inner_hdr(struct tipc_msg *m) { return (struct tipc_msg *)msg_data(m); } /* * Word 1 */ static inline u32 msg_type(struct tipc_msg *m) { return msg_bits(m, 1, 29, 0x7); } static inline void msg_set_type(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 29, 0x7, n); } static inline int msg_in_group(struct tipc_msg *m) { int mtyp = msg_type(m); return mtyp >= TIPC_GRP_MEMBER_EVT && mtyp <= TIPC_GRP_UCAST_MSG; } static inline bool msg_is_grp_evt(struct tipc_msg *m) { return msg_type(m) == TIPC_GRP_MEMBER_EVT; } static inline u32 msg_named(struct tipc_msg *m) { return msg_type(m) == TIPC_NAMED_MSG; } static inline u32 msg_mcast(struct tipc_msg *m) { int mtyp = msg_type(m); return ((mtyp == TIPC_MCAST_MSG) || (mtyp == TIPC_GRP_BCAST_MSG) || (mtyp == TIPC_GRP_MCAST_MSG)); } static inline u32 msg_connected(struct tipc_msg *m) { return msg_type(m) == TIPC_CONN_MSG; } static inline u32 msg_direct(struct tipc_msg *m) { return msg_type(m) == TIPC_DIRECT_MSG; } static inline u32 msg_errcode(struct tipc_msg *m) { return msg_bits(m, 1, 25, 0xf); } static inline void msg_set_errcode(struct tipc_msg *m, u32 err) { msg_set_bits(m, 1, 25, 0xf, err); } static inline void msg_set_bulk(struct tipc_msg *m) { msg_set_bits(m, 1, 28, 0x1, 1); } static inline u32 msg_is_bulk(struct tipc_msg *m) { return msg_bits(m, 1, 28, 0x1); } static inline void msg_set_last_bulk(struct tipc_msg *m) { msg_set_bits(m, 1, 27, 0x1, 1); } static inline u32 msg_is_last_bulk(struct tipc_msg *m) { return msg_bits(m, 1, 27, 0x1); } static inline void msg_set_non_legacy(struct tipc_msg *m) { msg_set_bits(m, 1, 26, 0x1, 1); } static inline u32 msg_is_legacy(struct tipc_msg *m) { return !msg_bits(m, 1, 26, 0x1); } static inline u32 msg_reroute_cnt(struct tipc_msg *m) { return msg_bits(m, 1, 21, 0xf); } static inline void msg_incr_reroute_cnt(struct tipc_msg *m) { msg_set_bits(m, 1, 21, 0xf, msg_reroute_cnt(m) + 1); } static inline u32 msg_lookup_scope(struct tipc_msg *m) { return msg_bits(m, 1, 19, 0x3); } static inline void msg_set_lookup_scope(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 19, 0x3, n); } static inline u16 msg_bcast_ack(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_bcast_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 1, 0, 0xffff, n); } /* Note: reusing bits in word 1 for ACTIVATE_MSG only, to re-synch * link peer session number */ static inline bool msg_dest_session_valid(struct tipc_msg *m) { return msg_bits(m, 1, 16, 0x1); } static inline void msg_set_dest_session_valid(struct tipc_msg *m, bool valid) { msg_set_bits(m, 1, 16, 0x1, valid); } static inline u16 msg_dest_session(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_dest_session(struct tipc_msg *m, u16 n) { msg_set_bits(m, 1, 0, 0xffff, n); } /* * Word 2 */ static inline u16 msg_ack(struct tipc_msg *m) { return msg_bits(m, 2, 16, 0xffff); } static inline void msg_set_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 16, 0xffff, n); } static inline u16 msg_seqno(struct tipc_msg *m) { return msg_bits(m, 2, 0, 0xffff); } static inline void 
msg_set_seqno(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 0, 0xffff, n); } /* * Words 3-10 */ static inline u32 msg_importance(struct tipc_msg *m) { int usr = msg_user(m); if (likely((usr <= TIPC_CRITICAL_IMPORTANCE) && !msg_errcode(m))) return usr; if ((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER)) return msg_bits(m, 9, 0, 0x7); return TIPC_SYSTEM_IMPORTANCE; } static inline void msg_set_importance(struct tipc_msg *m, u32 i) { int usr = msg_user(m); if (likely((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER))) msg_set_bits(m, 9, 0, 0x7, i); else if (i < TIPC_SYSTEM_IMPORTANCE) msg_set_user(m, i); else pr_warn("Trying to set illegal importance in message\n"); } static inline u32 msg_prevnode(struct tipc_msg *m) { return msg_word(m, 3); } static inline void msg_set_prevnode(struct tipc_msg *m, u32 a) { msg_set_word(m, 3, a); } static inline u32 msg_origport(struct tipc_msg *m) { if (msg_user(m) == MSG_FRAGMENTER) m = msg_inner_hdr(m); return msg_word(m, 4); } static inline void msg_set_origport(struct tipc_msg *m, u32 p) { msg_set_word(m, 4, p); } static inline u16 msg_named_seqno(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_named_seqno(struct tipc_msg *m, u16 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_destport(struct tipc_msg *m) { return msg_word(m, 5); } static inline void msg_set_destport(struct tipc_msg *m, u32 p) { msg_set_word(m, 5, p); } static inline u32 msg_mc_netid(struct tipc_msg *m) { return msg_word(m, 5); } static inline void msg_set_mc_netid(struct tipc_msg *m, u32 p) { msg_set_word(m, 5, p); } static inline int msg_short(struct tipc_msg *m) { return msg_hdr_sz(m) == SHORT_H_SIZE; } static inline u32 msg_orignode(struct tipc_msg *m) { if (likely(msg_short(m))) return msg_prevnode(m); return msg_word(m, 6); } static inline void msg_set_orignode(struct tipc_msg *m, u32 a) { msg_set_word(m, 6, a); } static inline u32 msg_destnode(struct tipc_msg *m) { return msg_word(m, 7); } static inline void msg_set_destnode(struct tipc_msg *m, u32 a) { msg_set_word(m, 7, a); } static inline u32 msg_nametype(struct tipc_msg *m) { return msg_word(m, 8); } static inline void msg_set_nametype(struct tipc_msg *m, u32 n) { msg_set_word(m, 8, n); } static inline u32 msg_nameinst(struct tipc_msg *m) { return msg_word(m, 9); } static inline u32 msg_namelower(struct tipc_msg *m) { return msg_nameinst(m); } static inline void msg_set_namelower(struct tipc_msg *m, u32 n) { msg_set_word(m, 9, n); } static inline void msg_set_nameinst(struct tipc_msg *m, u32 n) { msg_set_namelower(m, n); } static inline u32 msg_nameupper(struct tipc_msg *m) { return msg_word(m, 10); } static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) { msg_set_word(m, 10, n); } /* * Constants and routines used to read and write TIPC internal message headers */ /* * Connection management protocol message types */ #define CONN_PROBE 0 #define CONN_PROBE_REPLY 1 #define CONN_ACK 2 /* * Name distributor message types */ #define PUBLICATION 0 #define WITHDRAWAL 1 /* * Segmentation message types */ #define FIRST_FRAGMENT 0 #define FRAGMENT 1 #define LAST_FRAGMENT 2 /* * Link management protocol message types */ #define STATE_MSG 0 #define RESET_MSG 1 #define ACTIVATE_MSG 2 /* * Changeover tunnel message types */ #define SYNCH_MSG 0 #define FAILOVER_MSG 1 /* * Config protocol message types */ #define DSC_REQ_MSG 0 #define DSC_RESP_MSG 1 #define DSC_TRIAL_MSG 2 #define DSC_TRIAL_FAIL_MSG 3 /* * Group protocol message types */ #define GRP_JOIN_MSG 0 #define 
GRP_LEAVE_MSG 1 #define GRP_ADV_MSG 2 #define GRP_ACK_MSG 3 #define GRP_RECLAIM_MSG 4 #define GRP_REMIT_MSG 5 /* Crypto message types */ #define KEY_DISTR_MSG 0 /* * Word 1 */ static inline u32 msg_seq_gap(struct tipc_msg *m) { return msg_bits(m, 1, 16, 0x1fff); } static inline void msg_set_seq_gap(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 16, 0x1fff, n); } static inline u32 msg_node_sig(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_node_sig(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 0, 0xffff, n); } static inline u32 msg_node_capabilities(struct tipc_msg *m) { return msg_bits(m, 1, 15, 0x1fff); } static inline void msg_set_node_capabilities(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 15, 0x1fff, n); } /* * Word 2 */ static inline u32 msg_dest_domain(struct tipc_msg *m) { return msg_word(m, 2); } static inline void msg_set_dest_domain(struct tipc_msg *m, u32 n) { msg_set_word(m, 2, n); } static inline void msg_set_bcgap_after(struct tipc_msg *m, u32 n) { msg_set_bits(m, 2, 16, 0xffff, n); } static inline u32 msg_bcgap_to(struct tipc_msg *m) { return msg_bits(m, 2, 0, 0xffff); } static inline void msg_set_bcgap_to(struct tipc_msg *m, u32 n) { msg_set_bits(m, 2, 0, 0xffff, n); } /* * Word 4 */ static inline u32 msg_last_bcast(struct tipc_msg *m) { return msg_bits(m, 4, 16, 0xffff); } static inline u32 msg_bc_snd_nxt(struct tipc_msg *m) { return msg_last_bcast(m) + 1; } static inline void msg_set_last_bcast(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 16, 0xffff, n); } static inline u32 msg_nof_fragms(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_nof_fragms(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_fragm_no(struct tipc_msg *m) { return msg_bits(m, 4, 16, 0xffff); } static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 16, 0xffff, n); } static inline u16 msg_next_sent(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_next_sent(struct tipc_msg *m, u16 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_bc_netid(struct tipc_msg *m) { return msg_word(m, 4); } static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id) { msg_set_word(m, 4, id); } static inline u32 msg_link_selector(struct tipc_msg *m) { if (msg_user(m) == MSG_FRAGMENTER) m = (void *)msg_data(m); return msg_bits(m, 4, 0, 1); } /* * Word 5 */ static inline u16 msg_session(struct tipc_msg *m) { return msg_bits(m, 5, 16, 0xffff); } static inline void msg_set_session(struct tipc_msg *m, u16 n) { msg_set_bits(m, 5, 16, 0xffff, n); } static inline u32 msg_probe(struct tipc_msg *m) { return msg_bits(m, 5, 0, 1); } static inline void msg_set_probe(struct tipc_msg *m, u32 val) { msg_set_bits(m, 5, 0, 1, val); } static inline char msg_net_plane(struct tipc_msg *m) { return msg_bits(m, 5, 1, 7) + 'A'; } static inline void msg_set_net_plane(struct tipc_msg *m, char n) { msg_set_bits(m, 5, 1, 7, (n - 'A')); } static inline u32 msg_linkprio(struct tipc_msg *m) { return msg_bits(m, 5, 4, 0x1f); } static inline void msg_set_linkprio(struct tipc_msg *m, u32 n) { msg_set_bits(m, 5, 4, 0x1f, n); } static inline u32 msg_bearer_id(struct tipc_msg *m) { return msg_bits(m, 5, 9, 0x7); } static inline void msg_set_bearer_id(struct tipc_msg *m, u32 n) { msg_set_bits(m, 5, 9, 0x7, n); } static inline u32 msg_redundant_link(struct tipc_msg *m) { return msg_bits(m, 5, 12, 0x1); } static inline void msg_set_redundant_link(struct tipc_msg *m, 
u32 r) { msg_set_bits(m, 5, 12, 0x1, r); } static inline u32 msg_peer_stopping(struct tipc_msg *m) { return msg_bits(m, 5, 13, 0x1); } static inline void msg_set_peer_stopping(struct tipc_msg *m, u32 s) { msg_set_bits(m, 5, 13, 0x1, s); } static inline bool msg_bc_ack_invalid(struct tipc_msg *m) { switch (msg_user(m)) { case BCAST_PROTOCOL: case NAME_DISTRIBUTOR: case LINK_PROTOCOL: return msg_bits(m, 5, 14, 0x1); default: return false; } } static inline void msg_set_bc_ack_invalid(struct tipc_msg *m, bool invalid) { msg_set_bits(m, 5, 14, 0x1, invalid); } static inline char *msg_media_addr(struct tipc_msg *m) { return (char *)&m->hdr[TIPC_MEDIA_INFO_OFFSET]; } static inline u32 msg_bc_gap(struct tipc_msg *m) { return msg_bits(m, 8, 0, 0x3ff); } static inline void msg_set_bc_gap(struct tipc_msg *m, u32 n) { msg_set_bits(m, 8, 0, 0x3ff, n); } /* * Word 9 */ static inline u16 msg_msgcnt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_syncpt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_syncpt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u32 msg_conn_ack(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_conn_ack(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_adv_win(struct tipc_msg *m) { return msg_bits(m, 9, 0, 0xffff); } static inline void msg_set_adv_win(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 0, 0xffff, n); } static inline u32 msg_max_pkt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff) * 4; } static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 16, 0xffff, (n / 4)); } static inline u32 msg_link_tolerance(struct tipc_msg *m) { return msg_bits(m, 9, 0, 0xffff); } static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 0, 0xffff, n); } static inline u16 msg_grp_bc_syncpt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_bc_syncpt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_grp_bc_acked(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_bc_acked(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_grp_remitted(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_remitted(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } /* Word 10 */ static inline u16 msg_grp_evt(struct tipc_msg *m) { return msg_bits(m, 10, 0, 0x3); } static inline void msg_set_grp_evt(struct tipc_msg *m, int n) { msg_set_bits(m, 10, 0, 0x3, n); } static inline u16 msg_grp_bc_ack_req(struct tipc_msg *m) { return msg_bits(m, 10, 0, 0x1); } static inline void msg_set_grp_bc_ack_req(struct tipc_msg *m, bool n) { msg_set_bits(m, 10, 0, 0x1, n); } static inline u16 msg_grp_bc_seqno(struct tipc_msg *m) { return msg_bits(m, 10, 16, 0xffff); } static inline void msg_set_grp_bc_seqno(struct tipc_msg *m, u32 n) { msg_set_bits(m, 10, 16, 0xffff, n); } static inline bool msg_peer_link_is_up(struct tipc_msg *m) { if (likely(msg_user(m) != LINK_PROTOCOL)) return true; if (msg_type(m) == STATE_MSG) return true; return false; } static inline bool msg_peer_node_is_up(struct tipc_msg *m) { if (msg_peer_link_is_up(m)) return true; return msg_redundant_link(m); } static inline 
bool msg_is_reset(struct tipc_msg *hdr) { return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); } /* Word 13 */ static inline void msg_set_peer_net_hash(struct tipc_msg *m, u32 n) { msg_set_word(m, 13, n); } static inline u32 msg_peer_net_hash(struct tipc_msg *m) { return msg_word(m, 13); } /* Word 14 */ static inline u32 msg_sugg_node_addr(struct tipc_msg *m) { return msg_word(m, 14); } static inline void msg_set_sugg_node_addr(struct tipc_msg *m, u32 n) { msg_set_word(m, 14, n); } static inline void msg_set_node_id(struct tipc_msg *hdr, u8 *id) { memcpy(msg_data(hdr), id, 16); } static inline u8 *msg_node_id(struct tipc_msg *hdr) { return (u8 *)msg_data(hdr); } struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp); bool tipc_msg_validate(struct sk_buff **_skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, struct sk_buff_head *xmitq); void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode); struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, uint data_sz, u32 dnode, u32 onode, u32 dport, u32 oport, int errcode); int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss, u32 dnode, bool *new_bundle); bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr, int pktmax, struct sk_buff_head *frags); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); int tipc_msg_append(struct tipc_msg *hdr, struct msghdr *m, int dlen, int mss, struct sk_buff_head *txq); bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); bool tipc_msg_assemble(struct sk_buff_head *list); bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, struct sk_buff_head *cpy); bool __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, struct sk_buff *skb); bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy); static inline u16 buf_seqno(struct sk_buff *skb) { return msg_seqno(buf_msg(skb)); } static inline int buf_roundup_len(struct sk_buff *skb) { return (skb->len / 1024 + 1) * 1024; } /* tipc_skb_peek(): peek and reserve first buffer in list * @list: list to be peeked in * Returns pointer to first buffer in list, if any */ static inline struct sk_buff *tipc_skb_peek(struct sk_buff_head *list, spinlock_t *lock) { struct sk_buff *skb; spin_lock_bh(lock); skb = skb_peek(list); if (skb) skb_get(skb); spin_unlock_bh(lock); return skb; } /* tipc_skb_peek_port(): find a destination port, ignoring all destinations * up to and including 'filter'. * Note: ignoring previously tried destinations minimizes the risk of * contention on the socket lock * @list: list to be peeked in * @filter: last destination to be ignored from search * Returns a destination port number, of applicable. 
*/ static inline u32 tipc_skb_peek_port(struct sk_buff_head *list, u32 filter) { struct sk_buff *skb; u32 dport = 0; bool ignore = true; spin_lock_bh(&list->lock); skb_queue_walk(list, skb) { dport = msg_destport(buf_msg(skb)); if (!filter || skb_queue_is_last(list, skb)) break; if (dport == filter) ignore = false; else if (!ignore) break; } spin_unlock_bh(&list->lock); return dport; } /* tipc_skb_dequeue(): unlink first buffer with dest 'dport' from list * @list: list to be unlinked from * @dport: selection criteria for buffer to unlink */ static inline struct sk_buff *tipc_skb_dequeue(struct sk_buff_head *list, u32 dport) { struct sk_buff *_skb, *tmp, *skb = NULL; spin_lock_bh(&list->lock); skb_queue_walk_safe(list, _skb, tmp) { if (msg_destport(buf_msg(_skb)) == dport) { __skb_unlink(_skb, list); skb = _skb; break; } } spin_unlock_bh(&list->lock); return skb; } /* tipc_skb_queue_splice_tail - append an skb list to lock protected list * @list: the new list to append. Not lock protected * @head: target list. Lock protected. */ static inline void tipc_skb_queue_splice_tail(struct sk_buff_head *list, struct sk_buff_head *head) { spin_lock_bh(&head->lock); skb_queue_splice_tail(list, head); spin_unlock_bh(&head->lock); } /* tipc_skb_queue_splice_tail_init - merge two lock protected skb lists * @list: the new list to add. Lock protected. Will be reinitialized * @head: target list. Lock protected. */ static inline void tipc_skb_queue_splice_tail_init(struct sk_buff_head *list, struct sk_buff_head *head) { struct sk_buff_head tmp; __skb_queue_head_init(&tmp); spin_lock_bh(&list->lock); skb_queue_splice_tail_init(list, &tmp); spin_unlock_bh(&list->lock); tipc_skb_queue_splice_tail(&tmp, head); } /* __tipc_skb_dequeue() - dequeue the head skb according to expected seqno * @list: list to be dequeued from * @seqno: seqno of the expected msg * * returns skb dequeued from the list if its seqno is less than or equal to * the expected one, otherwise the skb is still hold * * Note: must be used with appropriate locks held only */ static inline struct sk_buff *__tipc_skb_dequeue(struct sk_buff_head *list, u16 seqno) { struct sk_buff *skb = skb_peek(list); if (skb && less_eq(buf_seqno(skb), seqno)) { __skb_unlink(skb, list); return skb; } return NULL; } #endif
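The header accessors above all funnel through msg_bits()/msg_set_bits(), which read and update sub-fields of 32-bit big-endian header words with a shift-and-mask pair. Below is a minimal stand-alone sketch of that scheme in user-space C; the get_bits()/set_bits() helpers are hypothetical illustrations of the pattern, not the kernel functions themselves, shown on word 0's version, user, and size fields.

/*
 * Stand-alone sketch of the TIPC word-0 shift-and-mask accessor pattern.
 * Assumption: user-space code, hypothetical helper names.
 */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

/* read a field of 'mask' width at bit position 'pos' from a big-endian word */
static uint32_t get_bits(uint32_t word, uint32_t pos, uint32_t mask)
{
	return (ntohl(word) >> pos) & mask;
}

/* clear the field, then OR in the new value, keeping the word big-endian */
static void set_bits(uint32_t *word, uint32_t pos, uint32_t mask, uint32_t val)
{
	*word &= ~htonl(mask << pos);
	*word |= htonl((val & mask) << pos);
}

int main(void)
{
	uint32_t w0 = 0;                 /* header word 0, network byte order */

	set_bits(&w0, 29, 0x7, 2);       /* version field, cf. msg_set_version() */
	set_bits(&w0, 25, 0xf, 0);       /* user field, TIPC_LOW_IMPORTANCE */
	set_bits(&w0, 0, 0x1ffff, 1500); /* total message size */

	printf("version=%u user=%u size=%u\n",
	       get_bits(w0, 29, 0x7),
	       get_bits(w0, 25, 0xf),
	       get_bits(w0, 0, 0x1ffff));
	return 0;
}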
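The next file, net/mptcp/sockopt.c, implements setsockopt()/getsockopt() for MPTCP sockets. Its recurring pattern is to bump a per-socket generation counter (setsockopt_seq), apply the option at the MPTCP level, and then fan it out to every TCP subflow, tagging each subflow with the generation it was last synced at. The following simplified, stand-alone sketch only illustrates that fan-out; struct msk and struct subflow here are illustrative stand-ins, not the kernel structures.

/*
 * Stand-alone sketch of the "bump seq, then walk the subflows" pattern.
 * Assumption: simplified user-space types, no locking shown.
 */
#include <stdio.h>

#define MAX_SUBFLOWS 4

struct subflow {
	int priority;        /* per-subflow copy of the option */
	unsigned int seq;    /* generation this subflow was last synced at */
};

struct msk {
	int priority;
	unsigned int setsockopt_seq;
	int nr_subflows;
	struct subflow sf[MAX_SUBFLOWS];
};

/* apply an option on the MPTCP-level socket, then propagate to all subflows */
static void msk_set_priority(struct msk *msk, int val)
{
	msk->setsockopt_seq++;
	msk->priority = val;
	for (int i = 0; i < msk->nr_subflows; i++) {
		msk->sf[i].priority = val;
		msk->sf[i].seq = msk->setsockopt_seq;
	}
}

int main(void)
{
	struct msk m = { .nr_subflows = 2 };

	msk_set_priority(&m, 6);
	printf("msk prio=%d seq=%u, sf0 prio=%d seq=%u\n",
	       m.priority, m.setsockopt_seq, m.sf[0].priority, m.sf[0].seq);
	return 0;
}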
// SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2021, Red Hat. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <linux/module.h> #include <net/sock.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/mptcp.h> #include "protocol.h" #define MIN_INFO_OPTLEN_SIZE 16 #define MIN_FULL_INFO_OPTLEN_SIZE 40 static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { msk_owned_by_me(msk); if (likely(!__mptcp_check_fallback(msk))) return NULL; return msk->first; } static u32 sockopt_seq_reset(const struct sock *sk) { sock_owned_by_me(sk); /* Highbits contain state. Allows to distinguish sockopt_seq * of listener and established: * s0 = new_listener() * sockopt(s0) - seq is 1 * s1 = accept(s0) - s1 inherits seq 1 if listener sk (s0) * sockopt(s0) - seq increments to 2 on s0 * sockopt(s1) // seq increments to 2 on s1 (different option) * new ssk completes join, inherits options from s0 // seq 2 * Needs sync from mptcp join logic, but ssk->seq == msk->seq * * Set High order bits to sk_state so ssk->seq == msk->seq test * will fail. */ return (u32)sk->sk_state << 24u; } static void sockopt_seq_inc(struct mptcp_sock *msk) { u32 seq = (msk->setsockopt_seq + 1) & 0x00ffffff; msk->setsockopt_seq = sockopt_seq_reset((struct sock *)msk) + seq; } static int mptcp_get_int_option(struct mptcp_sock *msk, sockptr_t optval, unsigned int optlen, int *val) { if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(val, optval, sizeof(*val))) return -EFAULT; return 0; } static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, int val) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; lock_sock(sk); sockopt_seq_inc(msk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); switch (optname) { case SO_DEBUG: sock_valbool_flag(ssk, SOCK_DBG, !!val); break; case SO_KEEPALIVE: if (ssk->sk_prot->keepalive) ssk->sk_prot->keepalive(ssk, !!val); sock_valbool_flag(ssk, SOCK_KEEPOPEN, !!val); break; case SO_PRIORITY: WRITE_ONCE(ssk->sk_priority, val); break; case SO_SNDBUF: case SO_SNDBUFFORCE: ssk->sk_userlocks |= SOCK_SNDBUF_LOCK; WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf); mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf; break; case SO_RCVBUF: case SO_RCVBUFFORCE: ssk->sk_userlocks |= SOCK_RCVBUF_LOCK; WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf); break; case SO_MARK: if (READ_ONCE(ssk->sk_mark) != sk->sk_mark) { WRITE_ONCE(ssk->sk_mark, sk->sk_mark); sk_dst_reset(ssk); } break; case SO_INCOMING_CPU: WRITE_ONCE(ssk->sk_incoming_cpu, val); break; } subflow->setsockopt_seq = msk->setsockopt_seq; unlock_sock_fast(ssk, slow); } release_sock(sk); } static int mptcp_sol_socket_intval(struct mptcp_sock *msk, int optname, int val) { sockptr_t optval = KERNEL_SOCKPTR(&val); struct sock *sk = (struct sock *)msk; int ret; ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, sizeof(val)); if (ret) return ret; mptcp_sol_socket_sync_intval(msk, optname, val); return 0; } static void mptcp_so_incoming_cpu(struct mptcp_sock *msk, int val) {
struct sock *sk = (struct sock *)msk; WRITE_ONCE(sk->sk_incoming_cpu, val); mptcp_sol_socket_sync_intval(msk, SO_INCOMING_CPU, val); } static int mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock *msk, int optname, int val) { sockptr_t optval = KERNEL_SOCKPTR(&val); struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; int ret; ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, sizeof(val)); if (ret) return ret; lock_sock(sk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); sock_set_timestamp(sk, optname, !!val); unlock_sock_fast(ssk, slow); } release_sock(sk); return 0; } static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { int val, ret; ret = mptcp_get_int_option(msk, optval, optlen, &val); if (ret) return ret; switch (optname) { case SO_KEEPALIVE: case SO_DEBUG: case SO_MARK: case SO_PRIORITY: case SO_SNDBUF: case SO_SNDBUFFORCE: case SO_RCVBUF: case SO_RCVBUFFORCE: return mptcp_sol_socket_intval(msk, optname, val); case SO_INCOMING_CPU: mptcp_so_incoming_cpu(msk, val); return 0; case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: case SO_TIMESTAMPNS_OLD: case SO_TIMESTAMPNS_NEW: return mptcp_setsockopt_sol_socket_tstamp(msk, optname, val); } return -ENOPROTOOPT; } static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; struct so_timestamping timestamping; int ret; if (optlen == sizeof(timestamping)) { if (copy_from_sockptr(&timestamping, optval, sizeof(timestamping))) return -EFAULT; } else if (optlen == sizeof(int)) { memset(&timestamping, 0, sizeof(timestamping)); if (copy_from_sockptr(&timestamping.flags, optval, sizeof(int))) return -EFAULT; } else { return -EINVAL; } ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, KERNEL_SOCKPTR(&timestamping), sizeof(timestamping)); if (ret) return ret; lock_sock(sk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); sock_set_timestamping(sk, optname, timestamping); unlock_sock_fast(ssk, slow); } release_sock(sk); return 0; } static int mptcp_setsockopt_sol_socket_linger(struct mptcp_sock *msk, sockptr_t optval, unsigned int optlen) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; struct linger ling; sockptr_t kopt; int ret; if (optlen < sizeof(ling)) return -EINVAL; if (copy_from_sockptr(&ling, optval, sizeof(ling))) return -EFAULT; kopt = KERNEL_SOCKPTR(&ling); ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, SO_LINGER, kopt, sizeof(ling)); if (ret) return ret; lock_sock(sk); sockopt_seq_inc(msk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); if (!ling.l_onoff) { sock_reset_flag(ssk, SOCK_LINGER); } else { ssk->sk_lingertime = sk->sk_lingertime; sock_set_flag(ssk, SOCK_LINGER); } subflow->setsockopt_seq = msk->setsockopt_seq; unlock_sock_fast(ssk, slow); } release_sock(sk); return 0; } static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; struct sock *ssk; int ret; switch (optname) { case SO_REUSEPORT: case SO_REUSEADDR: case SO_BINDTODEVICE: case SO_BINDTOIFINDEX: lock_sock(sk); ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { 
release_sock(sk); return PTR_ERR(ssk); } ret = sk_setsockopt(ssk, SOL_SOCKET, optname, optval, optlen); if (ret == 0) { if (optname == SO_REUSEPORT) sk->sk_reuseport = ssk->sk_reuseport; else if (optname == SO_REUSEADDR) sk->sk_reuse = ssk->sk_reuse; else if (optname == SO_BINDTODEVICE) sk->sk_bound_dev_if = ssk->sk_bound_dev_if; else if (optname == SO_BINDTOIFINDEX) sk->sk_bound_dev_if = ssk->sk_bound_dev_if; } release_sock(sk); return ret; case SO_KEEPALIVE: case SO_PRIORITY: case SO_SNDBUF: case SO_SNDBUFFORCE: case SO_RCVBUF: case SO_RCVBUFFORCE: case SO_MARK: case SO_INCOMING_CPU: case SO_DEBUG: case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: case SO_TIMESTAMPNS_OLD: case SO_TIMESTAMPNS_NEW: return mptcp_setsockopt_sol_socket_int(msk, optname, optval, optlen); case SO_TIMESTAMPING_OLD: case SO_TIMESTAMPING_NEW: return mptcp_setsockopt_sol_socket_timestamping(msk, optname, optval, optlen); case SO_LINGER: return mptcp_setsockopt_sol_socket_linger(msk, optval, optlen); case SO_RCVLOWAT: case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: case SO_BUSY_POLL: case SO_PREFER_BUSY_POLL: case SO_BUSY_POLL_BUDGET: /* No need to copy: only relevant for msk */ return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); case SO_NO_CHECK: case SO_DONTROUTE: case SO_BROADCAST: case SO_BSDCOMPAT: case SO_PASSCRED: case SO_PASSPIDFD: case SO_PASSSEC: case SO_RXQ_OVFL: case SO_WIFI_STATUS: case SO_NOFCS: case SO_SELECT_ERR_QUEUE: return 0; } /* SO_OOBINLINE is not supported, let's avoid the related mess * SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF, * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER, * we must be careful with subflows * * SO_ATTACH_REUSEPORT_EBPF is not supported, at it checks * explicitly the sk_protocol field * * SO_PEEK_OFF is unsupported, as it is for plain TCP * SO_MAX_PACING_RATE is unsupported, we must be careful with subflows * SO_CNX_ADVICE is currently unsupported, could possibly be relevant, * but likely needs careful design * * SO_ZEROCOPY is currently unsupported, TODO in sndmsg * SO_TXTIME is currently unsupported */ return -EOPNOTSUPP; } static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; int ret = -EOPNOTSUPP; struct sock *ssk; switch (optname) { case IPV6_V6ONLY: case IPV6_TRANSPARENT: case IPV6_FREEBIND: lock_sock(sk); ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { release_sock(sk); return PTR_ERR(ssk); } ret = tcp_setsockopt(ssk, SOL_IPV6, optname, optval, optlen); if (ret != 0) { release_sock(sk); return ret; } sockopt_seq_inc(msk); switch (optname) { case IPV6_V6ONLY: sk->sk_ipv6only = ssk->sk_ipv6only; break; case IPV6_TRANSPARENT: inet_assign_bit(TRANSPARENT, sk, inet_test_bit(TRANSPARENT, ssk)); break; case IPV6_FREEBIND: inet_assign_bit(FREEBIND, sk, inet_test_bit(FREEBIND, ssk)); break; } release_sock(sk); break; } return ret; } static bool mptcp_supported_sockopt(int level, int optname) { if (level == SOL_IP) { switch (optname) { /* should work fine */ case IP_FREEBIND: case IP_TRANSPARENT: case IP_BIND_ADDRESS_NO_PORT: case IP_LOCAL_PORT_RANGE: /* the following are control cmsg related */ case IP_PKTINFO: case IP_RECVTTL: case IP_RECVTOS: case IP_RECVOPTS: case IP_RETOPTS: case IP_PASSSEC: case IP_RECVORIGDSTADDR: case IP_CHECKSUM: case IP_RECVFRAGSIZE: /* common stuff that need some love */ case IP_TOS: case IP_TTL: case IP_MTU_DISCOVER: case IP_RECVERR: /* possibly less common may deserve 
some love */ case IP_MINTTL: /* the following is apparently a no-op for plain TCP */ case IP_RECVERR_RFC4884: return true; } /* IP_OPTIONS is not supported, needs subflow care */ /* IP_HDRINCL, IP_NODEFRAG are not supported, RAW specific */ /* IP_MULTICAST_TTL, IP_MULTICAST_LOOP, IP_UNICAST_IF, * IP_ADD_MEMBERSHIP, IP_ADD_SOURCE_MEMBERSHIP, IP_DROP_MEMBERSHIP, * IP_DROP_SOURCE_MEMBERSHIP, IP_BLOCK_SOURCE, IP_UNBLOCK_SOURCE, * MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP MCAST_JOIN_SOURCE_GROUP, * MCAST_LEAVE_SOURCE_GROUP, MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE, * MCAST_MSFILTER, IP_MULTICAST_ALL are not supported, better not deal * with mcast stuff */ /* IP_IPSEC_POLICY, IP_XFRM_POLICY are nut supported, unrelated here */ return false; } if (level == SOL_IPV6) { switch (optname) { case IPV6_V6ONLY: /* the following are control cmsg related */ case IPV6_RECVPKTINFO: case IPV6_2292PKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_2292HOPLIMIT: case IPV6_RECVRTHDR: case IPV6_2292RTHDR: case IPV6_RECVHOPOPTS: case IPV6_2292HOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_2292DSTOPTS: case IPV6_RECVTCLASS: case IPV6_FLOWINFO: case IPV6_RECVPATHMTU: case IPV6_RECVORIGDSTADDR: case IPV6_RECVFRAGSIZE: /* the following ones need some love but are quite common */ case IPV6_TCLASS: case IPV6_TRANSPARENT: case IPV6_FREEBIND: case IPV6_PKTINFO: case IPV6_2292PKTOPTIONS: case IPV6_UNICAST_HOPS: case IPV6_MTU_DISCOVER: case IPV6_MTU: case IPV6_RECVERR: case IPV6_FLOWINFO_SEND: case IPV6_FLOWLABEL_MGR: case IPV6_MINHOPCOUNT: case IPV6_DONTFRAG: case IPV6_AUTOFLOWLABEL: /* the following one is a no-op for plain TCP */ case IPV6_RECVERR_RFC4884: return true; } /* IPV6_HOPOPTS, IPV6_RTHDRDSTOPTS, IPV6_RTHDR, IPV6_DSTOPTS are * not supported */ /* IPV6_MULTICAST_HOPS, IPV6_MULTICAST_LOOP, IPV6_UNICAST_IF, * IPV6_MULTICAST_IF, IPV6_ADDRFORM, * IPV6_ADD_MEMBERSHIP, IPV6_DROP_MEMBERSHIP, IPV6_JOIN_ANYCAST, * IPV6_LEAVE_ANYCAST, IPV6_MULTICAST_ALL, MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP, * MCAST_JOIN_SOURCE_GROUP, MCAST_LEAVE_SOURCE_GROUP, * MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE, MCAST_MSFILTER * are not supported better not deal with mcast */ /* IPV6_ROUTER_ALERT, IPV6_ROUTER_ALERT_ISOLATE are not supported, since are evil */ /* IPV6_IPSEC_POLICY, IPV6_XFRM_POLICY are not supported */ /* IPV6_ADDR_PREFERENCES is not supported, we must be careful with subflows */ return false; } if (level == SOL_TCP) { switch (optname) { /* the following are no-op or should work just fine */ case TCP_THIN_DUPACK: case TCP_DEFER_ACCEPT: /* the following need some love */ case TCP_MAXSEG: case TCP_NODELAY: case TCP_THIN_LINEAR_TIMEOUTS: case TCP_CONGESTION: case TCP_CORK: case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPCNT: case TCP_SYNCNT: case TCP_SAVE_SYN: case TCP_LINGER2: case TCP_WINDOW_CLAMP: case TCP_QUICKACK: case TCP_USER_TIMEOUT: case TCP_TIMESTAMP: case TCP_NOTSENT_LOWAT: case TCP_TX_DELAY: case TCP_INQ: case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: return true; } /* TCP_MD5SIG, TCP_MD5SIG_EXT are not supported, MD5 is not compatible with MPTCP */ /* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS, * TCP_REPAIR_WINDOW are not supported, better avoid this mess */ } return false; } static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock *msk, sockptr_t optval, unsigned int optlen) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; char name[TCP_CA_NAME_MAX]; bool cap_net_admin; int ret; if (optlen < 1) return -EINVAL; ret = 
strncpy_from_sockptr(name, optval, min_t(long, TCP_CA_NAME_MAX - 1, optlen)); if (ret < 0) return -EFAULT; name[ret] = 0; cap_net_admin = ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN); ret = 0; lock_sock(sk); sockopt_seq_inc(msk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int err; lock_sock(ssk); err = tcp_set_congestion_control(ssk, name, true, cap_net_admin); if (err < 0 && ret == 0) ret = err; subflow->setsockopt_seq = msk->setsockopt_seq; release_sock(ssk); } if (ret == 0) strscpy(msk->ca_name, name, sizeof(msk->ca_name)); release_sock(sk); return ret; } static int __mptcp_setsockopt_set_val(struct mptcp_sock *msk, int max, int (*set_val)(struct sock *, int), int *msk_val, int val) { struct mptcp_subflow_context *subflow; int err = 0; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int ret; lock_sock(ssk); ret = set_val(ssk, val); err = err ? : ret; release_sock(ssk); } if (!err) { *msk_val = val; sockopt_seq_inc(msk); } return err; } static int __mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, int val) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; sockopt_seq_inc(msk); msk->cork = !!val; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); lock_sock(ssk); __tcp_sock_set_cork(ssk, !!val); release_sock(ssk); } if (!val) mptcp_check_and_set_pending(sk); return 0; } static int __mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, int val) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; sockopt_seq_inc(msk); msk->nodelay = !!val; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); lock_sock(ssk); __tcp_sock_set_nodelay(ssk, !!val); release_sock(ssk); } if (val) mptcp_check_and_set_pending(sk); return 0; } static int mptcp_setsockopt_sol_ip_set(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; struct sock *ssk; int err; err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen); if (err != 0) return err; lock_sock(sk); ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { release_sock(sk); return PTR_ERR(ssk); } switch (optname) { case IP_FREEBIND: inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); break; case IP_TRANSPARENT: inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk)); break; case IP_BIND_ADDRESS_NO_PORT: inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); break; case IP_LOCAL_PORT_RANGE: WRITE_ONCE(inet_sk(ssk)->local_port_range, READ_ONCE(inet_sk(sk)->local_port_range)); break; default: release_sock(sk); WARN_ON_ONCE(1); return -EOPNOTSUPP; } sockopt_seq_inc(msk); release_sock(sk); return 0; } static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; int err, val; err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen); if (err != 0) return err; lock_sock(sk); sockopt_seq_inc(msk); val = READ_ONCE(inet_sk(sk)->tos); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow; slow = lock_sock_fast(ssk); __ip_sock_set_tos(ssk, val); unlock_sock_fast(ssk, slow); } release_sock(sk); return 0; } static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { switch (optname) { case IP_FREEBIND: case 
IP_TRANSPARENT: case IP_BIND_ADDRESS_NO_PORT: case IP_LOCAL_PORT_RANGE: return mptcp_setsockopt_sol_ip_set(msk, optname, optval, optlen); case IP_TOS: return mptcp_setsockopt_v4_set_tos(msk, optname, optval, optlen); } return -EOPNOTSUPP; } static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; struct sock *ssk; int ret; /* Limit to first subflow, before the connection establishment */ lock_sock(sk); ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { ret = PTR_ERR(ssk); goto unlock; } ret = tcp_setsockopt(ssk, level, optname, optval, optlen); unlock: release_sock(sk); return ret; } static int mptcp_setsockopt_all_sf(struct mptcp_sock *msk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct mptcp_subflow_context *subflow; int ret = 0; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); ret = tcp_setsockopt(ssk, level, optname, optval, optlen); if (ret) break; } return ret; } static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = (void *)msk; int ret, val; switch (optname) { case TCP_ULP: return -EOPNOTSUPP; case TCP_CONGESTION: return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen); case TCP_DEFER_ACCEPT: /* See tcp.c: TCP_DEFER_ACCEPT does not fail */ mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); return 0; case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: return mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); } ret = mptcp_get_int_option(msk, optval, optlen, &val); if (ret) return ret; lock_sock(sk); switch (optname) { case TCP_INQ: if (val < 0 || val > 1) ret = -EINVAL; else msk->recvmsg_inq = !!val; break; case TCP_NOTSENT_LOWAT: WRITE_ONCE(msk->notsent_lowat, val); mptcp_write_space(sk); break; case TCP_CORK: ret = __mptcp_setsockopt_sol_tcp_cork(msk, val); break; case TCP_NODELAY: ret = __mptcp_setsockopt_sol_tcp_nodelay(msk, val); break; case TCP_KEEPIDLE: ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPIDLE, &tcp_sock_set_keepidle_locked, &msk->keepalive_idle, val); break; case TCP_KEEPINTVL: ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPINTVL, &tcp_sock_set_keepintvl, &msk->keepalive_intvl, val); break; case TCP_KEEPCNT: ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPCNT, &tcp_sock_set_keepcnt, &msk->keepalive_cnt, val); break; case TCP_MAXSEG: msk->maxseg = val; ret = mptcp_setsockopt_all_sf(msk, SOL_TCP, optname, optval, optlen); break; default: ret = -ENOPROTOOPT; } release_sock(sk); return ret; } int mptcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct mptcp_sock *msk = mptcp_sk(sk); struct sock *ssk; pr_debug("msk=%p\n", msk); if (level == SOL_SOCKET) return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); if (!mptcp_supported_sockopt(level, optname)) return -ENOPROTOOPT; /* @@ the meaning of setsockopt() when the socket is connected and * there are multiple subflows is not yet defined. It is up to the * MPTCP-level socket to configure the subflows until the subflow * is in TCP fallback, when TCP socket options are passed through * to the one remaining subflow. 
*/ lock_sock(sk); ssk = __mptcp_tcp_fallback(msk); release_sock(sk); if (ssk) return tcp_setsockopt(ssk, level, optname, optval, optlen); if (level == SOL_IP) return mptcp_setsockopt_v4(msk, optname, optval, optlen); if (level == SOL_IPV6) return mptcp_setsockopt_v6(msk, optname, optval, optlen); if (level == SOL_TCP) return mptcp_setsockopt_sol_tcp(msk, optname, optval, optlen); return -EOPNOTSUPP; } static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = (struct sock *)msk; struct sock *ssk; int ret; lock_sock(sk); ssk = msk->first; if (ssk) goto get; ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { ret = PTR_ERR(ssk); goto out; } get: ret = tcp_getsockopt(ssk, level, optname, optval, optlen); out: release_sock(sk); return ret; } void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) { struct sock *sk = (struct sock *)msk; u32 flags = 0; bool slow; u32 now; memset(info, 0, sizeof(*info)); info->mptcpi_extra_subflows = READ_ONCE(msk->pm.extra_subflows); info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); if (inet_sk_state_load(sk) == TCP_LISTEN) return; /* The following limits only make sense for the in-kernel PM */ if (mptcp_pm_is_kernel(msk)) { info->mptcpi_limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); info->mptcpi_endp_signal_max = mptcp_pm_get_endp_signal_max(msk); info->mptcpi_limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk); info->mptcpi_endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); info->mptcpi_endp_laminar_max = mptcp_pm_get_endp_laminar_max(msk); info->mptcpi_endp_fullmesh_max = mptcp_pm_get_endp_fullmesh_max(msk); } if (__mptcp_check_fallback(msk)) flags |= MPTCP_INFO_FLAG_FALLBACK; if (READ_ONCE(msk->can_ack)) flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; info->mptcpi_flags = flags; slow = lock_sock_fast(sk); info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); info->mptcpi_token = msk->token; info->mptcpi_write_seq = msk->write_seq; info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits; info->mptcpi_bytes_sent = msk->bytes_sent; info->mptcpi_bytes_received = msk->bytes_received; info->mptcpi_bytes_retrans = msk->bytes_retrans; info->mptcpi_subflows_total = info->mptcpi_extra_subflows + __mptcp_has_initial_subflow(msk); now = tcp_jiffies32; info->mptcpi_last_data_sent = jiffies_to_msecs(now - msk->last_data_sent); info->mptcpi_last_data_recv = jiffies_to_msecs(now - msk->last_data_recv); unlock_sock_fast(sk, slow); mptcp_data_lock(sk); info->mptcpi_last_ack_recv = jiffies_to_msecs(now - msk->last_ack_recv); info->mptcpi_snd_una = msk->snd_una; info->mptcpi_rcv_nxt = msk->ack_seq; info->mptcpi_bytes_acked = msk->bytes_acked; mptcp_data_unlock(sk); } EXPORT_SYMBOL_GPL(mptcp_diag_fill_info); static int mptcp_getsockopt_info(struct mptcp_sock *msk, char __user *optval, int __user *optlen) { struct mptcp_info m_info; int len; if (get_user(len, optlen)) return -EFAULT; /* When used only to check if a fallback to TCP happened. 
*/ if (len == 0) return 0; len = min_t(unsigned int, len, sizeof(struct mptcp_info)); mptcp_diag_fill_info(msk, &m_info); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &m_info, len)) return -EFAULT; return 0; } static int mptcp_put_subflow_data(struct mptcp_subflow_data *sfd, char __user *optval, u32 copied, int __user *optlen) { u32 copylen = min_t(u32, sfd->size_subflow_data, sizeof(*sfd)); if (copied) copied += sfd->size_subflow_data; else copied = copylen; if (put_user(copied, optlen)) return -EFAULT; if (copy_to_user(optval, sfd, copylen)) return -EFAULT; return 0; } static int mptcp_get_subflow_data(struct mptcp_subflow_data *sfd, char __user *optval, int __user *optlen) { int len, copylen; if (get_user(len, optlen)) return -EFAULT; /* if mptcp_subflow_data size is changed, need to adjust * this function to deal with programs using old version. */ BUILD_BUG_ON(sizeof(*sfd) != MIN_INFO_OPTLEN_SIZE); if (len < MIN_INFO_OPTLEN_SIZE) return -EINVAL; memset(sfd, 0, sizeof(*sfd)); copylen = min_t(unsigned int, len, sizeof(*sfd)); if (copy_from_user(sfd, optval, copylen)) return -EFAULT; /* size_subflow_data is u32, but len is signed */ if (sfd->size_subflow_data > INT_MAX || sfd->size_user > INT_MAX) return -EINVAL; if (sfd->size_subflow_data < MIN_INFO_OPTLEN_SIZE || sfd->size_subflow_data > len) return -EINVAL; if (sfd->num_subflows || sfd->size_kernel) return -EINVAL; return len - sfd->size_subflow_data; } static int mptcp_getsockopt_tcpinfo(struct mptcp_sock *msk, char __user *optval, int __user *optlen) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; unsigned int sfcount = 0, copied = 0; struct mptcp_subflow_data sfd; char __user *infoptr; int len; len = mptcp_get_subflow_data(&sfd, optval, optlen); if (len < 0) return len; sfd.size_kernel = sizeof(struct tcp_info); sfd.size_user = min_t(unsigned int, sfd.size_user, sizeof(struct tcp_info)); infoptr = optval + sfd.size_subflow_data; lock_sock(sk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); ++sfcount; if (len && len >= sfd.size_user) { struct tcp_info info; tcp_get_info(ssk, &info); if (copy_to_user(infoptr, &info, sfd.size_user)) { release_sock(sk); return -EFAULT; } infoptr += sfd.size_user; copied += sfd.size_user; len -= sfd.size_user; } } release_sock(sk); sfd.num_subflows = sfcount; if (mptcp_put_subflow_data(&sfd, optval, copied, optlen)) return -EFAULT; return 0; } static void mptcp_get_sub_addrs(const struct sock *sk, struct mptcp_subflow_addrs *a) { const struct inet_sock *inet = inet_sk(sk); memset(a, 0, sizeof(*a)); if (sk->sk_family == AF_INET) { a->sin_local.sin_family = AF_INET; a->sin_local.sin_port = inet->inet_sport; a->sin_local.sin_addr.s_addr = inet->inet_rcv_saddr; if (!a->sin_local.sin_addr.s_addr) a->sin_local.sin_addr.s_addr = inet->inet_saddr; a->sin_remote.sin_family = AF_INET; a->sin_remote.sin_port = inet->inet_dport; a->sin_remote.sin_addr.s_addr = inet->inet_daddr; #if IS_ENABLED(CONFIG_IPV6) } else if (sk->sk_family == AF_INET6) { const struct ipv6_pinfo *np = inet6_sk(sk); if (WARN_ON_ONCE(!np)) return; a->sin6_local.sin6_family = AF_INET6; a->sin6_local.sin6_port = inet->inet_sport; if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) a->sin6_local.sin6_addr = np->saddr; else a->sin6_local.sin6_addr = sk->sk_v6_rcv_saddr; a->sin6_remote.sin6_family = AF_INET6; a->sin6_remote.sin6_port = inet->inet_dport; a->sin6_remote.sin6_addr = sk->sk_v6_daddr; #endif } } static int mptcp_getsockopt_subflow_addrs(struct 
mptcp_sock *msk, char __user *optval, int __user *optlen) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; unsigned int sfcount = 0, copied = 0; struct mptcp_subflow_data sfd; char __user *addrptr; int len; len = mptcp_get_subflow_data(&sfd, optval, optlen); if (len < 0) return len; sfd.size_kernel = sizeof(struct mptcp_subflow_addrs); sfd.size_user = min_t(unsigned int, sfd.size_user, sizeof(struct mptcp_subflow_addrs)); addrptr = optval + sfd.size_subflow_data; lock_sock(sk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); ++sfcount; if (len && len >= sfd.size_user) { struct mptcp_subflow_addrs a; mptcp_get_sub_addrs(ssk, &a); if (copy_to_user(addrptr, &a, sfd.size_user)) { release_sock(sk); return -EFAULT; } addrptr += sfd.size_user; copied += sfd.size_user; len -= sfd.size_user; } } release_sock(sk); sfd.num_subflows = sfcount; if (mptcp_put_subflow_data(&sfd, optval, copied, optlen)) return -EFAULT; return 0; } static int mptcp_get_full_info(struct mptcp_full_info *mfi, char __user *optval, int __user *optlen) { int len; BUILD_BUG_ON(offsetof(struct mptcp_full_info, mptcp_info) != MIN_FULL_INFO_OPTLEN_SIZE); if (get_user(len, optlen)) return -EFAULT; if (len < MIN_FULL_INFO_OPTLEN_SIZE) return -EINVAL; memset(mfi, 0, sizeof(*mfi)); if (copy_from_user(mfi, optval, MIN_FULL_INFO_OPTLEN_SIZE)) return -EFAULT; if (mfi->size_tcpinfo_kernel || mfi->size_sfinfo_kernel || mfi->num_subflows) return -EINVAL; if (mfi->size_sfinfo_user > INT_MAX || mfi->size_tcpinfo_user > INT_MAX) return -EINVAL; return len - MIN_FULL_INFO_OPTLEN_SIZE; } static int mptcp_put_full_info(struct mptcp_full_info *mfi, char __user *optval, u32 copylen, int __user *optlen) { copylen += MIN_FULL_INFO_OPTLEN_SIZE; if (put_user(copylen, optlen)) return -EFAULT; if (copy_to_user(optval, mfi, copylen)) return -EFAULT; return 0; } static int mptcp_getsockopt_full_info(struct mptcp_sock *msk, char __user *optval, int __user *optlen) { unsigned int sfcount = 0, copylen = 0; struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; void __user *tcpinfoptr, *sfinfoptr; struct mptcp_full_info mfi; int len; len = mptcp_get_full_info(&mfi, optval, optlen); if (len < 0) return len; /* don't bother filling the mptcp info if there is not enough * user-space-provided storage */ if (len > 0) { mptcp_diag_fill_info(msk, &mfi.mptcp_info); copylen += min_t(unsigned int, len, sizeof(struct mptcp_info)); } mfi.size_tcpinfo_kernel = sizeof(struct tcp_info); mfi.size_tcpinfo_user = min_t(unsigned int, mfi.size_tcpinfo_user, sizeof(struct tcp_info)); sfinfoptr = u64_to_user_ptr(mfi.subflow_info); mfi.size_sfinfo_kernel = sizeof(struct mptcp_subflow_info); mfi.size_sfinfo_user = min_t(unsigned int, mfi.size_sfinfo_user, sizeof(struct mptcp_subflow_info)); tcpinfoptr = u64_to_user_ptr(mfi.tcp_info); lock_sock(sk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct mptcp_subflow_info sfinfo; struct tcp_info tcp_info; if (sfcount++ >= mfi.size_arrays_user) continue; /* fetch addr/tcp_info only if the user space buffers * are wide enough */ memset(&sfinfo, 0, sizeof(sfinfo)); sfinfo.id = subflow->subflow_id; if (mfi.size_sfinfo_user > offsetof(struct mptcp_subflow_info, addrs)) mptcp_get_sub_addrs(ssk, &sfinfo.addrs); if (copy_to_user(sfinfoptr, &sfinfo, mfi.size_sfinfo_user)) goto fail_release; if (mfi.size_tcpinfo_user) { tcp_get_info(ssk, &tcp_info); if (copy_to_user(tcpinfoptr, &tcp_info, 
mfi.size_tcpinfo_user)) goto fail_release; } tcpinfoptr += mfi.size_tcpinfo_user; sfinfoptr += mfi.size_sfinfo_user; } release_sock(sk); mfi.num_subflows = sfcount; if (mptcp_put_full_info(&mfi, optval, copylen, optlen)) return -EFAULT; return 0; fail_release: release_sock(sk); return -EFAULT; } static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval, int __user *optlen, int val) { int len; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) { unsigned char ucval = (unsigned char)val; len = 1; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &ucval, 1)) return -EFAULT; } else { len = min_t(unsigned int, len, sizeof(int)); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; } return 0; } static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, char __user *optval, int __user *optlen) { struct sock *sk = (void *)msk; switch (optname) { case TCP_ULP: case TCP_CONGESTION: case TCP_INFO: case TCP_CC_INFO: case TCP_DEFER_ACCEPT: case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); case TCP_INQ: return mptcp_put_int_option(msk, optval, optlen, msk->recvmsg_inq); case TCP_CORK: return mptcp_put_int_option(msk, optval, optlen, msk->cork); case TCP_NODELAY: return mptcp_put_int_option(msk, optval, optlen, msk->nodelay); case TCP_KEEPIDLE: return mptcp_put_int_option(msk, optval, optlen, msk->keepalive_idle ? : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_time) / HZ); case TCP_KEEPINTVL: return mptcp_put_int_option(msk, optval, optlen, msk->keepalive_intvl ? : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_intvl) / HZ); case TCP_KEEPCNT: return mptcp_put_int_option(msk, optval, optlen, msk->keepalive_cnt ? 
: READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_probes)); case TCP_NOTSENT_LOWAT: return mptcp_put_int_option(msk, optval, optlen, msk->notsent_lowat); case TCP_IS_MPTCP: return mptcp_put_int_option(msk, optval, optlen, 1); case TCP_MAXSEG: return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); } return -EOPNOTSUPP; } static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname, char __user *optval, int __user *optlen) { struct sock *sk = (void *)msk; switch (optname) { case IP_TOS: return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->tos)); case IP_FREEBIND: return mptcp_put_int_option(msk, optval, optlen, inet_test_bit(FREEBIND, sk)); case IP_TRANSPARENT: return mptcp_put_int_option(msk, optval, optlen, inet_test_bit(TRANSPARENT, sk)); case IP_BIND_ADDRESS_NO_PORT: return mptcp_put_int_option(msk, optval, optlen, inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); case IP_LOCAL_PORT_RANGE: return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->local_port_range)); } return -EOPNOTSUPP; } static int mptcp_getsockopt_v6(struct mptcp_sock *msk, int optname, char __user *optval, int __user *optlen) { struct sock *sk = (void *)msk; switch (optname) { case IPV6_V6ONLY: return mptcp_put_int_option(msk, optval, optlen, sk->sk_ipv6only); case IPV6_TRANSPARENT: return mptcp_put_int_option(msk, optval, optlen, inet_test_bit(TRANSPARENT, sk)); case IPV6_FREEBIND: return mptcp_put_int_option(msk, optval, optlen, inet_test_bit(FREEBIND, sk)); } return -EOPNOTSUPP; } static int mptcp_getsockopt_sol_mptcp(struct mptcp_sock *msk, int optname, char __user *optval, int __user *optlen) { switch (optname) { case MPTCP_INFO: return mptcp_getsockopt_info(msk, optval, optlen); case MPTCP_FULL_INFO: return mptcp_getsockopt_full_info(msk, optval, optlen); case MPTCP_TCPINFO: return mptcp_getsockopt_tcpinfo(msk, optval, optlen); case MPTCP_SUBFLOW_ADDRS: return mptcp_getsockopt_subflow_addrs(msk, optval, optlen); } return -EOPNOTSUPP; } int mptcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option) { struct mptcp_sock *msk = mptcp_sk(sk); struct sock *ssk; pr_debug("msk=%p\n", msk); /* @@ the meaning of setsockopt() when the socket is connected and * there are multiple subflows is not yet defined. It is up to the * MPTCP-level socket to configure the subflows until the subflow * is in TCP fallback, when socket options are passed through * to the one remaining subflow. 
*/ lock_sock(sk); ssk = __mptcp_tcp_fallback(msk); release_sock(sk); if (ssk) return tcp_getsockopt(ssk, level, optname, optval, option); if (level == SOL_IP) return mptcp_getsockopt_v4(msk, optname, optval, option); if (level == SOL_IPV6) return mptcp_getsockopt_v6(msk, optname, optval, option); if (level == SOL_TCP) return mptcp_getsockopt_sol_tcp(msk, optname, optval, option); if (level == SOL_MPTCP) return mptcp_getsockopt_sol_mptcp(msk, optname, optval, option); return -EOPNOTSUPP; } static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) { static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK; struct sock *sk = (struct sock *)msk; bool keep_open; keep_open = sock_flag(sk, SOCK_KEEPOPEN); if (ssk->sk_prot->keepalive) ssk->sk_prot->keepalive(ssk, keep_open); sock_valbool_flag(ssk, SOCK_KEEPOPEN, keep_open); ssk->sk_priority = sk->sk_priority; ssk->sk_bound_dev_if = sk->sk_bound_dev_if; ssk->sk_incoming_cpu = sk->sk_incoming_cpu; ssk->sk_ipv6only = sk->sk_ipv6only; __ip_sock_set_tos(ssk, inet_sk(sk)->tos); if (sk->sk_userlocks & tx_rx_locks) { ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks; if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) { WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf); mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf; } if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf); } if (sock_flag(sk, SOCK_LINGER)) { ssk->sk_lingertime = sk->sk_lingertime; sock_set_flag(ssk, SOCK_LINGER); } else { sock_reset_flag(ssk, SOCK_LINGER); } if (sk->sk_mark != ssk->sk_mark) { ssk->sk_mark = sk->sk_mark; sk_dst_reset(ssk); } sock_valbool_flag(ssk, SOCK_DBG, sock_flag(sk, SOCK_DBG)); if (inet_csk(sk)->icsk_ca_ops != inet_csk(ssk)->icsk_ca_ops) tcp_set_congestion_control(ssk, msk->ca_name, false, true); __tcp_sock_set_cork(ssk, !!msk->cork); __tcp_sock_set_nodelay(ssk, !!msk->nodelay); tcp_sock_set_keepidle_locked(ssk, msk->keepalive_idle); tcp_sock_set_keepintvl(ssk, msk->keepalive_intvl); tcp_sock_set_keepcnt(ssk, msk->keepalive_cnt); tcp_sock_set_maxseg(ssk, msk->maxseg); inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk)); inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); WRITE_ONCE(inet_sk(ssk)->local_port_range, READ_ONCE(inet_sk(sk)->local_port_range)); } void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); msk_owned_by_me(msk); ssk->sk_rcvlowat = 0; /* subflows must ignore any latency-related settings: will not affect * the user-space - only the msk is relevant - but will foul the * mptcp scheduler */ tcp_sk(ssk)->notsent_lowat = UINT_MAX; if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) { sync_socket_options(msk, ssk); subflow->setsockopt_seq = msk->setsockopt_seq; } } /* unfortunately this is different enough from the tcp version so * that we can't factor it out */ int mptcp_set_rcvlowat(struct sock *sk, int val) { struct mptcp_subflow_context *subflow; int space, cap; /* bpf can land here with a wrong sk type */ if (sk->sk_protocol == IPPROTO_TCP) return -EINVAL; if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; else cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; val = min(val, cap); WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); /* Check if we need to signal EPOLLIN right now */ if (mptcp_epollin_ready(sk)) sk->sk_data_ready(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; space = mptcp_space_from_win(sk, val); if (space <= sk->sk_rcvbuf) return 0; /* propagate the rcvbuf changes to all the subflows */ WRITE_ONCE(sk->sk_rcvbuf, space); mptcp_for_each_subflow(mptcp_sk(sk), subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow; slow = lock_sock_fast(ssk); WRITE_ONCE(ssk->sk_rcvbuf, space); WRITE_ONCE(tcp_sk(ssk)->window_clamp, val); unlock_sock_fast(ssk, slow); } return 0; }
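/*
 * Hedged userspace sketch, not part of the kernel file above: it exercises a
 * couple of the getsockopt() paths implemented by mptcp_getsockopt().  The
 * fallback #defines are assumptions for libcs that do not ship the MPTCP
 * constants yet; the authoritative values live in the UAPI headers.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/mptcp.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP	262
#endif
#ifndef SOL_MPTCP
#define SOL_MPTCP	284
#endif
#ifndef TCP_IS_MPTCP
#define TCP_IS_MPTCP	43
#endif

int main(void)
{
	struct mptcp_info info;
	socklen_t len;
	int fd, val;

	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
	if (fd < 0) {
		perror("socket");	/* e.g. kernel built without MPTCP */
		return 1;
	}

	/* served by mptcp_getsockopt_sol_tcp(): always reports 1 on an msk */
	len = sizeof(val);
	if (getsockopt(fd, IPPROTO_TCP, TCP_IS_MPTCP, &val, &len) == 0)
		printf("TCP_IS_MPTCP = %d\n", val);

	/* served by mptcp_getsockopt_info() -> mptcp_diag_fill_info() */
	memset(&info, 0, sizeof(info));
	len = sizeof(info);
	if (getsockopt(fd, SOL_MPTCP, MPTCP_INFO, &info, &len) == 0)
		printf("token 0x%x, fallback: %s\n", info.mptcpi_token,
		       (info.mptcpi_flags & MPTCP_INFO_FLAG_FALLBACK) ? "yes" : "no");
	else
		perror("getsockopt(MPTCP_INFO)");

	close(fd);
	return 0;
}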
/* SPDX-License-Identifier: GPL-2.0 */ /* * Header for use in defining a given L4 protocol for connection tracking. * * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * - generalized L3 protocol dependent part. * * Derived from include/linux/netfilter_ipv4/ip_conntrack_protocol.h */ #ifndef _NF_CONNTRACK_L4PROTO_H #define _NF_CONNTRACK_L4PROTO_H #include <linux/netlink.h> #include <net/netlink.h> #include <net/netfilter/nf_conntrack.h> #include <net/netns/generic.h> struct seq_file; struct nf_conntrack_l4proto { /* L4 Protocol number. */ u_int8_t l4proto; /* Resolve clashes on insertion races. */ bool allow_clash; /* protoinfo nlattr size, closes a hole */ u16 nlattr_size; /* called by gc worker if table is full */ bool (*can_early_drop)(const struct nf_conn *ct); /* convert protoinfo to nfnetlink attributes */ int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy); /* convert nfnetlink attributes to protoinfo */ int (*from_nlattr)(struct nlattr *tb[], struct nf_conn *ct); int (*tuple_to_nlattr)(struct sk_buff *skb, const struct nf_conntrack_tuple *t); /* Calculate tuple nlattr size */ unsigned int (*nlattr_tuple_size)(void); int (*nlattr_to_tuple)(struct nlattr *tb[], struct nf_conntrack_tuple *t, u_int32_t flags); const struct nla_policy *nla_policy; struct { int (*nlattr_to_obj)(struct nlattr *tb[], struct net *net, void *data); int (*obj_to_nlattr)(struct sk_buff *skb, const void *data); u16 obj_size; u16 nlattr_max; const struct nla_policy *nla_policy; } ctnl_timeout; #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack.
*/ void (*print_conntrack)(struct seq_file *s, struct nf_conn *); #endif }; bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple); bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple); bool nf_conntrack_invert_icmp_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig); bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig); int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state, u8 l4proto, union nf_inet_addr *outer_daddr); int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state); int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state); int nf_conntrack_icmp_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_icmpv6_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_udp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_udplite_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_tcp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_sctp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_gre_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); void nf_conntrack_generic_init_net(struct net *net); void nf_conntrack_tcp_init_net(struct net *net); void nf_conntrack_udp_init_net(struct net *net); void nf_conntrack_gre_init_net(struct net *net); void nf_conntrack_sctp_init_net(struct net *net); void nf_conntrack_icmp_init_net(struct net *net); void nf_conntrack_icmpv6_init_net(struct net *net); /* Existing built-in generic protocol */ extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic; #define MAX_NF_CT_PROTO IPPROTO_UDPLITE const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto); /* Generic netlink helpers */ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t, u_int32_t flags); unsigned int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[]; #ifdef CONFIG_SYSCTL __printf(4, 5) __cold void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, const struct nf_hook_state *state, const char *fmt, ...); __printf(4, 5) __cold void nf_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_hook_state *state, u8 protonum, const char *fmt, ...); #else static inline __printf(4, 5) __cold void nf_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_hook_state *state, u8 protonum, const char *fmt, ...) 
{} static inline __printf(4, 5) __cold void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, const struct nf_hook_state *state, const char *fmt, ...) { } #endif /* CONFIG_SYSCTL */ #if IS_ENABLED(CONFIG_NF_CONNTRACK) static inline struct nf_generic_net *nf_generic_pernet(struct net *net) { return &net->ct.nf_ct_proto.generic; } static inline struct nf_tcp_net *nf_tcp_pernet(struct net *net) { return &net->ct.nf_ct_proto.tcp; } static inline struct nf_udp_net *nf_udp_pernet(struct net *net) { return &net->ct.nf_ct_proto.udp; } static inline struct nf_icmp_net *nf_icmp_pernet(struct net *net) { return &net->ct.nf_ct_proto.icmp; } static inline struct nf_icmp_net *nf_icmpv6_pernet(struct net *net) { return &net->ct.nf_ct_proto.icmpv6; } /* Caller must check nf_ct_protonum(ct) is IPPROTO_TCP before calling. */ static inline void nf_ct_set_tcp_be_liberal(struct nf_conn *ct) { ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; } /* Caller must check nf_ct_protonum(ct) is IPPROTO_TCP before calling. */ static inline bool nf_conntrack_tcp_established(const struct nf_conn *ct) { return ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED && test_bit(IPS_ASSURED_BIT, &ct->status); } #endif #ifdef CONFIG_NF_CT_PROTO_SCTP static inline struct nf_sctp_net *nf_sctp_pernet(struct net *net) { return &net->ct.nf_ct_proto.sctp; } #endif #ifdef CONFIG_NF_CT_PROTO_GRE static inline struct nf_gre_net *nf_gre_pernet(struct net *net) { return &net->ct.nf_ct_proto.gre; } #endif #endif /*_NF_CONNTRACK_PROTOCOL_H*/
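/*
 * Hedged sketch, not part of the header above: how a tracker would typically
 * fill in struct nf_conntrack_l4proto using the generic port-based netlink
 * helpers declared here.  The "example_*" names and the experimental protocol
 * number are hypothetical; modern kernels ship a fixed set of built-in
 * trackers looked up via nf_ct_l4proto_find(), so this is illustrative only.
 */
#include <linux/bitops.h>
#include <net/netfilter/nf_conntrack_l4proto.h>

static bool example_can_early_drop(const struct nf_conn *ct)
{
	/* hypothetical policy: let the gc worker drop unassured entries */
	return !test_bit(IPS_ASSURED_BIT, &ct->status);
}

static const struct nf_conntrack_l4proto nf_conntrack_l4proto_example = {
	.l4proto		= 253,	/* IANA "use for experimentation" */
	.allow_clash		= true,
	.can_early_drop		= example_can_early_drop,
	/* reuse the generic source/destination port attribute helpers */
	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
	.nla_policy		= nf_ct_port_nla_policy,
};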
// SPDX-License-Identifier: GPL-2.0 /* * kernel userspace event delivery * * Copyright (C) 2004 Red
Hat, Inc. All rights reserved. * Copyright (C) 2004 Novell, Inc. All rights reserved. * Copyright (C) 2004 IBM, Inc. All rights reserved. * * Authors: * Robert Love <rml@novell.com> * Kay Sievers <kay.sievers@vrfy.org> * Arjan van de Ven <arjanv@redhat.com> * Greg Kroah-Hartman <greg@kroah.com> */ #include <linux/spinlock.h> #include <linux/string.h> #include <linux/kobject.h> #include <linux/export.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/uidgid.h> #include <linux/uuid.h> #include <linux/ctype.h> #include <net/sock.h> #include <net/netlink.h> #include <net/net_namespace.h> atomic64_t uevent_seqnum; #ifdef CONFIG_UEVENT_HELPER char uevent_helper[UEVENT_HELPER_PATH_LEN] = CONFIG_UEVENT_HELPER_PATH; #endif struct uevent_sock { struct list_head list; struct sock *sk; }; #ifdef CONFIG_NET static LIST_HEAD(uevent_sock_list); /* This lock protects uevent_sock_list */ static DEFINE_MUTEX(uevent_sock_mutex); #endif /* the strings here must match the enum in include/linux/kobject.h */ static const char *kobject_actions[] = { [KOBJ_ADD] = "add", [KOBJ_REMOVE] = "remove", [KOBJ_CHANGE] = "change", [KOBJ_MOVE] = "move", [KOBJ_ONLINE] = "online", [KOBJ_OFFLINE] = "offline", [KOBJ_BIND] = "bind", [KOBJ_UNBIND] = "unbind", }; static int kobject_action_type(const char *buf, size_t count, enum kobject_action *type, const char **args) { enum kobject_action action; size_t count_first; const char *args_start; int ret = -EINVAL; if (count && (buf[count-1] == '\n' || buf[count-1] == '\0')) count--; if (!count) goto out; args_start = strnchr(buf, count, ' '); if (args_start) { count_first = args_start - buf; args_start = args_start + 1; } else count_first = count; for (action = 0; action < ARRAY_SIZE(kobject_actions); action++) { if (strncmp(kobject_actions[action], buf, count_first) != 0) continue; if (kobject_actions[action][count_first] != '\0') continue; if (args) *args = args_start; *type = action; ret = 0; break; } out: return ret; } static const char *action_arg_word_end(const char *buf, const char *buf_end, char delim) { const char *next = buf; while (next <= buf_end && *next != delim) if (!isalnum(*next++)) return NULL; if (next == buf) return NULL; return next; } static int kobject_action_args(const char *buf, size_t count, struct kobj_uevent_env **ret_env) { struct kobj_uevent_env *env = NULL; const char *next, *buf_end, *key; int key_len; int r = -EINVAL; if (count && (buf[count - 1] == '\n' || buf[count - 1] == '\0')) count--; if (!count) return -EINVAL; env = kzalloc(sizeof(*env), GFP_KERNEL); if (!env) return -ENOMEM; /* first arg is UUID */ if (count < UUID_STRING_LEN || !uuid_is_valid(buf) || add_uevent_var(env, "SYNTH_UUID=%.*s", UUID_STRING_LEN, buf)) goto out; /* * the rest are custom environment variables in KEY=VALUE * format with ' ' delimiter between each KEY=VALUE pair */ next = buf + UUID_STRING_LEN; buf_end = buf + count - 1; while (next <= buf_end) { if (*next != ' ') goto out; /* skip the ' ', key must follow */ key = ++next; if (key > buf_end) goto out; buf = next; next = action_arg_word_end(buf, buf_end, '='); if (!next || next > buf_end || *next != '=') goto out; key_len = next - buf; /* skip the '=', value must follow */ if (++next > buf_end) goto out; buf = next; next = action_arg_word_end(buf, buf_end, ' '); if (!next) goto out; if (add_uevent_var(env, "SYNTH_ARG_%.*s=%.*s", key_len, key, (int) (next - buf), buf)) goto out; } r = 0; out: if (r) kfree(env); else *ret_env = 
env; return r; } /** * kobject_synth_uevent - send synthetic uevent with arguments * * @kobj: struct kobject for which synthetic uevent is to be generated * @buf: buffer containing action type and action args, newline is ignored * @count: length of buffer * * Returns 0 if kobject_synthetic_uevent() is completed with success or the * corresponding error when it fails. */ int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count) { char *no_uuid_envp[] = { "SYNTH_UUID=0", NULL }; enum kobject_action action; const char *action_args; struct kobj_uevent_env *env; const char *msg = NULL, *devpath; int r; r = kobject_action_type(buf, count, &action, &action_args); if (r) { msg = "unknown uevent action string"; goto out; } if (!action_args) { r = kobject_uevent_env(kobj, action, no_uuid_envp); goto out; } r = kobject_action_args(action_args, count - (action_args - buf), &env); if (r == -EINVAL) { msg = "incorrect uevent action arguments"; goto out; } if (r) goto out; r = kobject_uevent_env(kobj, action, env->envp); kfree(env); out: if (r) { devpath = kobject_get_path(kobj, GFP_KERNEL); pr_warn("synth uevent: %s: %s\n", devpath ?: "unknown device", msg ?: "failed to send uevent"); kfree(devpath); } return r; } #ifdef CONFIG_UEVENT_HELPER static int kobj_usermode_filter(struct kobject *kobj) { const struct kobj_ns_type_operations *ops; ops = kobj_ns_ops(kobj); if (ops) { const void *init_ns, *ns; ns = kobj->ktype->namespace(kobj); init_ns = ops->initial_ns(); return ns != init_ns; } return 0; } static int init_uevent_argv(struct kobj_uevent_env *env, const char *subsystem) { int buffer_size = sizeof(env->buf) - env->buflen; int len; len = strscpy(&env->buf[env->buflen], subsystem, buffer_size); if (len < 0) { pr_warn("%s: insufficient buffer space (%u left) for %s\n", __func__, buffer_size, subsystem); return -ENOMEM; } env->argv[0] = uevent_helper; env->argv[1] = &env->buf[env->buflen]; env->argv[2] = NULL; env->buflen += len + 1; return 0; } static void cleanup_uevent_env(struct subprocess_info *info) { kfree(info->data); } #endif #ifdef CONFIG_NET static struct sk_buff *alloc_uevent_skb(struct kobj_uevent_env *env, const char *action_string, const char *devpath) { struct netlink_skb_parms *parms; struct sk_buff *skb = NULL; char *scratch; size_t len; /* allocate message with maximum possible size */ len = strlen(action_string) + strlen(devpath) + 2; skb = alloc_skb(len + env->buflen, GFP_KERNEL); if (!skb) return NULL; /* add header */ scratch = skb_put(skb, len); sprintf(scratch, "%s@%s", action_string, devpath); skb_put_data(skb, env->buf, env->buflen); parms = &NETLINK_CB(skb); parms->creds.uid = GLOBAL_ROOT_UID; parms->creds.gid = GLOBAL_ROOT_GID; parms->dst_group = 1; parms->portid = 0; return skb; } static int uevent_net_broadcast_untagged(struct kobj_uevent_env *env, const char *action_string, const char *devpath) { struct sk_buff *skb = NULL; struct uevent_sock *ue_sk; int retval = 0; /* send netlink message */ mutex_lock(&uevent_sock_mutex); list_for_each_entry(ue_sk, &uevent_sock_list, list) { struct sock *uevent_sock = ue_sk->sk; if (!netlink_has_listeners(uevent_sock, 1)) continue; if (!skb) { retval = -ENOMEM; skb = alloc_uevent_skb(env, action_string, devpath); if (!skb) continue; } retval = netlink_broadcast(uevent_sock, skb_get(skb), 0, 1, GFP_KERNEL); /* ENOBUFS should be handled in userspace */ if (retval == -ENOBUFS || retval == -ESRCH) retval = 0; } mutex_unlock(&uevent_sock_mutex); consume_skb(skb); return retval; } static int 
uevent_net_broadcast_tagged(struct sock *usk, struct kobj_uevent_env *env, const char *action_string, const char *devpath) { struct user_namespace *owning_user_ns = sock_net(usk)->user_ns; struct sk_buff *skb = NULL; int ret = 0; skb = alloc_uevent_skb(env, action_string, devpath); if (!skb) return -ENOMEM; /* fix credentials */ if (owning_user_ns != &init_user_ns) { struct netlink_skb_parms *parms = &NETLINK_CB(skb); kuid_t root_uid; kgid_t root_gid; /* fix uid */ root_uid = make_kuid(owning_user_ns, 0); if (uid_valid(root_uid)) parms->creds.uid = root_uid; /* fix gid */ root_gid = make_kgid(owning_user_ns, 0); if (gid_valid(root_gid)) parms->creds.gid = root_gid; } ret = netlink_broadcast(usk, skb, 0, 1, GFP_KERNEL); /* ENOBUFS should be handled in userspace */ if (ret == -ENOBUFS || ret == -ESRCH) ret = 0; return ret; } #endif static int kobject_uevent_net_broadcast(struct kobject *kobj, struct kobj_uevent_env *env, const char *action_string, const char *devpath) { int ret = 0; #ifdef CONFIG_NET const struct kobj_ns_type_operations *ops; const struct net *net = NULL; ops = kobj_ns_ops(kobj); if (!ops && kobj->kset) { struct kobject *ksobj = &kobj->kset->kobj; if (ksobj->parent != NULL) ops = kobj_ns_ops(ksobj->parent); } /* kobjects currently only carry network namespace tags and they * are the only tag relevant here since we want to decide which * network namespaces to broadcast the uevent into. */ if (ops && ops->netlink_ns && kobj->ktype->namespace) if (ops->type == KOBJ_NS_TYPE_NET) net = kobj->ktype->namespace(kobj); if (!net) ret = uevent_net_broadcast_untagged(env, action_string, devpath); else ret = uevent_net_broadcast_tagged(net->uevent_sock->sk, env, action_string, devpath); #endif return ret; } static void zap_modalias_env(struct kobj_uevent_env *env) { static const char modalias_prefix[] = "MODALIAS="; size_t len; int i, j; for (i = 0; i < env->envp_idx;) { if (strncmp(env->envp[i], modalias_prefix, sizeof(modalias_prefix) - 1)) { i++; continue; } len = strlen(env->envp[i]) + 1; if (i != env->envp_idx - 1) { /* @env->envp[] contains pointers to @env->buf[] * with @env->buflen chars, and we are removing * variable MODALIAS here pointed by @env->envp[i] * with length @len as shown below: * * 0 @env->buf[] @env->buflen * --------------------------------------------- * ^ ^ ^ ^ * | |-> @len <-| target block | * @env->envp[0] @env->envp[i] @env->envp[i + 1] * * so the "target block" indicated above is moved * backward by @len, and its right size is * @env->buflen - (@env->envp[i + 1] - @env->envp[0]). */ memmove(env->envp[i], env->envp[i + 1], env->buflen - (env->envp[i + 1] - env->envp[0])); for (j = i; j < env->envp_idx - 1; j++) env->envp[j] = env->envp[j + 1] - len; } env->envp_idx--; env->buflen -= len; } } /** * kobject_uevent_env - send an uevent with environmental data * * @kobj: struct kobject that the action is happening to * @action: action that is happening * @envp_ext: pointer to environmental data * * Returns 0 if kobject_uevent_env() is completed with success or the * corresponding error when it fails. 
*/ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, char *envp_ext[]) { struct kobj_uevent_env *env; const char *action_string = kobject_actions[action]; const char *devpath = NULL; const char *subsystem; struct kobject *top_kobj; struct kset *kset; const struct kset_uevent_ops *uevent_ops; int i = 0; int retval = 0; /* * Mark "remove" event done regardless of result, for some subsystems * do not want to re-trigger "remove" event via automatic cleanup. */ if (action == KOBJ_REMOVE) kobj->state_remove_uevent_sent = 1; pr_debug("kobject: '%s' (%p): %s\n", kobject_name(kobj), kobj, __func__); /* search the kset we belong to */ top_kobj = kobj; while (!top_kobj->kset && top_kobj->parent) top_kobj = top_kobj->parent; if (!top_kobj->kset) { pr_debug("kobject: '%s' (%p): %s: attempted to send uevent " "without kset!\n", kobject_name(kobj), kobj, __func__); return -EINVAL; } kset = top_kobj->kset; uevent_ops = kset->uevent_ops; /* skip the event, if uevent_suppress is set*/ if (kobj->uevent_suppress) { pr_debug("kobject: '%s' (%p): %s: uevent_suppress " "caused the event to drop!\n", kobject_name(kobj), kobj, __func__); return 0; } /* skip the event, if the filter returns zero. */ if (uevent_ops && uevent_ops->filter) if (!uevent_ops->filter(kobj)) { pr_debug("kobject: '%s' (%p): %s: filter function " "caused the event to drop!\n", kobject_name(kobj), kobj, __func__); return 0; } /* originating subsystem */ if (uevent_ops && uevent_ops->name) subsystem = uevent_ops->name(kobj); else subsystem = kobject_name(&kset->kobj); if (!subsystem) { pr_debug("kobject: '%s' (%p): %s: unset subsystem caused the " "event to drop!\n", kobject_name(kobj), kobj, __func__); return 0; } /* environment buffer */ env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); if (!env) return -ENOMEM; /* complete object path */ devpath = kobject_get_path(kobj, GFP_KERNEL); if (!devpath) { retval = -ENOENT; goto exit; } /* default keys */ retval = add_uevent_var(env, "ACTION=%s", action_string); if (retval) goto exit; retval = add_uevent_var(env, "DEVPATH=%s", devpath); if (retval) goto exit; retval = add_uevent_var(env, "SUBSYSTEM=%s", subsystem); if (retval) goto exit; /* keys passed in from the caller */ if (envp_ext) { for (i = 0; envp_ext[i]; i++) { retval = add_uevent_var(env, "%s", envp_ext[i]); if (retval) goto exit; } } /* let the kset specific function add its stuff */ if (uevent_ops && uevent_ops->uevent) { retval = uevent_ops->uevent(kobj, env); if (retval) { pr_debug("kobject: '%s' (%p): %s: uevent() returned " "%d\n", kobject_name(kobj), kobj, __func__, retval); goto exit; } } switch (action) { case KOBJ_ADD: /* * Mark "add" event so we can make sure we deliver "remove" * event to userspace during automatic cleanup. If * the object did send an "add" event, "remove" will * automatically generated by the core, if not already done * by the caller. 
*/ kobj->state_add_uevent_sent = 1; break; case KOBJ_UNBIND: zap_modalias_env(env); break; default: break; } /* we will send an event, so request a new sequence number */ retval = add_uevent_var(env, "SEQNUM=%llu", atomic64_inc_return(&uevent_seqnum)); if (retval) goto exit; retval = kobject_uevent_net_broadcast(kobj, env, action_string, devpath); #ifdef CONFIG_UEVENT_HELPER /* call uevent_helper, usually only enabled during early boot */ if (uevent_helper[0] && !kobj_usermode_filter(kobj)) { struct subprocess_info *info; retval = add_uevent_var(env, "HOME=/"); if (retval) goto exit; retval = add_uevent_var(env, "PATH=/sbin:/bin:/usr/sbin:/usr/bin"); if (retval) goto exit; retval = init_uevent_argv(env, subsystem); if (retval) goto exit; retval = -ENOMEM; info = call_usermodehelper_setup(env->argv[0], env->argv, env->envp, GFP_KERNEL, NULL, cleanup_uevent_env, env); if (info) { retval = call_usermodehelper_exec(info, UMH_NO_WAIT); env = NULL; /* freed by cleanup_uevent_env */ } } #endif exit: kfree(devpath); kfree(env); return retval; } EXPORT_SYMBOL_GPL(kobject_uevent_env); /** * kobject_uevent - notify userspace by sending an uevent * * @kobj: struct kobject that the action is happening to * @action: action that is happening * * Returns 0 if kobject_uevent() is completed with success or the * corresponding error when it fails. */ int kobject_uevent(struct kobject *kobj, enum kobject_action action) { return kobject_uevent_env(kobj, action, NULL); } EXPORT_SYMBOL_GPL(kobject_uevent); /** * add_uevent_var - add key value string to the environment buffer * @env: environment buffer structure * @format: printf format for the key=value pair * * Returns 0 if environment variable was added successfully or -ENOMEM * if no space was available. */ int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...) 
{ va_list args; int len; if (env->envp_idx >= ARRAY_SIZE(env->envp)) { WARN(1, KERN_ERR "add_uevent_var: too many keys\n"); return -ENOMEM; } va_start(args, format); len = vsnprintf(&env->buf[env->buflen], sizeof(env->buf) - env->buflen, format, args); va_end(args); if (len >= (sizeof(env->buf) - env->buflen)) { WARN(1, KERN_ERR "add_uevent_var: buffer size too small\n"); return -ENOMEM; } env->envp[env->envp_idx++] = &env->buf[env->buflen]; env->buflen += len + 1; return 0; } EXPORT_SYMBOL_GPL(add_uevent_var); #if defined(CONFIG_NET) static int uevent_net_broadcast(struct sock *usk, struct sk_buff *skb, struct netlink_ext_ack *extack) { /* u64 to chars: 2^64 - 1 = 21 chars */ char buf[sizeof("SEQNUM=") + 21]; struct sk_buff *skbc; int ret; /* bump and prepare sequence number */ ret = snprintf(buf, sizeof(buf), "SEQNUM=%llu", atomic64_inc_return(&uevent_seqnum)); if (ret < 0 || (size_t)ret >= sizeof(buf)) return -ENOMEM; ret++; /* verify message does not overflow */ if ((skb->len + ret) > UEVENT_BUFFER_SIZE) { NL_SET_ERR_MSG(extack, "uevent message too big"); return -EINVAL; } /* copy skb and extend to accommodate sequence number */ skbc = skb_copy_expand(skb, 0, ret, GFP_KERNEL); if (!skbc) return -ENOMEM; /* append sequence number */ skb_put_data(skbc, buf, ret); /* remove msg header */ skb_pull(skbc, NLMSG_HDRLEN); /* set portid 0 to inform userspace message comes from kernel */ NETLINK_CB(skbc).portid = 0; NETLINK_CB(skbc).dst_group = 1; ret = netlink_broadcast(usk, skbc, 0, 1, GFP_KERNEL); /* ENOBUFS should be handled in userspace */ if (ret == -ENOBUFS || ret == -ESRCH) ret = 0; return ret; } static int uevent_net_rcv_skb(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net; int ret; if (!nlmsg_data(nlh)) return -EINVAL; /* * Verify that we are allowed to send messages to the target * network namespace. The caller must have CAP_SYS_ADMIN in the * owning user namespace of the target network namespace. */ net = sock_net(NETLINK_CB(skb).sk); if (!netlink_ns_capable(skb, net->user_ns, CAP_SYS_ADMIN)) { NL_SET_ERR_MSG(extack, "missing CAP_SYS_ADMIN capability"); return -EPERM; } ret = uevent_net_broadcast(net->uevent_sock->sk, skb, extack); return ret; } static void uevent_net_rcv(struct sk_buff *skb) { netlink_rcv_skb(skb, &uevent_net_rcv_skb); } static int uevent_net_init(struct net *net) { struct uevent_sock *ue_sk; struct netlink_kernel_cfg cfg = { .groups = 1, .input = uevent_net_rcv, .flags = NL_CFG_F_NONROOT_RECV }; ue_sk = kzalloc(sizeof(*ue_sk), GFP_KERNEL); if (!ue_sk) return -ENOMEM; ue_sk->sk = netlink_kernel_create(net, NETLINK_KOBJECT_UEVENT, &cfg); if (!ue_sk->sk) { pr_err("kobject_uevent: unable to create netlink socket!\n"); kfree(ue_sk); return -ENODEV; } net->uevent_sock = ue_sk; /* Restrict uevents to initial user namespace. 
*/ if (sock_net(ue_sk->sk)->user_ns == &init_user_ns) { mutex_lock(&uevent_sock_mutex); list_add_tail(&ue_sk->list, &uevent_sock_list); mutex_unlock(&uevent_sock_mutex); } return 0; } static void uevent_net_exit(struct net *net) { struct uevent_sock *ue_sk = net->uevent_sock; if (sock_net(ue_sk->sk)->user_ns == &init_user_ns) { mutex_lock(&uevent_sock_mutex); list_del(&ue_sk->list); mutex_unlock(&uevent_sock_mutex); } netlink_kernel_release(ue_sk->sk); kfree(ue_sk); } static struct pernet_operations uevent_net_ops = { .init = uevent_net_init, .exit = uevent_net_exit, }; static int __init kobject_uevent_init(void) { return register_pernet_subsys(&uevent_net_ops); } postcore_initcall(kobject_uevent_init); #endif #ifdef CONFIG_UEVENT_HELPER static const struct ctl_table uevent_helper_sysctl_table[] = { { .procname = "hotplug", .data = &uevent_helper, .maxlen = UEVENT_HELPER_PATH_LEN, .mode = 0644, .proc_handler = proc_dostring, }, }; static int __init init_uevent_helper_sysctl(void) { register_sysctl_init("kernel", uevent_helper_sysctl_table); return 0; } postcore_initcall(init_uevent_helper_sysctl); #endif
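/*
 * Hedged sketch, not part of kobject_uevent.c: how kernel code typically
 * consumes the API implemented above.  "example_notify_state" and its kobject
 * are hypothetical; only kobject_uevent_env() and KOBJ_CHANGE are real.
 */
#include <linux/kernel.h>
#include <linux/kobject.h>

static int example_notify_state(struct kobject *kobj, int new_state)
{
	char state_var[32];
	char *envp[] = { state_var, NULL };

	/* extra KEY=VALUE pairs ride along with ACTION/DEVPATH/SUBSYSTEM */
	snprintf(state_var, sizeof(state_var), "EXAMPLE_STATE=%d", new_state);

	/* ends up in kobject_uevent_env() above: netlink broadcast plus,
	 * when enabled, the uevent helper */
	return kobject_uevent_env(kobj, KOBJ_CHANGE, envp);
}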
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Generic address resolution entity
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *	Alexey Kuznetsov	<kuznet@ms2.inr.ac.ru>
 *
 *	Fixes:
 *	Vitaly E. Lavrov	releasing NULL neighbor in neigh_add.
 *	Harald Welte		Add neighbour cache statistics like rtstat
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <linux/times.h>
#include <net/net_namespace.h>
#include <net/neighbour.h>
#include <net/arp.h>
#include <net/dst.h>
#include <net/ip.h>
#include <net/sock.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/log2.h>
#include <linux/inetdevice.h>
#include <net/addrconf.h>

#include <trace/events/neigh.h>

#define NEIGH_DEBUG 1
#define neigh_dbg(level, fmt, ...)		\
do {						\
	if (level <= NEIGH_DEBUG)		\
		pr_debug(fmt, ##__VA_ARGS__);	\
} while (0)

#define PNEIGH_HASHMASK		0xF

static void neigh_timer_handler(struct timer_list *t);
static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid);
static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid);
static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
			  bool skip_perm);

#ifdef CONFIG_PROC_FS
static const struct seq_operations neigh_stat_seq_ops;
#endif

static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family)
{
	int i;

	switch (family) {
	default:
		DEBUG_NET_WARN_ON_ONCE(1);
		fallthrough;	/* to avoid panic by null-ptr-deref */
	case AF_INET:
		i = NEIGH_ARP_TABLE;
		break;
	case AF_INET6:
		i = NEIGH_ND_TABLE;
		break;
	}

	return &dev->neighbours[i];
}

/*
   Neighbour hash table buckets are protected with tbl->lock.

   - All the scans/updates to hash buckets MUST be made under this lock.
   - NOTHING clever should be made under this lock: no callbacks
     to protocol backends, no attempts to send something to network.
     It will result in deadlocks, if backend/driver wants to use neighbour
     cache.
   - If the entry requires some non-trivial actions, increase
     its reference count and release table lock.

   Neighbour entries are protected:
   - with reference count.
   - with rwlock neigh->lock

   Reference count prevents destruction.

   neigh->lock mainly serializes ll address data and its validity state.
   However, the same lock is used to protect another entry fields:
    - timer
    - resolution queue

   Again, nothing clever shall be made under neigh->lock,
   the most complicated procedure, which we allow is dev->hard_header.
   It is supposed, that dev->hard_header is simplistic and does
   not make callbacks to neighbour tables.
 */

static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)
{
	kfree_skb(skb);
	return -ENETDOWN;
}

static void neigh_cleanup_and_release(struct neighbour *neigh)
{
	trace_neigh_cleanup_and_release(neigh, 0);
	__neigh_notify(neigh, RTM_DELNEIGH, 0, 0);
	call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
	neigh_release(neigh);
}

/*
 * It is random distribution in the interval (1/2)*base...(3/2)*base.
 * It corresponds to default IPv6 settings and is not overridable,
 * because it is really reasonable choice.
 */
unsigned long neigh_rand_reach_time(unsigned long base)
{
	return base ?
get_random_u32_below(base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); static void neigh_mark_dead(struct neighbour *n) { n->dead = 1; if (!list_empty(&n->gc_list)) { list_del_init(&n->gc_list); atomic_dec(&n->tbl->gc_entries); } if (!list_empty(&n->managed_list)) list_del_init(&n->managed_list); } static void neigh_update_gc_list(struct neighbour *n) { bool on_gc_list, exempt_from_gc; spin_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; /* remove from the gc list if new state is permanent or if neighbor is * externally learned / validated; otherwise entry should be on the gc * list */ exempt_from_gc = n->nud_state & NUD_PERMANENT || n->flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED); on_gc_list = !list_empty(&n->gc_list); if (exempt_from_gc && on_gc_list) { list_del_init(&n->gc_list); atomic_dec(&n->tbl->gc_entries); } else if (!exempt_from_gc && !on_gc_list) { /* add entries to the tail; cleaning removes from the front */ list_add_tail(&n->gc_list, &n->tbl->gc_list); atomic_inc(&n->tbl->gc_entries); } out: write_unlock(&n->lock); spin_unlock_bh(&n->tbl->lock); } static void neigh_update_managed_list(struct neighbour *n) { bool on_managed_list, add_to_managed; spin_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; add_to_managed = n->flags & NTF_MANAGED; on_managed_list = !list_empty(&n->managed_list); if (!add_to_managed && on_managed_list) list_del_init(&n->managed_list); else if (add_to_managed && !on_managed_list) list_add_tail(&n->managed_list, &n->tbl->managed_list); out: write_unlock(&n->lock); spin_unlock_bh(&n->tbl->lock); } static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, bool *gc_update, bool *managed_update) { u32 ndm_flags, old_flags = neigh->flags; if (!(flags & NEIGH_UPDATE_F_ADMIN)) return; ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0; ndm_flags |= (flags & NEIGH_UPDATE_F_EXT_VALIDATED) ? 
NTF_EXT_VALIDATED : 0; if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) { if (ndm_flags & NTF_EXT_LEARNED) neigh->flags |= NTF_EXT_LEARNED; else neigh->flags &= ~NTF_EXT_LEARNED; *notify = 1; *gc_update = true; } if ((old_flags ^ ndm_flags) & NTF_MANAGED) { if (ndm_flags & NTF_MANAGED) neigh->flags |= NTF_MANAGED; else neigh->flags &= ~NTF_MANAGED; *notify = 1; *managed_update = true; } if ((old_flags ^ ndm_flags) & NTF_EXT_VALIDATED) { if (ndm_flags & NTF_EXT_VALIDATED) neigh->flags |= NTF_EXT_VALIDATED; else neigh->flags &= ~NTF_EXT_VALIDATED; *notify = 1; *gc_update = true; } } bool neigh_remove_one(struct neighbour *n) { bool retval = false; write_lock(&n->lock); if (refcount_read(&n->refcnt) == 1) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); retval = true; } write_unlock(&n->lock); if (retval) neigh_cleanup_and_release(n); return retval; } static int neigh_forced_gc(struct neigh_table *tbl) { int max_clean = atomic_read(&tbl->gc_entries) - READ_ONCE(tbl->gc_thresh2); u64 tmax = ktime_get_ns() + NSEC_PER_MSEC; unsigned long tref = jiffies - 5 * HZ; struct neighbour *n, *tmp; int shrunk = 0; int loop = 0; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); spin_lock_bh(&tbl->lock); list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { if (refcount_read(&n->refcnt) == 1) { bool remove = false; write_lock(&n->lock); if ((n->nud_state == NUD_FAILED) || (n->nud_state == NUD_NOARP) || (tbl->is_multicast && tbl->is_multicast(n->primary_key)) || !time_in_range(n->updated, tref, jiffies)) remove = true; write_unlock(&n->lock); if (remove && neigh_remove_one(n)) shrunk++; if (shrunk >= max_clean) break; if (++loop == 16) { if (ktime_get_ns() > tmax) goto unlock; loop = 0; } } } WRITE_ONCE(tbl->last_flush, jiffies); unlock: spin_unlock_bh(&tbl->lock); return shrunk; } static void neigh_add_timer(struct neighbour *n, unsigned long when) { /* Use safe distance from the jiffies - LONG_MAX point while timer * is running in DELAY/PROBE state but still show to user space * large times in the past. 
*/ unsigned long mint = jiffies - (LONG_MAX - 86400 * HZ); neigh_hold(n); if (!time_in_range(n->confirmed, mint, jiffies)) n->confirmed = mint; if (time_before(n->used, n->confirmed)) n->used = n->confirmed; if (unlikely(mod_timer(&n->timer, when))) { printk("NEIGH: BUG, double timer add, state is %x\n", n->nud_state); dump_stack(); } } static int neigh_del_timer(struct neighbour *n) { if ((n->nud_state & NUD_IN_TIMER) && timer_delete(&n->timer)) { neigh_release(n); return 1; } return 0; } static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, int family) { switch (family) { case AF_INET: return __in_dev_arp_parms_get_rcu(dev); case AF_INET6: return __in6_dev_nd_parms_get_rcu(dev); } return NULL; } static void neigh_parms_qlen_dec(struct net_device *dev, int family) { struct neigh_parms *p; rcu_read_lock(); p = neigh_get_dev_parms_rcu(dev, family); if (p) p->qlen--; rcu_read_unlock(); } static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, int family) { struct sk_buff_head tmp; unsigned long flags; struct sk_buff *skb; skb_queue_head_init(&tmp); spin_lock_irqsave(&list->lock, flags); skb = skb_peek(list); while (skb != NULL) { struct sk_buff *skb_next = skb_peek_next(skb, list); struct net_device *dev = skb->dev; if (net == NULL || net_eq(dev_net(dev), net)) { neigh_parms_qlen_dec(dev, family); __skb_unlink(skb, list); __skb_queue_tail(&tmp, skb); } skb = skb_next; } spin_unlock_irqrestore(&list->lock, flags); while ((skb = __skb_dequeue(&tmp))) { dev_put(skb->dev); kfree_skb(skb); } } static void neigh_flush_one(struct neighbour *n) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); write_lock(&n->lock); neigh_del_timer(n); neigh_mark_dead(n); if (refcount_read(&n->refcnt) != 1) { /* The most unpleasant situation. * We must destroy neighbour entry, * but someone still uses it. * * The destroy will be delayed until * the last user releases us, but * we must kill timers etc. and move * it to safe state. */ __skb_queue_purge(&n->arp_queue); n->arp_queue_len_bytes = 0; WRITE_ONCE(n->output, neigh_blackhole); if (n->nud_state & NUD_VALID) n->nud_state = NUD_NOARP; else n->nud_state = NUD_NONE; neigh_dbg(2, "neigh %p is stray\n", n); } write_unlock(&n->lock); neigh_cleanup_and_release(n); } static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { struct hlist_head *dev_head; struct hlist_node *tmp; struct neighbour *n; dev_head = neigh_get_dev_table(dev, tbl->family); hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) { if (skip_perm && (n->nud_state & NUD_PERMANENT || n->flags & NTF_EXT_VALIDATED)) continue; neigh_flush_one(n); } } static void neigh_flush_table(struct neigh_table *tbl) { struct neigh_hash_table *nht; int i; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (i = 0; i < (1 << nht->hash_shift); i++) { struct hlist_node *tmp; struct neighbour *n; neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) neigh_flush_one(n); } } void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) { spin_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, false); spin_unlock_bh(&tbl->lock); } EXPORT_SYMBOL(neigh_changeaddr); static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { spin_lock_bh(&tbl->lock); if (likely(dev)) { neigh_flush_dev(tbl, dev, skip_perm); } else { DEBUG_NET_WARN_ON_ONCE(skip_perm); neigh_flush_table(tbl); } spin_unlock_bh(&tbl->lock); pneigh_ifdown(tbl, dev, skip_perm); pneigh_queue_purge(&tbl->proxy_queue, dev ? 
dev_net(dev) : NULL, tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) timer_delete_sync(&tbl->proxy_timer); return 0; } int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev) { __neigh_ifdown(tbl, dev, true); return 0; } EXPORT_SYMBOL(neigh_carrier_down); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) { __neigh_ifdown(tbl, dev, false); return 0; } EXPORT_SYMBOL(neigh_ifdown); static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev, u32 flags, bool exempt_from_gc) { struct neighbour *n = NULL; unsigned long now = jiffies; int entries, gc_thresh3; if (exempt_from_gc) goto do_alloc; entries = atomic_inc_return(&tbl->gc_entries) - 1; gc_thresh3 = READ_ONCE(tbl->gc_thresh3); if (entries >= gc_thresh3 || (entries >= READ_ONCE(tbl->gc_thresh2) && time_after(now, READ_ONCE(tbl->last_flush) + 5 * HZ))) { if (!neigh_forced_gc(tbl) && entries >= gc_thresh3) { net_info_ratelimited("%s: neighbor table overflow!\n", tbl->id); NEIGH_CACHE_STAT_INC(tbl, table_fulls); goto out_entries; } } do_alloc: n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; __skb_queue_head_init(&n->arp_queue); rwlock_init(&n->lock); seqlock_init(&n->ha_lock); n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; n->flags = flags; seqlock_init(&n->hh.hh_lock); n->parms = neigh_parms_clone(&tbl->parms); timer_setup(&n->timer, neigh_timer_handler, 0); NEIGH_CACHE_STAT_INC(tbl, allocs); n->tbl = tbl; refcount_set(&n->refcnt, 1); n->dead = 1; INIT_LIST_HEAD(&n->gc_list); INIT_LIST_HEAD(&n->managed_list); atomic_inc(&tbl->entries); out: return n; out_entries: if (!exempt_from_gc) atomic_dec(&tbl->gc_entries); goto out; } static void neigh_get_hash_rnd(u32 *x) { *x = get_random_u32() | 1; } static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) { size_t size = (1 << shift) * sizeof(struct hlist_head); struct hlist_head *hash_heads; struct neigh_hash_table *ret; int i; ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) return NULL; hash_heads = kzalloc(size, GFP_ATOMIC); if (!hash_heads) { kfree(ret); return NULL; } ret->hash_heads = hash_heads; ret->hash_shift = shift; for (i = 0; i < NEIGH_NUM_HASH_RND; i++) neigh_get_hash_rnd(&ret->hash_rnd[i]); return ret; } static void neigh_hash_free_rcu(struct rcu_head *head) { struct neigh_hash_table *nht = container_of(head, struct neigh_hash_table, rcu); kfree(nht->hash_heads); kfree(nht); } static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, unsigned long new_shift) { unsigned int i, hash; struct neigh_hash_table *new_nht, *old_nht; NEIGH_CACHE_STAT_INC(tbl, hash_grows); old_nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); new_nht = neigh_hash_alloc(new_shift); if (!new_nht) return old_nht; for (i = 0; i < (1 << old_nht->hash_shift); i++) { struct hlist_node *tmp; struct neighbour *n; neigh_for_each_in_bucket_safe(n, tmp, &old_nht->hash_heads[i]) { hash = tbl->hash(n->primary_key, n->dev, new_nht->hash_rnd); hash >>= (32 - new_nht->hash_shift); hlist_del_rcu(&n->hash); hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]); } } rcu_assign_pointer(tbl->nht, new_nht); call_rcu(&old_nht->rcu, neigh_hash_free_rcu); return new_nht; } struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct neighbour *n; NEIGH_CACHE_STAT_INC(tbl, lookups); rcu_read_lock(); n = __neigh_lookup_noref(tbl, pkey, dev); if (n) { if (!refcount_inc_not_zero(&n->refcnt)) n 
= NULL; NEIGH_CACHE_STAT_INC(tbl, hits); } rcu_read_unlock(); return n; } EXPORT_SYMBOL(neigh_lookup); static struct neighbour * ___neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, u32 flags, bool exempt_from_gc, bool want_ref) { u32 hash_val, key_len = tbl->key_len; struct neighbour *n1, *rc, *n; struct neigh_hash_table *nht; int error; n = neigh_alloc(tbl, dev, flags, exempt_from_gc); trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc); if (!n) { rc = ERR_PTR(-ENOBUFS); goto out; } memcpy(n->primary_key, pkey, key_len); n->dev = dev; netdev_hold(dev, &n->dev_tracker, GFP_ATOMIC); /* Protocol specific setup. */ if (tbl->constructor && (error = tbl->constructor(n)) < 0) { rc = ERR_PTR(error); goto out_neigh_release; } if (dev->netdev_ops->ndo_neigh_construct) { error = dev->netdev_ops->ndo_neigh_construct(dev, n); if (error < 0) { rc = ERR_PTR(error); goto out_neigh_release; } } /* Device specific setup. */ if (n->parms->neigh_setup && (error = n->parms->neigh_setup(n)) < 0) { rc = ERR_PTR(error); goto out_neigh_release; } n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) nht = neigh_hash_grow(tbl, nht->hash_shift + 1); hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift); if (n->parms->dead) { rc = ERR_PTR(-EINVAL); goto out_tbl_unlock; } neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) { if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); rc = n1; goto out_tbl_unlock; } } n->dead = 0; if (!exempt_from_gc) list_add_tail(&n->gc_list, &n->tbl->gc_list); if (n->flags & NTF_MANAGED) list_add_tail(&n->managed_list, &n->tbl->managed_list); if (want_ref) neigh_hold(n); hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]); hlist_add_head_rcu(&n->dev_list, neigh_get_dev_table(dev, tbl->family)); spin_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; out: return rc; out_tbl_unlock: spin_unlock_bh(&tbl->lock); out_neigh_release: if (!exempt_from_gc) atomic_dec(&tbl->gc_entries); neigh_release(n); goto out; } struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK); return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref); } EXPORT_SYMBOL(__neigh_create); static u32 pneigh_hash(const void *pkey, unsigned int key_len) { u32 hash_val = *(u32 *)(pkey + key_len - 4); hash_val ^= (hash_val >> 16); hash_val ^= hash_val >> 8; hash_val ^= hash_val >> 4; hash_val &= PNEIGH_HASHMASK; return hash_val; } struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { struct pneigh_entry *n; unsigned int key_len; u32 hash_val; key_len = tbl->key_len; hash_val = pneigh_hash(pkey, key_len); n = rcu_dereference_check(tbl->phash_buckets[hash_val], lockdep_is_held(&tbl->phash_lock)); while (n) { if (!memcmp(n->key, pkey, key_len) && net_eq(pneigh_net(n), net) && (n->dev == dev || !n->dev)) return n; n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->phash_lock)); } return NULL; } EXPORT_IPV6_MOD(pneigh_lookup); int pneigh_create(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev, u32 flags, u8 protocol, bool permanent) { struct pneigh_entry *n; unsigned int key_len; u32 hash_val; 
int err = 0; mutex_lock(&tbl->phash_lock); n = pneigh_lookup(tbl, net, pkey, dev); if (n) goto update; key_len = tbl->key_len; n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); if (!n) { err = -ENOBUFS; goto out; } write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); n->dev = dev; netdev_hold(dev, &n->dev_tracker, GFP_KERNEL); if (tbl->pconstructor && tbl->pconstructor(n)) { netdev_put(dev, &n->dev_tracker); kfree(n); err = -ENOBUFS; goto out; } hash_val = pneigh_hash(pkey, key_len); n->next = tbl->phash_buckets[hash_val]; rcu_assign_pointer(tbl->phash_buckets[hash_val], n); update: WRITE_ONCE(n->flags, flags); n->permanent = permanent; WRITE_ONCE(n->protocol, protocol); out: mutex_unlock(&tbl->phash_lock); return err; } static void pneigh_destroy(struct rcu_head *rcu) { struct pneigh_entry *n = container_of(rcu, struct pneigh_entry, rcu); netdev_put(n->dev, &n->dev_tracker); kfree(n); } int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { struct pneigh_entry *n, __rcu **np; unsigned int key_len; u32 hash_val; key_len = tbl->key_len; hash_val = pneigh_hash(pkey, key_len); mutex_lock(&tbl->phash_lock); for (np = &tbl->phash_buckets[hash_val]; (n = rcu_dereference_protected(*np, 1)) != NULL; np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { rcu_assign_pointer(*np, n->next); mutex_unlock(&tbl->phash_lock); if (tbl->pdestructor) tbl->pdestructor(n); call_rcu(&n->rcu, pneigh_destroy); return 0; } } mutex_unlock(&tbl->phash_lock); return -ENOENT; } static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { struct pneigh_entry *n, __rcu **np; LIST_HEAD(head); u32 h; mutex_lock(&tbl->phash_lock); for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; while ((n = rcu_dereference_protected(*np, 1)) != NULL) { if (skip_perm && n->permanent) goto skip; if (!dev || n->dev == dev) { rcu_assign_pointer(*np, n->next); list_add(&n->free_node, &head); continue; } skip: np = &n->next; } } mutex_unlock(&tbl->phash_lock); while (!list_empty(&head)) { n = list_first_entry(&head, typeof(*n), free_node); list_del(&n->free_node); if (tbl->pdestructor) tbl->pdestructor(n); call_rcu(&n->rcu, pneigh_destroy); } } static inline void neigh_parms_put(struct neigh_parms *parms) { if (refcount_dec_and_test(&parms->refcnt)) kfree(parms); } /* * neighbour must already be out of the table; * */ void neigh_destroy(struct neighbour *neigh) { struct net_device *dev = neigh->dev; NEIGH_CACHE_STAT_INC(neigh->tbl, destroys); if (!neigh->dead) { pr_warn("Destroying alive neighbour %p\n", neigh); dump_stack(); return; } if (neigh_del_timer(neigh)) pr_warn("Impossible event\n"); write_lock_bh(&neigh->lock); __skb_queue_purge(&neigh->arp_queue); write_unlock_bh(&neigh->lock); neigh->arp_queue_len_bytes = 0; if (dev->netdev_ops->ndo_neigh_destroy) dev->netdev_ops->ndo_neigh_destroy(dev, neigh); netdev_put(dev, &neigh->dev_tracker); neigh_parms_put(neigh->parms); neigh_dbg(2, "neigh %p is destroyed\n", neigh); atomic_dec(&neigh->tbl->entries); kfree_rcu(neigh, rcu); } EXPORT_SYMBOL(neigh_destroy); /* Neighbour state is suspicious; disable fast path. Called with write_locked neigh. */ static void neigh_suspect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is suspected\n", neigh); WRITE_ONCE(neigh->output, neigh->ops->output); } /* Neighbour state is OK; enable fast path. Called with write_locked neigh. 
*/ static void neigh_connect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is connected\n", neigh); WRITE_ONCE(neigh->output, neigh->ops->connected_output); } static void neigh_periodic_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); struct neigh_hash_table *nht; struct hlist_node *tmp; struct neighbour *n; unsigned int i; NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); /* * periodically recompute ReachableTime from random function */ if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { struct neigh_parms *p; WRITE_ONCE(tbl->last_rand, jiffies); list_for_each_entry(p, &tbl->parms_list, list) neigh_set_reach_time(p); } if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1)) goto out; for (i = 0 ; i < (1 << nht->hash_shift); i++) { neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { unsigned int state; write_lock(&n->lock); state = n->nud_state; if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || (n->flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED))) { write_unlock(&n->lock); continue; } if (time_before(n->used, n->confirmed) && time_is_before_eq_jiffies(n->confirmed)) n->used = n->confirmed; if (refcount_read(&n->refcnt) == 1 && (state == NUD_FAILED || !time_in_range_open(jiffies, n->used, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); continue; } write_unlock(&n->lock); } /* * It's fine to release lock here, even if hash table * grows while we are preempted. */ spin_unlock_bh(&tbl->lock); cond_resched(); spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); } out: /* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks. * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2 * BASE_REACHABLE_TIME. */ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1); spin_unlock_bh(&tbl->lock); } static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; return NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) + (n->nud_state & NUD_PROBE ? NEIGH_VAR(p, MCAST_REPROBES) : NEIGH_VAR(p, MCAST_PROBES)); } static void neigh_invalidate(struct neighbour *neigh) __releases(neigh->lock) __acquires(neigh->lock) { struct sk_buff *skb; NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); neigh_dbg(2, "neigh %p is failed\n", neigh); neigh->updated = jiffies; /* It is very thin place. report_unreachable is very complicated routine. Particularly, it can hit the same neighbour entry! So that, we try to be accurate and avoid dead loop. --ANK */ while (neigh->nud_state == NUD_FAILED && (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { write_unlock(&neigh->lock); neigh->ops->error_report(neigh, skb); write_lock(&neigh->lock); } __skb_queue_purge(&neigh->arp_queue); neigh->arp_queue_len_bytes = 0; } static void neigh_probe(struct neighbour *neigh) __releases(neigh->lock) { struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); /* keep skb alive even if arp_queue overflows */ if (skb) skb = skb_clone(skb, GFP_ATOMIC); write_unlock(&neigh->lock); if (neigh->ops->solicit) neigh->ops->solicit(neigh, skb); atomic_inc(&neigh->probes); consume_skb(skb); } /* Called when a timer expires for a neighbour entry. 
*/ static void neigh_timer_handler(struct timer_list *t) { unsigned long now, next; struct neighbour *neigh = timer_container_of(neigh, t, timer); unsigned int state; int notify = 0; write_lock(&neigh->lock); state = neigh->nud_state; now = jiffies; next = now + HZ; if (!(state & NUD_IN_TIMER)) goto out; if (state & NUD_REACHABLE) { if (time_before_eq(now, neigh->confirmed + neigh->parms->reachable_time)) { neigh_dbg(2, "neigh %p is still alive\n", neigh); next = neigh->confirmed + neigh->parms->reachable_time; } else if (time_before_eq(now, neigh->used + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { neigh_dbg(2, "neigh %p is delayed\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_DELAY); neigh->updated = jiffies; neigh_suspect(neigh); next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME); } else { neigh_dbg(2, "neigh %p is suspected\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_STALE); neigh->updated = jiffies; neigh_suspect(neigh); notify = 1; } } else if (state & NUD_DELAY) { if (time_before_eq(now, neigh->confirmed + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { neigh_dbg(2, "neigh %p is now reachable\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_REACHABLE); neigh->updated = jiffies; neigh_connect(neigh); notify = 1; next = neigh->confirmed + neigh->parms->reachable_time; } else { neigh_dbg(2, "neigh %p is probed\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_PROBE); neigh->updated = jiffies; atomic_set(&neigh->probes, 0); notify = 1; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } } else { /* NUD_PROBE|NUD_INCOMPLETE */ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { if (neigh->nud_state == NUD_PROBE && neigh->flags & NTF_EXT_VALIDATED) { WRITE_ONCE(neigh->nud_state, NUD_STALE); neigh->updated = jiffies; } else { WRITE_ONCE(neigh->nud_state, NUD_FAILED); neigh_invalidate(neigh); } notify = 1; goto out; } if (neigh->nud_state & NUD_IN_TIMER) { if (time_before(next, jiffies + HZ/100)) next = jiffies + HZ/100; if (!mod_timer(&neigh->timer, next)) neigh_hold(neigh); } if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { neigh_probe(neigh); } else { out: write_unlock(&neigh->lock); } if (notify) neigh_update_notify(neigh, 0); trace_neigh_timer_handler(neigh, 0); neigh_release(neigh); } int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, const bool immediate_ok) { int rc; bool immediate_probe = false; write_lock_bh(&neigh->lock); rc = 0; if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) goto out_unlock_bh; if (neigh->dead) goto out_dead; if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + NEIGH_VAR(neigh->parms, APP_PROBES)) { unsigned long next, now = jiffies; atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh_del_timer(neigh); WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE); neigh->updated = now; if (!immediate_ok) { next = now + 1; } else { immediate_probe = true; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ / 100); } neigh_add_timer(neigh, next); } else { WRITE_ONCE(neigh->nud_state, NUD_FAILED); neigh->updated = jiffies; write_unlock_bh(&neigh->lock); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); return 1; } } else if (neigh->nud_state & NUD_STALE) { neigh_dbg(2, "neigh %p is delayed\n", neigh); neigh_del_timer(neigh); WRITE_ONCE(neigh->nud_state, NUD_DELAY); neigh->updated = jiffies; neigh_add_timer(neigh, jiffies + 
NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME)); } if (neigh->nud_state == NUD_INCOMPLETE) { if (skb) { while (neigh->arp_queue_len_bytes + skb->truesize > NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) { struct sk_buff *buff; buff = __skb_dequeue(&neigh->arp_queue); if (!buff) break; neigh->arp_queue_len_bytes -= buff->truesize; kfree_skb_reason(buff, SKB_DROP_REASON_NEIGH_QUEUEFULL); NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } skb_dst_force(skb); __skb_queue_tail(&neigh->arp_queue, skb); neigh->arp_queue_len_bytes += skb->truesize; } rc = 1; } out_unlock_bh: if (immediate_probe) neigh_probe(neigh); else write_unlock(&neigh->lock); local_bh_enable(); trace_neigh_event_send_done(neigh, rc); return rc; out_dead: if (neigh->nud_state & NUD_STALE) goto out_unlock_bh; write_unlock_bh(&neigh->lock); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_DEAD); trace_neigh_event_send_dead(neigh, 1); return 1; } EXPORT_SYMBOL(__neigh_event_send); static void neigh_update_hhs(struct neighbour *neigh) { struct hh_cache *hh; void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) = NULL; if (neigh->dev->header_ops) update = neigh->dev->header_ops->cache_update; if (update) { hh = &neigh->hh; if (READ_ONCE(hh->hh_len)) { write_seqlock_bh(&hh->hh_lock); update(hh, neigh->dev, neigh->ha); write_sequnlock_bh(&hh->hh_lock); } } } /* Generic update routine. -- lladdr is new lladdr or NULL, if it is not supplied. -- new is new state. -- flags NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr, if it is different. NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected" lladdr instead of overriding it if it is different. NEIGH_UPDATE_F_ADMIN means that the change is administrative. NEIGH_UPDATE_F_USE means that the entry is user triggered. NEIGH_UPDATE_F_MANAGED means that the entry will be auto-refreshed. NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as a router. NEIGH_UPDATE_F_EXT_VALIDATED means that the entry will not be removed or invalidated. Caller MUST hold reference count on the entry. */ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid, struct netlink_ext_ack *extack) { bool gc_update = false, managed_update = false; int update_isrouter = 0; struct net_device *dev; int err, notify = 0; u8 old; trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid); write_lock_bh(&neigh->lock); dev = neigh->dev; old = neigh->nud_state; err = -EPERM; if (neigh->dead) { NL_SET_ERR_MSG(extack, "Neighbor entry is now dead"); new = old; goto out; } if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; neigh_update_flags(neigh, flags, &notify, &gc_update, &managed_update); if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) { new = old & ~NUD_PERMANENT; WRITE_ONCE(neigh->nud_state, new); err = 0; goto out; } if (!(new & NUD_VALID)) { neigh_del_timer(neigh); if (old & NUD_CONNECTED) neigh_suspect(neigh); WRITE_ONCE(neigh->nud_state, new); err = 0; notify = old & NUD_VALID; if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && (new & NUD_FAILED)) { neigh_invalidate(neigh); notify = 1; } goto out; } /* Compare new lladdr with cached one */ if (!dev->addr_len) { /* First case: device needs no address. 
*/ lladdr = neigh->ha; } else if (lladdr) { /* The second case: if something is already cached and a new address is proposed: - compare new & old - if they are different, check override flag */ if ((old & NUD_VALID) && !memcmp(lladdr, neigh->ha, dev->addr_len)) lladdr = neigh->ha; } else { /* No address is supplied; if we know something, use it, otherwise discard the request. */ err = -EINVAL; if (!(old & NUD_VALID)) { NL_SET_ERR_MSG(extack, "No link layer address given"); goto out; } lladdr = neigh->ha; } /* Update confirmed timestamp for neighbour entry after we * received ARP packet even if it doesn't change IP to MAC binding. */ if (new & NUD_CONNECTED) neigh->confirmed = jiffies; /* If entry was valid and address is not changed, do not change entry state, if new one is STALE. */ err = 0; update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER; if (old & NUD_VALID) { if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) { update_isrouter = 0; if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) && (old & NUD_CONNECTED)) { lladdr = neigh->ha; new = NUD_STALE; } else goto out; } else { if (lladdr == neigh->ha && new == NUD_STALE && !(flags & NEIGH_UPDATE_F_ADMIN)) new = old; } } /* Update timestamp only once we know we will make a change to the * neighbour entry. Otherwise we risk to move the locktime window with * noop updates and ignore relevant ARP updates. */ if (new != old || lladdr != neigh->ha) neigh->updated = jiffies; if (new != old) { neigh_del_timer(neigh); if (new & NUD_PROBE) atomic_set(&neigh->probes, 0); if (new & NUD_IN_TIMER) neigh_add_timer(neigh, (jiffies + ((new & NUD_REACHABLE) ? neigh->parms->reachable_time : 0))); WRITE_ONCE(neigh->nud_state, new); notify = 1; } if (lladdr != neigh->ha) { write_seqlock(&neigh->ha_lock); memcpy(&neigh->ha, lladdr, dev->addr_len); write_sequnlock(&neigh->ha_lock); neigh_update_hhs(neigh); if (!(new & NUD_CONNECTED)) neigh->confirmed = jiffies - (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1); notify = 1; } if (new == old) goto out; if (new & NUD_CONNECTED) neigh_connect(neigh); else neigh_suspect(neigh); if (!(old & NUD_VALID)) { struct sk_buff *skb; /* Again: avoid dead loop if something went wrong */ while (neigh->nud_state & NUD_VALID && (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { struct dst_entry *dst = skb_dst(skb); struct neighbour *n2, *n1 = neigh; write_unlock_bh(&neigh->lock); rcu_read_lock(); /* Why not just use 'neigh' as-is? The problem is that * things such as shaper, eql, and sch_teql can end up * using alternative, different, neigh objects to output * the packet in the output path. So what we need to do * here is re-lookup the top-level neigh in the path so * we can reinject the packet there. 
*/ n2 = NULL; if (dst && READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) { n2 = dst_neigh_lookup_skb(dst, skb); if (n2) n1 = n2; } READ_ONCE(n1->output)(n1, skb); if (n2) neigh_release(n2); rcu_read_unlock(); write_lock_bh(&neigh->lock); } __skb_queue_purge(&neigh->arp_queue); neigh->arp_queue_len_bytes = 0; } out: if (update_isrouter) neigh_update_is_router(neigh, flags, &notify); write_unlock_bh(&neigh->lock); if (((new ^ old) & NUD_PERMANENT) || gc_update) neigh_update_gc_list(neigh); if (managed_update) neigh_update_managed_list(neigh); if (notify) neigh_update_notify(neigh, nlmsg_pid); trace_neigh_update_done(neigh, err); return err; } int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid) { return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL); } EXPORT_SYMBOL(neigh_update); /* Update the neigh to listen temporarily for probe responses, even if it is * in a NUD_FAILED state. The caller has to hold neigh->lock for writing. */ void __neigh_set_probe_once(struct neighbour *neigh) { if (neigh->dead) return; neigh->updated = jiffies; if (!(neigh->nud_state & NUD_FAILED)) return; WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE); atomic_set(&neigh->probes, neigh_max_probes(neigh)); neigh_add_timer(neigh, jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100)); } EXPORT_SYMBOL(__neigh_set_probe_once); struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev) { struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev, lladdr || !dev->addr_len); if (neigh) neigh_update(neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_OVERRIDE, 0); return neigh; } EXPORT_SYMBOL(neigh_event_ns); /* called with read_lock_bh(&n->lock); */ static void neigh_hh_init(struct neighbour *n) { struct net_device *dev = n->dev; __be16 prot = n->tbl->protocol; struct hh_cache *hh = &n->hh; write_lock_bh(&n->lock); /* Only one thread can come in here and initialize the * hh_cache entry. */ if (!hh->hh_len) dev->header_ops->cache(n, hh, prot); write_unlock_bh(&n->lock); } /* Slow and careful. 
*/ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) { int rc = 0; if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; unsigned int seq; if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len)) neigh_hh_init(neigh); do { __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) rc = dev_queue_xmit(skb); else goto out_kfree_skb; } out: return rc; out_kfree_skb: rc = -EINVAL; kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); goto out; } EXPORT_SYMBOL(neigh_resolve_output); /* As fast as possible without hh cache */ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) { struct net_device *dev = neigh->dev; unsigned int seq; int err; do { __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) err = dev_queue_xmit(skb); else { err = -EINVAL; kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); } return err; } EXPORT_SYMBOL(neigh_connected_output); int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb) { return dev_queue_xmit(skb); } EXPORT_SYMBOL(neigh_direct_output); static void neigh_managed_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, managed_work.work); struct neighbour *neigh; spin_lock_bh(&tbl->lock); list_for_each_entry(neigh, &tbl->managed_list, managed_list) neigh_event_send_probe(neigh, NULL, false); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS)); spin_unlock_bh(&tbl->lock); } static void neigh_proxy_process(struct timer_list *t) { struct neigh_table *tbl = timer_container_of(tbl, t, proxy_timer); long sched_next = 0; unsigned long now = jiffies; struct sk_buff *skb, *n; spin_lock(&tbl->proxy_queue.lock); skb_queue_walk_safe(&tbl->proxy_queue, skb, n) { long tdif = NEIGH_CB(skb)->sched_next - now; if (tdif <= 0) { struct net_device *dev = skb->dev; neigh_parms_qlen_dec(dev, tbl->family); __skb_unlink(skb, &tbl->proxy_queue); if (tbl->proxy_redo && netif_running(dev)) { rcu_read_lock(); tbl->proxy_redo(skb); rcu_read_unlock(); } else { kfree_skb(skb); } dev_put(dev); } else if (!sched_next || tdif < sched_next) sched_next = tdif; } timer_delete(&tbl->proxy_timer); if (sched_next) mod_timer(&tbl->proxy_timer, jiffies + sched_next); spin_unlock(&tbl->proxy_queue.lock); } static unsigned long neigh_proxy_delay(struct neigh_parms *p) { /* If proxy_delay is zero, do not call get_random_u32_below() * as it is undefined behavior. */ unsigned long proxy_delay = NEIGH_VAR(p, PROXY_DELAY); return proxy_delay ? 
jiffies + get_random_u32_below(proxy_delay) : jiffies; } void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { unsigned long sched_next = neigh_proxy_delay(p); if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); return; } NEIGH_CB(skb)->sched_next = sched_next; NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED; spin_lock(&tbl->proxy_queue.lock); if (timer_delete(&tbl->proxy_timer)) { if (time_before(tbl->proxy_timer.expires, sched_next)) sched_next = tbl->proxy_timer.expires; } skb_dst_drop(skb); dev_hold(skb->dev); __skb_queue_tail(&tbl->proxy_queue, skb); p->qlen++; mod_timer(&tbl->proxy_timer, sched_next); spin_unlock(&tbl->proxy_queue.lock); } EXPORT_SYMBOL(pneigh_enqueue); static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, struct net *net, int ifindex) { struct neigh_parms *p; list_for_each_entry(p, &tbl->parms_list, list) { if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || (!p->dev && !ifindex && net_eq(net, &init_net))) return p; } return NULL; } struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl) { struct neigh_parms *p; struct net *net = dev_net(dev); const struct net_device_ops *ops = dev->netdev_ops; p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL); if (p) { p->tbl = tbl; refcount_set(&p->refcnt, 1); neigh_set_reach_time(p); p->qlen = 0; netdev_hold(dev, &p->dev_tracker, GFP_KERNEL); p->dev = dev; write_pnet(&p->net, net); p->sysctl_table = NULL; if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { netdev_put(dev, &p->dev_tracker); kfree(p); return NULL; } spin_lock_bh(&tbl->lock); list_add_rcu(&p->list, &tbl->parms.list); spin_unlock_bh(&tbl->lock); neigh_parms_data_state_cleanall(p); } return p; } EXPORT_SYMBOL(neigh_parms_alloc); static void neigh_rcu_free_parms(struct rcu_head *head) { struct neigh_parms *parms = container_of(head, struct neigh_parms, rcu_head); neigh_parms_put(parms); } void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) { if (!parms || parms == &tbl->parms) return; spin_lock_bh(&tbl->lock); list_del_rcu(&parms->list); parms->dead = 1; spin_unlock_bh(&tbl->lock); netdev_put(parms->dev, &parms->dev_tracker); call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } EXPORT_SYMBOL(neigh_parms_release); static struct lock_class_key neigh_table_proxy_queue_class; static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly; void neigh_table_init(int index, struct neigh_table *tbl) { unsigned long now = jiffies; unsigned long phsize; INIT_LIST_HEAD(&tbl->parms_list); INIT_LIST_HEAD(&tbl->gc_list); INIT_LIST_HEAD(&tbl->managed_list); list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); neigh_set_reach_time(&tbl->parms); tbl->parms.qlen = 0; tbl->stats = alloc_percpu(struct neigh_statistics); if (!tbl->stats) panic("cannot create neighbour cache statistics"); #ifdef CONFIG_PROC_FS if (!proc_create_seq_data(tbl->id, 0, init_net.proc_net_stat, &neigh_stat_seq_ops, tbl)) panic("cannot create neighbour proc dir entry"); #endif RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3)); phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); if (!tbl->nht || !tbl->phash_buckets) panic("cannot allocate neighbour cache hashes"); if (!tbl->entry_size) tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) + tbl->key_len, NEIGH_PRIV_ALIGN); else WARN_ON(tbl->entry_size % 
NEIGH_PRIV_ALIGN); spin_lock_init(&tbl->lock); mutex_init(&tbl->phash_lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, tbl->parms.reachable_time); INIT_DEFERRABLE_WORK(&tbl->managed_work, neigh_managed_work); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, 0); timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0); skb_queue_head_init_class(&tbl->proxy_queue, &neigh_table_proxy_queue_class); tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; rcu_assign_pointer(neigh_tables[index], tbl); } EXPORT_SYMBOL(neigh_table_init); /* * Only called from ndisc_cleanup(), which means this is dead code * because we no longer can unload IPv6 module. */ int neigh_table_clear(int index, struct neigh_table *tbl) { RCU_INIT_POINTER(neigh_tables[index], NULL); synchronize_rcu(); /* It is not clean... Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); timer_delete_sync(&tbl->proxy_timer); pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, neigh_hash_free_rcu); tbl->nht = NULL; kfree(tbl->phash_buckets); tbl->phash_buckets = NULL; remove_proc_entry(tbl->id, init_net.proc_net_stat); free_percpu(tbl->stats); tbl->stats = NULL; return 0; } EXPORT_SYMBOL(neigh_table_clear); static struct neigh_table *neigh_find_table(int family) { struct neigh_table *tbl = NULL; switch (family) { case AF_INET: tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]); break; case AF_INET6: tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]); break; } return tbl; } const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID }, [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, [NDA_PROBES] = { .type = NLA_U32 }, [NDA_VLAN] = { .type = NLA_U16 }, [NDA_PORT] = { .type = NLA_U16 }, [NDA_VNI] = { .type = NLA_U32 }, [NDA_IFINDEX] = { .type = NLA_U32 }, [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, [NDA_NH_ID] = { .type = NLA_U32 }, [NDA_FLAGS_EXT] = NLA_POLICY_MASK(NLA_U32, NTF_EXT_MASK), [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *dst_attr; struct neigh_table *tbl; struct neighbour *neigh; struct net_device *dev = NULL; int err = -EINVAL; ASSERT_RTNL(); if (nlmsg_len(nlh) < sizeof(*ndm)) goto out; dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); if (!dst_attr) { NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; if (nla_len(dst_attr) < (int)tbl->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; } if (ndm->ndm_flags & NTF_PROXY) { err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); goto out; } if (dev == NULL) goto out; neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); if (neigh == NULL) { err = -ENOENT; goto out; } err = __neigh_update(neigh, NULL, 
NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, NETLINK_CB(skb).portid, extack); spin_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); spin_unlock_bh(&tbl->lock); out: return err; } static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_OVERRIDE_ISROUTER; struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct neigh_table *tbl; struct net_device *dev = NULL; struct neighbour *neigh; void *dst, *lladdr; u8 protocol = 0; u32 ndm_flags; int err; ASSERT_RTNL(); err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, nda_policy, extack); if (err < 0) goto out; err = -EINVAL; if (!tb[NDA_DST]) { NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; } ndm = nlmsg_data(nlh); ndm_flags = ndm->ndm_flags; if (tb[NDA_FLAGS_EXT]) { u32 ext = nla_get_u32(tb[NDA_FLAGS_EXT]); BUILD_BUG_ON(sizeof(neigh->flags) * BITS_PER_BYTE < (sizeof(ndm->ndm_flags) * BITS_PER_BYTE + hweight32(NTF_EXT_MASK))); ndm_flags |= (ext << NTF_EXT_SHIFT); } if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; } if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) { NL_SET_ERR_MSG(extack, "Invalid link address"); goto out; } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; } dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; if (tb[NDA_PROTOCOL]) protocol = nla_get_u8(tb[NDA_PROTOCOL]); if (ndm_flags & NTF_PROXY) { if (ndm_flags & (NTF_MANAGED | NTF_EXT_VALIDATED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); goto out; } err = pneigh_create(tbl, net, dst, dev, ndm_flags, protocol, !!(ndm->ndm_state & NUD_PERMANENT)); goto out; } if (!dev) { NL_SET_ERR_MSG(extack, "Device not specified"); goto out; } if (tbl->allow_add && !tbl->allow_add(dev, extack)) { err = -EINVAL; goto out; } neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { bool ndm_permanent = ndm->ndm_state & NUD_PERMANENT; bool exempt_from_gc = ndm_permanent || ndm_flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED); if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; goto out; } if (ndm_permanent && (ndm_flags & NTF_MANAGED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag for permanent entry"); err = -EINVAL; goto out; } if (ndm_flags & NTF_EXT_VALIDATED) { u8 state = ndm->ndm_state; /* NTF_USE and NTF_MANAGED will result in the neighbor * being created with an invalid state (NUD_NONE). */ if (ndm_flags & (NTF_USE | NTF_MANAGED)) state = NUD_NONE; if (!(state & NUD_VALID)) { NL_SET_ERR_MSG(extack, "Cannot create externally validated neighbor with an invalid state"); err = -EINVAL; goto out; } } neigh = ___neigh_create(tbl, dst, dev, ndm_flags & (NTF_EXT_LEARNED | NTF_MANAGED | NTF_EXT_VALIDATED), exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; } } else { if (nlh->nlmsg_flags & NLM_F_EXCL) { err = -EEXIST; neigh_release(neigh); goto out; } if (ndm_flags & NTF_EXT_VALIDATED) { u8 state = ndm->ndm_state; /* NTF_USE and NTF_MANAGED do not update the existing * state other than clearing it if it was * NUD_PERMANENT. 
*/ if (ndm_flags & (NTF_USE | NTF_MANAGED)) state = READ_ONCE(neigh->nud_state) & ~NUD_PERMANENT; if (!(state & NUD_VALID)) { NL_SET_ERR_MSG(extack, "Cannot mark neighbor as externally validated with an invalid state"); err = -EINVAL; neigh_release(neigh); goto out; } } if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) flags &= ~(NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_OVERRIDE_ISROUTER); } if (protocol) neigh->protocol = protocol; if (ndm_flags & NTF_EXT_LEARNED) flags |= NEIGH_UPDATE_F_EXT_LEARNED; if (ndm_flags & NTF_ROUTER) flags |= NEIGH_UPDATE_F_ISROUTER; if (ndm_flags & NTF_MANAGED) flags |= NEIGH_UPDATE_F_MANAGED; if (ndm_flags & NTF_USE) flags |= NEIGH_UPDATE_F_USE; if (ndm_flags & NTF_EXT_VALIDATED) flags |= NEIGH_UPDATE_F_EXT_VALIDATED; err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, NETLINK_CB(skb).portid, extack); if (!err && ndm_flags & (NTF_USE | NTF_MANAGED)) neigh_event_send(neigh, NULL); neigh_release(neigh); out: return err; } static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, NDTA_PARMS); if (nest == NULL) return -ENOBUFS; if ((parms->dev && nla_put_u32(skb, NDTPA_IFINDEX, READ_ONCE(parms->dev->ifindex))) || nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) || nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, NEIGH_VAR(parms, QUEUE_LEN_BYTES)) || /* approximative value for deprecated QUEUE_LEN (in packets) */ nla_put_u32(skb, NDTPA_QUEUE_LEN, NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) || nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) || nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) || nla_put_u32(skb, NDTPA_UCAST_PROBES, NEIGH_VAR(parms, UCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_PROBES, NEIGH_VAR(parms, MCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_REPROBES, NEIGH_VAR(parms, MCAST_REPROBES)) || nla_put_msecs(skb, NDTPA_REACHABLE_TIME, READ_ONCE(parms->reachable_time), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_GC_STALETIME, NEIGH_VAR(parms, GC_STALETIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME, NEIGH_VAR(parms, DELAY_PROBE_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_RETRANS_TIME, NEIGH_VAR(parms, RETRANS_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, NEIGH_VAR(parms, ANYCAST_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_PROXY_DELAY, NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_LOCKTIME, NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_INTERVAL_PROBE_TIME_MS, NEIGH_VAR(parms, INTERVAL_PROBE_TIME_MS), NDTPA_PAD)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, u32 pid, u32 seq, int type, int flags) { struct nlmsghdr *nlh; struct ndtmsg *ndtmsg; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); if (nlh == NULL) return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) || nla_put_msecs(skb, NDTA_GC_INTERVAL, READ_ONCE(tbl->gc_interval), NDTA_PAD) || nla_put_u32(skb, NDTA_THRESH1, READ_ONCE(tbl->gc_thresh1)) || nla_put_u32(skb, NDTA_THRESH2, READ_ONCE(tbl->gc_thresh2)) || nla_put_u32(skb, NDTA_THRESH3, READ_ONCE(tbl->gc_thresh3))) goto nla_put_failure; { unsigned long now = jiffies; long 
flush_delta = now - READ_ONCE(tbl->last_flush); long rand_delta = now - READ_ONCE(tbl->last_rand); struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, .ndtc_entry_size = tbl->entry_size, .ndtc_entries = atomic_read(&tbl->entries), .ndtc_last_flush = jiffies_to_msecs(flush_delta), .ndtc_last_rand = jiffies_to_msecs(rand_delta), .ndtc_proxy_qlen = READ_ONCE(tbl->proxy_queue.qlen), }; nht = rcu_dereference(tbl->nht); ndc.ndtc_hash_rnd = nht->hash_rnd[0]; ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1); if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc)) goto nla_put_failure; } { int cpu; struct ndt_stats ndst; memset(&ndst, 0, sizeof(ndst)); for_each_possible_cpu(cpu) { struct neigh_statistics *st; st = per_cpu_ptr(tbl->stats, cpu); ndst.ndts_allocs += READ_ONCE(st->allocs); ndst.ndts_destroys += READ_ONCE(st->destroys); ndst.ndts_hash_grows += READ_ONCE(st->hash_grows); ndst.ndts_res_failed += READ_ONCE(st->res_failed); ndst.ndts_lookups += READ_ONCE(st->lookups); ndst.ndts_hits += READ_ONCE(st->hits); ndst.ndts_rcv_probes_mcast += READ_ONCE(st->rcv_probes_mcast); ndst.ndts_rcv_probes_ucast += READ_ONCE(st->rcv_probes_ucast); ndst.ndts_periodic_gc_runs += READ_ONCE(st->periodic_gc_runs); ndst.ndts_forced_gc_runs += READ_ONCE(st->forced_gc_runs); ndst.ndts_table_fulls += READ_ONCE(st->table_fulls); } if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst, NDTA_PAD)) goto nla_put_failure; } BUG_ON(tbl->parms.dev); if (neightbl_fill_parms(skb, &tbl->parms) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int neightbl_fill_param_info(struct sk_buff *skb, struct neigh_table *tbl, struct neigh_parms *parms, u32 pid, u32 seq, int type, unsigned int flags) { struct ndtmsg *ndtmsg; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); if (nlh == NULL) return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) < 0 || neightbl_fill_parms(skb, parms) < 0) goto errout; nlmsg_end(skb, nlh); return 0; errout: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { [NDTA_NAME] = { .type = NLA_STRING }, [NDTA_THRESH1] = { .type = NLA_U32 }, [NDTA_THRESH2] = { .type = NLA_U32 }, [NDTA_THRESH3] = { .type = NLA_U32 }, [NDTA_GC_INTERVAL] = { .type = NLA_U64 }, [NDTA_PARMS] = { .type = NLA_NESTED }, }; static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_IFINDEX] = { .type = NLA_U32 }, [NDTPA_QUEUE_LEN] = { .type = NLA_U32 }, [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 }, [NDTPA_PROXY_QLEN] = { .type = NLA_U32 }, [NDTPA_APP_PROBES] = { .type = NLA_U32 }, [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, [NDTPA_MCAST_PROBES] = { .type = NLA_U32 }, [NDTPA_MCAST_REPROBES] = { .type = NLA_U32 }, [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 }, [NDTPA_GC_STALETIME] = { .type = NLA_U64 }, [NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 }, [NDTPA_RETRANS_TIME] = { .type = NLA_U64 }, [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 }, [NDTPA_PROXY_DELAY] = { .type = NLA_U64 }, [NDTPA_LOCKTIME] = { .type = NLA_U64 }, [NDTPA_INTERVAL_PROBE_TIME_MS] = { .type = NLA_U64, .min = 1 }, }; static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NDTA_MAX + 1]; struct neigh_table *tbl; struct ndtmsg *ndtmsg; bool found = 
false; int err, tidx; err = nlmsg_parse_deprecated(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, nl_neightbl_policy, extack); if (err < 0) goto errout; if (tb[NDTA_NAME] == NULL) { err = -EINVAL; goto errout; } ndtmsg = nlmsg_data(nlh); rcu_read_lock(); for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { tbl = rcu_dereference(neigh_tables[tidx]); if (!tbl) continue; if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) { found = true; break; } } if (!found) { rcu_read_unlock(); err = -ENOENT; goto errout; } /* * We acquire tbl->lock to be nice to the periodic timers and * make sure they always see a consistent set of values. */ spin_lock_bh(&tbl->lock); if (tb[NDTA_PARMS]) { struct nlattr *tbp[NDTPA_MAX+1]; struct neigh_parms *p; int i, ifindex = 0; err = nla_parse_nested_deprecated(tbp, NDTPA_MAX, tb[NDTA_PARMS], nl_ntbl_parm_policy, extack); if (err < 0) goto errout_tbl_lock; if (tbp[NDTPA_IFINDEX]) ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]); p = lookup_neigh_parms(tbl, net, ifindex); if (p == NULL) { err = -ENOENT; goto errout_tbl_lock; } for (i = 1; i <= NDTPA_MAX; i++) { if (tbp[i] == NULL) continue; switch (i) { case NDTPA_QUEUE_LEN: NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, nla_get_u32(tbp[i]) * SKB_TRUESIZE(ETH_FRAME_LEN)); break; case NDTPA_QUEUE_LENBYTES: NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, nla_get_u32(tbp[i])); break; case NDTPA_PROXY_QLEN: NEIGH_VAR_SET(p, PROXY_QLEN, nla_get_u32(tbp[i])); break; case NDTPA_APP_PROBES: NEIGH_VAR_SET(p, APP_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_UCAST_PROBES: NEIGH_VAR_SET(p, UCAST_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_MCAST_PROBES: NEIGH_VAR_SET(p, MCAST_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_MCAST_REPROBES: NEIGH_VAR_SET(p, MCAST_REPROBES, nla_get_u32(tbp[i])); break; case NDTPA_BASE_REACHABLE_TIME: NEIGH_VAR_SET(p, BASE_REACHABLE_TIME, nla_get_msecs(tbp[i])); /* update reachable_time as well, otherwise, the change will * only be effective after the next time neigh_periodic_work * decides to recompute it (can be multiple minutes) */ neigh_set_reach_time(p); break; case NDTPA_GC_STALETIME: NEIGH_VAR_SET(p, GC_STALETIME, nla_get_msecs(tbp[i])); break; case NDTPA_DELAY_PROBE_TIME: NEIGH_VAR_SET(p, DELAY_PROBE_TIME, nla_get_msecs(tbp[i])); call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); break; case NDTPA_INTERVAL_PROBE_TIME_MS: NEIGH_VAR_SET(p, INTERVAL_PROBE_TIME_MS, nla_get_msecs(tbp[i])); break; case NDTPA_RETRANS_TIME: NEIGH_VAR_SET(p, RETRANS_TIME, nla_get_msecs(tbp[i])); break; case NDTPA_ANYCAST_DELAY: NEIGH_VAR_SET(p, ANYCAST_DELAY, nla_get_msecs(tbp[i])); break; case NDTPA_PROXY_DELAY: NEIGH_VAR_SET(p, PROXY_DELAY, nla_get_msecs(tbp[i])); break; case NDTPA_LOCKTIME: NEIGH_VAR_SET(p, LOCKTIME, nla_get_msecs(tbp[i])); break; } } } err = -ENOENT; if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] || tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) && !net_eq(net, &init_net)) goto errout_tbl_lock; if (tb[NDTA_THRESH1]) WRITE_ONCE(tbl->gc_thresh1, nla_get_u32(tb[NDTA_THRESH1])); if (tb[NDTA_THRESH2]) WRITE_ONCE(tbl->gc_thresh2, nla_get_u32(tb[NDTA_THRESH2])); if (tb[NDTA_THRESH3]) WRITE_ONCE(tbl->gc_thresh3, nla_get_u32(tb[NDTA_THRESH3])); if (tb[NDTA_GC_INTERVAL]) WRITE_ONCE(tbl->gc_interval, nla_get_msecs(tb[NDTA_GC_INTERVAL])); err = 0; errout_tbl_lock: spin_unlock_bh(&tbl->lock); rcu_read_unlock(); errout: return err; } static int neightbl_valid_dump_info(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct ndtmsg *ndtm; ndtm = nlmsg_payload(nlh, 
sizeof(*ndtm)); if (!ndtm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request"); return -EINVAL; } if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ndtm))) { NL_SET_ERR_MSG(extack, "Invalid data after header in neighbor table dump request"); return -EINVAL; } return 0; } static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); int family, tidx, nidx = 0; int tbl_skip = cb->args[0]; int neigh_skip = cb->args[1]; struct neigh_table *tbl; if (cb->strict_check) { int err = neightbl_valid_dump_info(nlh, cb->extack); if (err < 0) return err; } family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; rcu_read_lock(); for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; tbl = rcu_dereference(neigh_tables[tidx]); if (!tbl) continue; if (tidx < tbl_skip || (family && tbl->family != family)) continue; if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) break; nidx = 0; p = list_next_entry(&tbl->parms, list); list_for_each_entry_from_rcu(p, &tbl->parms_list, list) { if (!net_eq(neigh_parms_net(p), net)) continue; if (nidx < neigh_skip) goto next; if (neightbl_fill_param_info(skb, tbl, p, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) goto out; next: nidx++; } neigh_skip = 0; } out: rcu_read_unlock(); cb->args[0] = tidx; cb->args[1] = nidx; return skb->len; } static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, u32 pid, u32 seq, int type, unsigned int flags) { u32 neigh_flags, neigh_flags_ext; unsigned long now = jiffies; struct nda_cacheinfo ci; struct nlmsghdr *nlh; struct ndmsg *ndm; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; neigh_flags_ext = neigh->flags >> NTF_EXT_SHIFT; neigh_flags = neigh->flags & NTF_OLD_MASK; ndm = nlmsg_data(nlh); ndm->ndm_family = neigh->ops->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = neigh_flags; ndm->ndm_type = neigh->type; ndm->ndm_ifindex = neigh->dev->ifindex; if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key)) goto nla_put_failure; read_lock_bh(&neigh->lock); ndm->ndm_state = neigh->nud_state; if (neigh->nud_state & NUD_VALID) { char haddr[MAX_ADDR_LEN]; neigh_ha_snapshot(haddr, neigh, neigh->dev); if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) { read_unlock_bh(&neigh->lock); goto nla_put_failure; } } ci.ndm_used = jiffies_to_clock_t(now - neigh->used); ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed); ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated); ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1; read_unlock_bh(&neigh->lock); if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) || nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol)) goto nla_put_failure; if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 pid, u32 seq, int type, unsigned int flags, struct neigh_table *tbl) { u32 neigh_flags, neigh_flags_ext; struct nlmsghdr *nlh; struct ndmsg *ndm; u8 protocol; 
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; neigh_flags = READ_ONCE(pn->flags); neigh_flags_ext = neigh_flags >> NTF_EXT_SHIFT; neigh_flags &= NTF_OLD_MASK; ndm = nlmsg_data(nlh); ndm->ndm_family = tbl->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = neigh_flags | NTF_PROXY; ndm->ndm_type = RTN_UNICAST; ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0; ndm->ndm_state = NUD_NONE; if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; protocol = READ_ONCE(pn->protocol); if (protocol && nla_put_u8(skb, NDA_PROTOCOL, protocol)) goto nla_put_failure; if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid) { call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid); } static bool neigh_master_filtered(struct net_device *dev, int master_idx) { struct net_device *master; if (!master_idx) return false; master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL; /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another * invalid value for ifindex to denote "no master". */ if (master_idx == -1) return !!master; if (!master || master->ifindex != master_idx) return true; return false; } static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx) { if (filter_idx && (!dev || dev->ifindex != filter_idx)) return true; return false; } struct neigh_dump_filter { int master_idx; int dev_idx; }; static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb, struct neigh_dump_filter *filter) { struct net *net = sock_net(skb->sk); struct neighbour *n; int err = 0, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; unsigned int flags = NLM_F_MULTI; if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; nht = rcu_dereference(tbl->nht); for (h = s_h; h < (1 << nht->hash_shift); h++) { if (h > s_h) s_idx = 0; idx = 0; neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[h]) { if (idx < s_idx || !net_eq(dev_net(n->dev), net)) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, flags); if (err < 0) goto out; next: idx++; } } out: cb->args[1] = h; cb->args[2] = idx; return err; } static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb, struct neigh_dump_filter *filter) { struct pneigh_entry *n; struct net *net = sock_net(skb->sk); int err = 0, h, s_h = cb->args[3]; int idx, s_idx = idx = cb->args[4]; unsigned int flags = NLM_F_MULTI; if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; for (h = s_h; h <= PNEIGH_HASHMASK; h++) { if (h > s_h) s_idx = 0; for (n = rcu_dereference(tbl->phash_buckets[h]), idx = 0; n; n = rcu_dereference(n->next)) { if (idx < s_idx || pneigh_net(n) != net) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, flags, tbl); if (err < 0) goto out; next: idx++; } } out: cb->args[3] = h; cb->args[4] = idx; return err; } static int 
neigh_valid_dump_req(const struct nlmsghdr *nlh, bool strict_check, struct neigh_dump_filter *filter, struct netlink_ext_ack *extack) { struct nlattr *tb[NDA_MAX + 1]; int err, i; if (strict_check) { struct ndmsg *ndm; ndm = nlmsg_payload(nlh, sizeof(*ndm)); if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request"); return -EINVAL; } if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); return -EINVAL; } if (ndm->ndm_flags & ~NTF_PROXY) { NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor dump request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); } else { err = nlmsg_parse_deprecated(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); } if (err < 0) return err; for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; /* all new attributes should require strict_check */ switch (i) { case NDA_IFINDEX: filter->dev_idx = nla_get_u32(tb[i]); break; case NDA_MASTER: filter->master_idx = nla_get_u32(tb[i]); break; default: if (strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor dump request"); return -EINVAL; } } } return 0; } static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct neigh_dump_filter filter = {}; struct neigh_table *tbl; int t, family, s_t; int proxy = 0; int err; family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; /* check for full ndmsg structure presence, family member is * the same for both structures */ if (nlmsg_len(nlh) >= sizeof(struct ndmsg) && ((struct ndmsg *)nlmsg_data(nlh))->ndm_flags == NTF_PROXY) proxy = 1; err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack); if (err < 0 && cb->strict_check) return err; err = 0; s_t = cb->args[0]; rcu_read_lock(); for (t = 0; t < NEIGH_NR_TABLES; t++) { tbl = rcu_dereference(neigh_tables[t]); if (!tbl) continue; if (t < s_t || (family && tbl->family != family)) continue; if (t > s_t) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (proxy) err = pneigh_dump_table(tbl, skb, cb, &filter); else err = neigh_dump_table(tbl, skb, cb, &filter); if (err < 0) break; } rcu_read_unlock(); cb->args[0] = t; return err; } static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct ndmsg *ndm; int err, i; ndm = nlmsg_payload(nlh, sizeof(*ndm)); if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); return ERR_PTR(-EINVAL); } if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); return ERR_PTR(-EINVAL); } if (ndm->ndm_flags & ~NTF_PROXY) { NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request"); return ERR_PTR(-EINVAL); } if (!(ndm->ndm_flags & NTF_PROXY) && !ndm->ndm_ifindex) { NL_SET_ERR_MSG(extack, "No device specified"); return ERR_PTR(-EINVAL); } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) return ERR_PTR(err); for (i = 0; i <= NDA_MAX; ++i) { switch (i) { case NDA_DST: if (!tb[i]) { NL_SET_ERR_ATTR_MISS(extack, NULL, NDA_DST); return ERR_PTR(-EINVAL); } break; default: if (!tb[i]) continue; NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); return ERR_PTR(-EINVAL); } } return ndm; } static 
inline size_t neigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + nla_total_size(sizeof(struct nda_cacheinfo)) + nla_total_size(4) /* NDA_PROBES */ + nla_total_size(4) /* NDA_FLAGS_EXT */ + nla_total_size(1); /* NDA_PROTOCOL */ } static inline size_t pneigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + nla_total_size(4) /* NDA_FLAGS_EXT */ + nla_total_size(1); /* NDA_PROTOCOL */ } static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); u32 pid = NETLINK_CB(in_skb).portid; struct nlattr *tb[NDA_MAX + 1]; struct net_device *dev = NULL; u32 seq = nlh->nlmsg_seq; struct neigh_table *tbl; struct neighbour *neigh; struct sk_buff *skb; struct ndmsg *ndm; void *dst; int err; ndm = neigh_valid_get_req(nlh, tb, extack); if (IS_ERR(ndm)) return PTR_ERR(ndm); if (ndm->ndm_flags & NTF_PROXY) skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); else skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); if (!skb) return -ENOBUFS; rcu_read_lock(); tbl = neigh_find_table(ndm->ndm_family); if (!tbl) { NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); err = -EAFNOSUPPORT; goto err_unlock; } if (nla_len(tb[NDA_DST]) != (int)tbl->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); err = -EINVAL; goto err_unlock; } dst = nla_data(tb[NDA_DST]); if (ndm->ndm_ifindex) { dev = dev_get_by_index_rcu(net, ndm->ndm_ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); err = -ENODEV; goto err_unlock; } } if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; pn = pneigh_lookup(tbl, net, dst, dev); if (!pn) { NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); err = -ENOENT; goto err_unlock; } err = pneigh_fill_info(skb, pn, pid, seq, RTM_NEWNEIGH, 0, tbl); if (err) goto err_unlock; } else { neigh = neigh_lookup(tbl, dst, dev); if (!neigh) { NL_SET_ERR_MSG(extack, "Neighbour entry not found"); err = -ENOENT; goto err_unlock; } err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); neigh_release(neigh); if (err) goto err_unlock; } rcu_read_unlock(); return rtnl_unicast(skb, net, pid); err_unlock: rcu_read_unlock(); kfree_skb(skb); return err; } void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) { int chain; struct neigh_hash_table *nht; rcu_read_lock(); nht = rcu_dereference(tbl->nht); spin_lock_bh(&tbl->lock); /* avoid resizes */ for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct neighbour *n; neigh_for_each_in_bucket(n, &nht->hash_heads[chain]) cb(n, cookie); } spin_unlock_bh(&tbl->lock); rcu_read_unlock(); } EXPORT_SYMBOL(neigh_for_each); /* The tbl->lock must be held as a writer and BH disabled.
*/ void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)) { struct neigh_hash_table *nht; int chain; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct hlist_node *tmp; struct neighbour *n; neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) { int release; write_lock(&n->lock); release = cb(n); if (release) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); } write_unlock(&n->lock); if (release) neigh_cleanup_and_release(n); } } } EXPORT_SYMBOL(__neigh_for_each_release); int neigh_xmit(int index, struct net_device *dev, const void *addr, struct sk_buff *skb) { int err = -EAFNOSUPPORT; if (likely(index < NEIGH_NR_TABLES)) { struct neigh_table *tbl; struct neighbour *neigh; rcu_read_lock(); tbl = rcu_dereference(neigh_tables[index]); if (!tbl) goto out_unlock; if (index == NEIGH_ARP_TABLE) { u32 key = *((u32 *)addr); neigh = __ipv4_neigh_lookup_noref(dev, key); } else { neigh = __neigh_lookup_noref(tbl, addr, dev); } if (!neigh) neigh = __neigh_create(tbl, addr, dev, false); err = PTR_ERR(neigh); if (IS_ERR(neigh)) { rcu_read_unlock(); goto out_kfree_skb; } err = READ_ONCE(neigh->output)(neigh, skb); out_unlock: rcu_read_unlock(); } else if (index == NEIGH_LINK_TABLE) { err = dev_hard_header(skb, dev, ntohs(skb->protocol), addr, NULL, skb->len); if (err < 0) goto out_kfree_skb; err = dev_queue_xmit(skb); } out: return err; out_kfree_skb: kfree_skb(skb); goto out; } EXPORT_SYMBOL(neigh_xmit); #ifdef CONFIG_PROC_FS static struct neighbour *neigh_get_valid(struct seq_file *seq, struct neighbour *n, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); if (!net_eq(dev_net(n->dev), net)) return NULL; if (state->neigh_sub_iter) { loff_t fakep = 0; void *v; v = state->neigh_sub_iter(state, n, pos ? pos : &fakep); if (!v) return NULL; if (pos) return v; } if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) return n; if (READ_ONCE(n->nud_state) & ~NUD_NOARP) return n; return NULL; } static struct neighbour *neigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; struct neigh_hash_table *nht = state->nht; struct neighbour *n, *tmp; state->flags &= ~NEIGH_SEQ_IS_PNEIGH; while (++state->bucket < (1 << nht->hash_shift)) { neigh_for_each_in_bucket(n, &nht->hash_heads[state->bucket]) { tmp = neigh_get_valid(seq, n, NULL); if (tmp) return tmp; } } return NULL; } static struct neighbour *neigh_get_next(struct seq_file *seq, struct neighbour *n, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct neighbour *tmp; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); if (v) return n; } hlist_for_each_entry_continue(n, hash) { tmp = neigh_get_valid(seq, n, pos); if (tmp) { n = tmp; goto out; } } n = neigh_get_first(seq); out: if (n && pos) --(*pos); return n; } static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos) { struct neighbour *n = neigh_get_first(seq); if (n) { --(*pos); while (*pos) { n = neigh_get_next(seq, n, pos); if (!n) break; } } return *pos ? 
NULL : n; } static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; struct pneigh_entry *pn = NULL; int bucket; state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { pn = rcu_dereference(tbl->phash_buckets[bucket]); while (pn && !net_eq(pneigh_net(pn), net)) pn = rcu_dereference(pn->next); if (pn) break; } state->bucket = bucket; return pn; } static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct pneigh_entry *pn, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; do { pn = rcu_dereference(pn->next); } while (pn && !net_eq(pneigh_net(pn), net)); while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; pn = rcu_dereference(tbl->phash_buckets[state->bucket]); while (pn && !net_eq(pneigh_net(pn), net)) pn = rcu_dereference(pn->next); if (pn) break; } if (pn && pos) --(*pos); return pn; } static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos) { struct pneigh_entry *pn = pneigh_get_first(seq); if (pn) { --(*pos); while (*pos) { pn = pneigh_get_next(seq, pn, pos); if (!pn) break; } } return *pos ? NULL : pn; } static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) { struct neigh_seq_state *state = seq->private; void *rc; loff_t idxpos = *pos; rc = neigh_get_idx(seq, &idxpos); if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY)) rc = pneigh_get_idx(seq, &idxpos); return rc; } void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) __acquires(tbl->lock) __acquires(rcu) { struct neigh_seq_state *state = seq->private; state->tbl = tbl; state->bucket = -1; state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); rcu_read_lock(); state->nht = rcu_dereference(tbl->nht); spin_lock_bh(&tbl->lock); return *pos ? 
neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } EXPORT_SYMBOL(neigh_seq_start); void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct neigh_seq_state *state; void *rc; if (v == SEQ_START_TOKEN) { rc = neigh_get_first(seq); goto out; } state = seq->private; if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) { rc = neigh_get_next(seq, v, NULL); if (rc) goto out; if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY)) rc = pneigh_get_first(seq); } else { BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY); rc = pneigh_get_next(seq, v, NULL); } out: ++(*pos); return rc; } EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) __releases(tbl->lock) __releases(rcu) { struct neigh_seq_state *state = seq->private; struct neigh_table *tbl = state->tbl; spin_unlock_bh(&tbl->lock); rcu_read_unlock(); } EXPORT_SYMBOL(neigh_seq_stop); /* statistics via seq_file */ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) { struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } return NULL; } static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } (*pos)++; return NULL; } static void neigh_stat_seq_stop(struct seq_file *seq, void *v) { } static int neigh_stat_seq_show(struct seq_file *seq, void *v) { struct neigh_table *tbl = pde_data(file_inode(seq->file)); struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); return 0; } seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " "%08lx %08lx %08lx " "%08lx %08lx %08lx\n", atomic_read(&tbl->entries), st->allocs, st->destroys, st->hash_grows, st->lookups, st->hits, st->res_failed, st->rcv_probes_mcast, st->rcv_probes_ucast, st->periodic_gc_runs, st->forced_gc_runs, st->unres_discards, st->table_fulls ); return 0; } static const struct seq_operations neigh_stat_seq_ops = { .start = neigh_stat_seq_start, .next = neigh_stat_seq_next, .stop = neigh_stat_seq_stop, .show = neigh_stat_seq_show, }; #endif /* CONFIG_PROC_FS */ static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid) { struct sk_buff *skb; int err = -ENOBUFS; struct net *net; rcu_read_lock(); net = dev_net_rcu(n->dev); skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; err = neigh_fill_info(skb, n, pid, 0, type, flags); if (err < 0) { /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); goto out; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); out: rcu_read_unlock(); } void neigh_app_ns(struct neighbour *n) { __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0); } EXPORT_SYMBOL(neigh_app_ns); #ifdef CONFIG_SYSCTL static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); static int proc_unres_qlen(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int size, ret; struct ctl_table tmp = *ctl; tmp.extra1 = SYSCTL_ZERO; tmp.extra2 = &unres_qlen_max; tmp.data = &size; size = *(int 
*)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN); return ret; } static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, int index) { struct net_device *dev; int family = neigh_parms_family(p); rcu_read_lock(); for_each_netdev_rcu(net, dev) { struct neigh_parms *dst_p = neigh_get_dev_parms_rcu(dev, family); if (dst_p && !test_bit(index, dst_p->data_state)) dst_p->data[index] = p->data[index]; } rcu_read_unlock(); } static void neigh_proc_update(const struct ctl_table *ctl, int write) { struct net_device *dev = ctl->extra1; struct neigh_parms *p = ctl->extra2; struct net *net = neigh_parms_net(p); int index = (int *) ctl->data - p->data; if (!write) return; set_bit(index, p->data_state); if (index == NEIGH_VAR_DELAY_PROBE_TIME) call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); if (!dev) /* NULL dev means this is default value */ neigh_copy_dflt_parms(net, p, index); } static int neigh_proc_dointvec_zero_intmax(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; tmp.extra1 = SYSCTL_ZERO; tmp.extra2 = SYSCTL_INT_MAX; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } static int neigh_proc_dointvec_ms_jiffies_positive(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; int min = msecs_to_jiffies(1); tmp.extra1 = &min; tmp.extra2 = NULL; ret = proc_dointvec_ms_jiffies_minmax(&tmp, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec); int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); static int neigh_proc_dointvec_userhz_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); static int neigh_proc_dointvec_unres_qlen(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct neigh_parms *p = ctl->extra2; int ret; if (strcmp(ctl->procname, "base_reachable_time") == 0) ret = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); else if (strcmp(ctl->procname, "base_reachable_time_ms") == 0) ret = neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); else ret = -1; if (write && ret == 0) { /* update reachable_time as well, otherwise, the change will * only be effective after the 
next time neigh_periodic_work * decides to recompute it */ neigh_set_reach_time(p); } return ret; } #define NEIGH_PARMS_DATA_OFFSET(index) \ (&((struct neigh_parms *) 0)->data[index]) #define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \ [NEIGH_VAR_ ## attr] = { \ .procname = name, \ .data = NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \ .maxlen = sizeof(int), \ .mode = mval, \ .proc_handler = proc, \ } #define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax) #define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies) #define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies) #define NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies_positive) #define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies) #define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen) static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table neigh_vars[NEIGH_VAR_MAX]; } neigh_sysctl_template __read_mostly = { .neigh_vars = { NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(INTERVAL_PROBE_TIME_MS, "interval_probe_time_ms"), NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"), NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"), NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"), NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"), [NEIGH_VAR_GC_INTERVAL] = { .procname = "gc_interval", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NEIGH_VAR_GC_THRESH1] = { .procname = "gc_thresh1", .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH2] = { .procname = "gc_thresh2", .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH3] = { .procname = "gc_thresh3", .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, }, }; int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, proc_handler *handler) { int i; struct neigh_sysctl_table *t; const char *dev_name_source; char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; char *p_name; 
size_t neigh_vars_size; t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto err; for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) { t->neigh_vars[i].data += (long) p; t->neigh_vars[i].extra1 = dev; t->neigh_vars[i].extra2 = p; } neigh_vars_size = ARRAY_SIZE(t->neigh_vars); if (dev) { dev_name_source = dev->name; /* Terminate the table early */ neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1; } else { struct neigh_table *tbl = p->tbl; dev_name_source = "default"; t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval; t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1; t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2; t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3; } if (handler) { /* RetransTime */ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler; /* ReachableTime */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler; /* RetransTime (in milliseconds)*/ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler; /* ReachableTime (in milliseconds) */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler; } else { /* Those handlers will update p->reachable_time after * base_reachable_time(_ms) is set to ensure the new timer starts being * applied after the next neighbour update instead of waiting for * neigh_periodic_work to update its value (can be multiple minutes) * So any handler that replaces them should do this as well */ /* ReachableTime */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = neigh_proc_base_reachable_time; /* ReachableTime (in milliseconds) */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = neigh_proc_base_reachable_time; } switch (neigh_parms_family(p)) { case AF_INET: p_name = "ipv4"; break; case AF_INET6: p_name = "ipv6"; break; default: BUG(); } snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s", p_name, dev_name_source); t->sysctl_header = register_net_sysctl_sz(neigh_parms_net(p), neigh_path, t->neigh_vars, neigh_vars_size); if (!t->sysctl_header) goto free; p->sysctl_table = t; return 0; free: kfree(t); err: return -ENOBUFS; } EXPORT_SYMBOL(neigh_sysctl_register); void neigh_sysctl_unregister(struct neigh_parms *p) { if (p->sysctl_table) { struct neigh_sysctl_table *t = p->sysctl_table; p->sysctl_table = NULL; unregister_net_sysctl_table(t->sysctl_header); kfree(t); } } EXPORT_SYMBOL(neigh_sysctl_unregister); #endif /* CONFIG_SYSCTL */ static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWNEIGH, .doit = neigh_add}, {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info, .flags = RTNL_FLAG_DUMP_UNLOCKED}, {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set, .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; static int __init neigh_init(void) { rtnl_register_many(neigh_rtnl_msg_handlers); return 0; } subsys_initcall(neigh_init);
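/*
 * Illustrative sketch, not part of net/core/neigh.c: how the 8-bit
 * ndm->ndm_flags from the netlink header and the 32-bit NDA_FLAGS_EXT
 * attribute are folded into the single neigh->flags word, mirroring
 * neigh_add() (ndm_flags |= ext << NTF_EXT_SHIFT) and neigh_fill_info()
 * above. The two helpers below are hypothetical and exist only for
 * illustration.
 */
static inline u32 example_pack_ndm_flags(u8 ndm_flags, u32 ext_flags)
{
	/* legacy NTF_* flags live in the low byte, extended flags above it */
	return (u32)ndm_flags | (ext_flags << NTF_EXT_SHIFT);
}

static inline void example_unpack_ndm_flags(u32 flags, u8 *ndm_flags,
					    u32 *ext_flags)
{
	*ndm_flags = flags & NTF_OLD_MASK;   /* goes back into ndm->ndm_flags */
	*ext_flags = flags >> NTF_EXT_SHIFT; /* reported via NDA_FLAGS_EXT */
}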
// SPDX-License-Identifier: GPL-2.0 /* * Deferred user space unwinding */ #include <linux/sched/task_stack.h> #include <linux/unwind_deferred.h> #include <linux/sched/clock.h> #include <linux/task_work.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sizes.h> #include <linux/slab.h> #include <linux/mm.h> /* * For requesting a deferred user space stack trace from NMI context * the architecture must support a safe cmpxchg in NMI context. * For those architectures that do not have that, then it cannot ask * for a deferred user space stack trace from an NMI context. If it * does, then it will get -EINVAL. */ #if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) # define CAN_USE_IN_NMI 1 static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt) { u32 old = 0; return try_cmpxchg(&info->id.cnt, &old, cnt); } #else # define CAN_USE_IN_NMI 0 /* When NMIs are not allowed, this always succeeds */ static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt) { info->id.cnt = cnt; return true; } #endif /* Make the cache fit in a 4K page */ #define UNWIND_MAX_ENTRIES \ ((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long)) /* Guards adding to or removing from the list of callbacks */ static DEFINE_MUTEX(callback_mutex); static LIST_HEAD(callbacks); #define RESERVED_BITS (UNWIND_PENDING | UNWIND_USED) /* Zero'd bits are available for assigning callback users */ static unsigned long unwind_mask = RESERVED_BITS; DEFINE_STATIC_SRCU(unwind_srcu); static inline bool unwind_pending(struct unwind_task_info *info) { return atomic_long_read(&info->unwind_mask) & UNWIND_PENDING; } /* * This is a unique percpu identifier for a given task entry context. * Conceptually, it's incremented every time the CPU enters the kernel from * user space, so that each "entry context" on the CPU gets a unique ID. In * reality, as an optimization, it's only incremented on demand for the first * deferred unwind request after a given entry-from-user. * * It's combined with the CPU id to make a systemwide-unique "context cookie".
*/ static DEFINE_PER_CPU(u32, unwind_ctx_ctr); /* * The context cookie is a unique identifier that is assigned to a user * space stacktrace. As the user space stacktrace remains the same while * the task is in the kernel, the cookie is an identifier for the stacktrace. * Although it is possible for the stacktrace to get another cookie if another * request is made after the cookie was cleared and before reentering user * space. */ static u64 get_cookie(struct unwind_task_info *info) { u32 cnt = 1; lockdep_assert_irqs_disabled(); if (info->id.cpu) return info->id.id; /* LSB is always set to ensure 0 is an invalid value */ cnt |= __this_cpu_read(unwind_ctx_ctr) + 2; if (try_assign_cnt(info, cnt)) { /* Update the per cpu counter */ __this_cpu_write(unwind_ctx_ctr, cnt); } /* Interrupts are disabled, the CPU will always be same */ info->id.cpu = smp_processor_id() + 1; /* Must be non zero */ return info->id.id; } /** * unwind_user_faultable - Produce a user stacktrace in faultable context * @trace: The descriptor that will store the user stacktrace * * This must be called in a known faultable context (usually when entering * or exiting user space). Depending on the available implementations * the @trace will be loaded with the addresses of the user space stacktrace * if it can be found. * * Return: 0 on success and negative on error * On success @trace will contain the user space stacktrace */ int unwind_user_faultable(struct unwind_stacktrace *trace) { struct unwind_task_info *info = &current->unwind_info; struct unwind_cache *cache; /* Should always be called from faultable context */ might_fault(); if (!current->mm) return -EINVAL; if (!info->cache) { info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES), GFP_KERNEL); if (!info->cache) return -ENOMEM; } cache = info->cache; trace->entries = cache->entries; trace->nr = cache->nr_entries; /* * The user stack has already been previously unwound in this * entry context. Skip the unwind and use the cache. */ if (trace->nr) return 0; unwind_user(trace, UNWIND_MAX_ENTRIES); cache->nr_entries = trace->nr; /* Clear nr_entries on way back to user space */ atomic_long_or(UNWIND_USED, &info->unwind_mask); return 0; } static void process_unwind_deferred(struct task_struct *task) { struct unwind_task_info *info = &task->unwind_info; struct unwind_stacktrace trace; struct unwind_work *work; unsigned long bits; u64 cookie; if (WARN_ON_ONCE(!unwind_pending(info))) return; /* Clear pending bit but make sure to have the current bits */ bits = atomic_long_fetch_andnot(UNWIND_PENDING, &info->unwind_mask); /* * From here on out, the callback must always be called, even if it's * just an empty trace. 
*/ trace.nr = 0; trace.entries = NULL; unwind_user_faultable(&trace); if (info->cache) bits &= ~(info->cache->unwind_completed); cookie = info->id.id; guard(srcu)(&unwind_srcu); list_for_each_entry_srcu(work, &callbacks, list, srcu_read_lock_held(&unwind_srcu)) { if (test_bit(work->bit, &bits)) { work->func(work, &trace, cookie); if (info->cache) info->cache->unwind_completed |= BIT(work->bit); } } } static void unwind_deferred_task_work(struct callback_head *head) { process_unwind_deferred(current); } void unwind_deferred_task_exit(struct task_struct *task) { struct unwind_task_info *info = &current->unwind_info; if (!unwind_pending(info)) return; process_unwind_deferred(task); task_work_cancel(task, &info->work); } /** * unwind_deferred_request - Request a user stacktrace on task kernel exit * @work: Unwind descriptor requesting the trace * @cookie: The cookie of the first request made for this task * * Schedule a user space unwind to be done in task work before exiting the * kernel. * * The returned @cookie output is the generated cookie of the very first * request for a user space stacktrace for this task since it entered the * kernel. It can be from a request by any caller of this infrastructure. * Its value will also be passed to the callback function. It can be * used to stitch kernel and user stack traces together in post-processing. * * It's valid to call this function multiple times for the same @work within * the same task entry context. Each call will return the same cookie * while the task hasn't left the kernel. If the callback is not pending * because it has already been previously called for the same entry context, * it will be called again with the same stack trace and cookie. * * Return: 0 if the callback successfully was queued. * 1 if the callback is pending or was already executed. * Negative if there's an error. * @cookie holds the cookie of the first request by any user */ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) { struct unwind_task_info *info = &current->unwind_info; int twa_mode = TWA_RESUME; unsigned long old, bits; unsigned long bit; int ret; *cookie = 0; if ((current->flags & (PF_KTHREAD | PF_EXITING)) || !user_mode(task_pt_regs(current))) return -EINVAL; /* * NMI requires having safe cmpxchg operations. * Trigger a warning to make it obvious that an architecture * is using this in NMI when it should not be. */ if (in_nmi()) { if (WARN_ON_ONCE(!CAN_USE_IN_NMI)) return -EINVAL; twa_mode = TWA_NMI_CURRENT; } /* Do not allow cancelled works to request again */ bit = READ_ONCE(work->bit); if (WARN_ON_ONCE(bit < 0)) return -EINVAL; /* Only need the mask now */ bit = BIT(bit); guard(irqsave)(); *cookie = get_cookie(info); old = atomic_long_read(&info->unwind_mask); /* Is this already queued or executed */ if (old & bit) return 1; /* * This work's bit hasn't been set yet. Now set it with the PENDING * bit and fetch the current value of unwind_mask. If ether the * work's bit or PENDING was already set, then this is already queued * to have a callback. */ bits = UNWIND_PENDING | bit; old = atomic_long_fetch_or(bits, &info->unwind_mask); if (old & bits) { /* * If the work's bit was set, whatever set it had better * have also set pending and queued a callback. */ WARN_ON_ONCE(!(old & UNWIND_PENDING)); return old & bit; } /* The work has been claimed, now schedule it. 
*/ ret = task_work_add(current, &info->work, twa_mode); if (WARN_ON_ONCE(ret)) atomic_long_set(&info->unwind_mask, 0); return ret; } void unwind_deferred_cancel(struct unwind_work *work) { struct task_struct *g, *t; int bit; if (!work) return; bit = work->bit; /* No work should be using a reserved bit */ if (WARN_ON_ONCE(BIT(bit) & RESERVED_BITS)) return; guard(mutex)(&callback_mutex); list_del_rcu(&work->list); /* Do not allow any more requests and prevent callbacks */ work->bit = -1; __clear_bit(bit, &unwind_mask); synchronize_srcu(&unwind_srcu); guard(rcu)(); /* Clear this bit from all threads */ for_each_process_thread(g, t) { atomic_long_andnot(BIT(bit), &t->unwind_info.unwind_mask); if (t->unwind_info.cache) clear_bit(bit, &t->unwind_info.cache->unwind_completed); } } int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) { memset(work, 0, sizeof(*work)); guard(mutex)(&callback_mutex); /* See if there's a bit in the mask available */ if (unwind_mask == ~0UL) return -EBUSY; work->bit = ffz(unwind_mask); __set_bit(work->bit, &unwind_mask); list_add_rcu(&work->list, &callbacks); work->func = func; return 0; } void unwind_task_init(struct task_struct *task) { struct unwind_task_info *info = &task->unwind_info; memset(info, 0, sizeof(*info)); init_task_work(&info->work, unwind_deferred_task_work); atomic_long_set(&info->unwind_mask, 0); } void unwind_task_free(struct task_struct *task) { struct unwind_task_info *info = &task->unwind_info; kfree(info->cache); task_work_cancel(task, &info->work); }
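/*
 * Illustrative sketch, not part of this file: how a tracer might consume the
 * deferred unwind API defined above. The callback body and the places the
 * calls are made from are hypothetical; only unwind_deferred_init(),
 * unwind_deferred_request() and unwind_deferred_cancel() come from this file.
 */
static struct unwind_work example_unwind_work;

/* Runs from task work just before the task returns to user space */
static void example_unwind_callback(struct unwind_work *work,
				    struct unwind_stacktrace *trace, u64 cookie)
{
	/* e.g. emit trace->entries[0 .. trace->nr) tagged with @cookie */
}

static int example_tracer_init(void)
{
	/* Reserves a bit in unwind_mask and adds the work to the callback list */
	return unwind_deferred_init(&example_unwind_work, example_unwind_callback);
}

/* Called from, say, an interrupt that landed while the task ran user code */
static void example_tracer_sample(void)
{
	u64 cookie;

	/* 0: queued, 1: already pending for this entry context, <0: error */
	if (unwind_deferred_request(&example_unwind_work, &cookie) < 0)
		return;
	/* record @cookie with the kernel-side sample to stitch stacks later */
}

static void example_tracer_exit(void)
{
	/* Stops further requests and clears this work's bit from all tasks */
	unwind_deferred_cancel(&example_unwind_work);
}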
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/linux/eventpoll.h ( Efficient event polling implementation ) * Copyright (C) 2001,...,2006 Davide Libenzi * * Davide Libenzi <davidel@xmailserver.org> */ #ifndef _LINUX_EVENTPOLL_H #define _LINUX_EVENTPOLL_H #include <uapi/linux/eventpoll.h> #include <uapi/linux/kcmp.h> /* Forward declarations to avoid compiler errors */ struct file; #ifdef CONFIG_EPOLL #ifdef CONFIG_KCMP struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff); #endif /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); /* Copy ready events to userspace */ int epoll_sendevents(struct file *file, struct epoll_event __user *events, int maxevents); /* * This is called from inside fs/file_table.c:__fput() to unlink files * from the eventpoll interface. We need to have this facility to cleanup * correctly files that are closed without being removed from the eventpoll * interface. */ static inline void eventpoll_release(struct file *file) { /* * Fast check to avoid the get/release of the semaphore. Since * we're doing this outside the semaphore lock, it might return * false negatives, but we don't care. It'll help in 99.99% of cases * to avoid the semaphore lock. False positives simply cannot happen * because the file is on the way to be removed and nobody ( but * eventpoll ) has still a reference to this file. */ if (likely(!READ_ONCE(file->f_ep))) return; /* * The file is being closed while it is still linked to an epoll * descriptor. We need to handle this by correctly unlinking it * from its containers. */ eventpoll_release_file(file); } int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock); /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) { return op != EPOLL_CTL_DEL; } #else static inline void eventpoll_release(struct file *file) {} #endif #if defined(CONFIG_ARM) && defined(CONFIG_OABI_COMPAT) /* ARM OABI has an incompatible struct layout and needs a special handler */ extern struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent); #else static inline struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent) { if (__put_user(revents, &uevent->events) || __put_user(data, &uevent->data)) return NULL; return uevent+1; } #endif #endif /* #ifndef _LINUX_EVENTPOLL_H */
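/*
 * Illustrative sketch, not part of this header: the shape of the copy loop
 * that a caller such as fs/eventpoll.c is expected to build around
 * epoll_put_uevent(). The helper below and its arguments are hypothetical;
 * only epoll_put_uevent() is declared above.
 */
static inline int example_copy_ready_events(struct epoll_event __user *uevent,
					    const struct epoll_event *ready,
					    int nready)
{
	int i;

	for (i = 0; i < nready; i++) {
		/* returns the next user slot, or NULL if the copy faulted */
		uevent = epoll_put_uevent(ready[i].events, ready[i].data, uevent);
		if (!uevent)
			return i ? i : -EFAULT;
	}

	return i;
}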
// SPDX-License-Identifier: GPL-2.0-or-later /* xfrm6_protocol.c - Generic xfrm protocol multiplexer for ipv6. * * Copyright (C) 2013 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> * * Based on: * net/ipv4/xfrm4_protocol.c */ #include <linux/init.h> #include <linux/mutex.h> #include <linux/skbuff.h> #include <linux/icmpv6.h> #include <net/ip6_route.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/xfrm.h> static struct xfrm6_protocol __rcu *esp6_handlers __read_mostly; static struct xfrm6_protocol __rcu *ah6_handlers __read_mostly; static struct xfrm6_protocol __rcu *ipcomp6_handlers __read_mostly; static DEFINE_MUTEX(xfrm6_protocol_mutex); static inline struct xfrm6_protocol __rcu **proto_handlers(u8 protocol) { switch (protocol) { case IPPROTO_ESP: return &esp6_handlers; case IPPROTO_AH: return &ah6_handlers; case IPPROTO_COMP: return &ipcomp6_handlers; } return NULL; } #define for_each_protocol_rcu(head, handler) \ for (handler = rcu_dereference(head); \ handler != NULL; \ handler = rcu_dereference(handler->next)) \ static int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err) { int ret; struct xfrm6_protocol *handler; struct xfrm6_protocol __rcu **head = proto_handlers(protocol); if (!head) return 0; for_each_protocol_rcu(*proto_handlers(protocol), handler) if ((ret = handler->cb_handler(skb, err)) <= 0) return ret; return 0; } int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { int ret; struct xfrm6_protocol *handler; struct xfrm6_protocol __rcu **head = proto_handlers(nexthdr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); if (!head) goto out; if (!skb_dst(skb)) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); int flags = RT6_LOOKUP_F_HAS_SADDR; struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .daddr = ip6h->daddr, .saddr = ip6h->saddr, .flowlabel = ip6_flowinfo(ip6h), .flowi6_mark = skb->mark, .flowi6_proto = ip6h->nexthdr, }; dst = ip6_route_input_lookup(dev_net(skb->dev), skb->dev, &fl6, skb, flags); if (dst->error) goto drop; skb_dst_set(skb, dst); } for_each_protocol_rcu(*head, handler) if ((ret = handler->input_handler(skb, nexthdr, spi,
						  encap_type)) != -EINVAL)
			return ret;

out:
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL(xfrm6_rcv_encap);

static int xfrm6_esp_rcv(struct sk_buff *skb)
{
	int ret;
	struct xfrm6_protocol *handler;

	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;

	for_each_protocol_rcu(esp6_handlers, handler)
		if ((ret = handler->handler(skb)) != -EINVAL)
			return ret;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);

	kfree_skb(skb);
	return 0;
}

static int xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
			 u8 type, u8 code, int offset, __be32 info)
{
	struct xfrm6_protocol *handler;

	for_each_protocol_rcu(esp6_handlers, handler)
		if (!handler->err_handler(skb, opt, type, code, offset, info))
			return 0;

	return -ENOENT;
}

static int xfrm6_ah_rcv(struct sk_buff *skb)
{
	int ret;
	struct xfrm6_protocol *handler;

	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;

	for_each_protocol_rcu(ah6_handlers, handler)
		if ((ret = handler->handler(skb)) != -EINVAL)
			return ret;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);

	kfree_skb(skb);
	return 0;
}

static int xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
			u8 type, u8 code, int offset, __be32 info)
{
	struct xfrm6_protocol *handler;

	for_each_protocol_rcu(ah6_handlers, handler)
		if (!handler->err_handler(skb, opt, type, code, offset, info))
			return 0;

	return -ENOENT;
}

static int xfrm6_ipcomp_rcv(struct sk_buff *skb)
{
	int ret;
	struct xfrm6_protocol *handler;

	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;

	for_each_protocol_rcu(ipcomp6_handlers, handler)
		if ((ret = handler->handler(skb)) != -EINVAL)
			return ret;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);

	kfree_skb(skb);
	return 0;
}

static int xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
			    u8 type, u8 code, int offset, __be32 info)
{
	struct xfrm6_protocol *handler;

	for_each_protocol_rcu(ipcomp6_handlers, handler)
		if (!handler->err_handler(skb, opt, type, code, offset, info))
			return 0;

	return -ENOENT;
}

static const struct inet6_protocol esp6_protocol = {
	.handler	=	xfrm6_esp_rcv,
	.err_handler	=	xfrm6_esp_err,
	.flags		=	INET6_PROTO_NOPOLICY,
};

static const struct inet6_protocol ah6_protocol = {
	.handler	=	xfrm6_ah_rcv,
	.err_handler	=	xfrm6_ah_err,
	.flags		=	INET6_PROTO_NOPOLICY,
};

static const struct inet6_protocol ipcomp6_protocol = {
	.handler	=	xfrm6_ipcomp_rcv,
	.err_handler	=	xfrm6_ipcomp_err,
	.flags		=	INET6_PROTO_NOPOLICY,
};

static const struct xfrm_input_afinfo xfrm6_input_afinfo = {
	.family		=	AF_INET6,
	.callback	=	xfrm6_rcv_cb,
};

static inline const struct inet6_protocol *netproto(unsigned char protocol)
{
	switch (protocol) {
	case IPPROTO_ESP:
		return &esp6_protocol;
	case IPPROTO_AH:
		return &ah6_protocol;
	case IPPROTO_COMP:
		return &ipcomp6_protocol;
	}

	return NULL;
}

int xfrm6_protocol_register(struct xfrm6_protocol *handler,
			    unsigned char protocol)
{
	struct xfrm6_protocol __rcu **pprev;
	struct xfrm6_protocol *t;
	bool add_netproto = false;
	int ret = -EEXIST;
	int priority = handler->priority;

	if (!proto_handlers(protocol) || !netproto(protocol))
		return -EINVAL;

	mutex_lock(&xfrm6_protocol_mutex);

	if (!rcu_dereference_protected(*proto_handlers(protocol),
				       lockdep_is_held(&xfrm6_protocol_mutex)))
		add_netproto = true;

	for (pprev = proto_handlers(protocol);
	     (t = rcu_dereference_protected(*pprev,
			lockdep_is_held(&xfrm6_protocol_mutex))) != NULL;
	     pprev = &t->next) {
		if (t->priority < priority)
			break;
		if (t->priority == priority)
			goto err;
	}

	handler->next = *pprev;
	rcu_assign_pointer(*pprev, handler);
	ret = 0;

err:
	mutex_unlock(&xfrm6_protocol_mutex);

	if (add_netproto) {
		if (inet6_add_protocol(netproto(protocol), protocol)) {
			pr_err("%s: can't add protocol\n", __func__);
			ret = -EAGAIN;
		}
	}

	return ret;
}
EXPORT_SYMBOL(xfrm6_protocol_register);

int xfrm6_protocol_deregister(struct xfrm6_protocol *handler,
			      unsigned char protocol)
{
	struct xfrm6_protocol __rcu **pprev;
	struct xfrm6_protocol *t;
	int ret = -ENOENT;

	if (!proto_handlers(protocol) || !netproto(protocol))
		return -EINVAL;

	mutex_lock(&xfrm6_protocol_mutex);

	for (pprev = proto_handlers(protocol);
	     (t = rcu_dereference_protected(*pprev,
			lockdep_is_held(&xfrm6_protocol_mutex))) != NULL;
	     pprev = &t->next) {
		if (t == handler) {
			*pprev = handler->next;
			ret = 0;
			break;
		}
	}

	if (!rcu_dereference_protected(*proto_handlers(protocol),
				       lockdep_is_held(&xfrm6_protocol_mutex))) {
		if (inet6_del_protocol(netproto(protocol), protocol) < 0) {
			pr_err("%s: can't remove protocol\n", __func__);
			ret = -EAGAIN;
		}
	}

	mutex_unlock(&xfrm6_protocol_mutex);

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(xfrm6_protocol_deregister);

int __init xfrm6_protocol_init(void)
{
	return xfrm_input_register_afinfo(&xfrm6_input_afinfo);
}

void xfrm6_protocol_fini(void)
{
	xfrm_input_unregister_afinfo(&xfrm6_input_afinfo);
}
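Illustrative sketch (not from the file above): how a hypothetical IPsec protocol module could plug into this multiplexer, loosely modelled on the way net/ipv6/esp6.c registers itself. All my_esp_* names are placeholders; only the struct xfrm6_protocol callbacks and xfrm6_protocol_register()/xfrm6_protocol_deregister() are taken from the code above.

static int my_esp_rcv(struct sk_buff *skb) { return -EINVAL; }
static int my_esp_input(struct sk_buff *skb, int nexthdr, __be32 spi,
			int encap_type) { return -EINVAL; }
static int my_esp_rcv_cb(struct sk_buff *skb, int err) { return 0; }
static int my_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
		      u8 type, u8 code, int offset, __be32 info) { return 0; }

static struct xfrm6_protocol my_esp6_protocol = {
	.handler	= my_esp_rcv,
	.input_handler	= my_esp_input,
	.cb_handler	= my_esp_rcv_cb,
	.err_handler	= my_esp_err,
	.priority	= 0,	/* higher priorities are tried first */
};

static int __init my_esp6_init(void)
{
	/*
	 * Fails with -EEXIST if another handler already registered this
	 * priority for IPPROTO_ESP; the first registration for a protocol
	 * also hooks the inet6 protocol handler.
	 */
	return xfrm6_protocol_register(&my_esp6_protocol, IPPROTO_ESP);
}

static void __exit my_esp6_exit(void)
{
	xfrm6_protocol_deregister(&my_esp6_protocol, IPPROTO_ESP);
}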
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Macros for manipulating and testing page->flags
 */

#ifndef PAGE_FLAGS_H
#define PAGE_FLAGS_H

#include <linux/types.h>
#include <linux/bug.h>
#include <linux/mmdebug.h>
#ifndef __GENERATING_BOUNDS_H
#include <linux/mm_types.h>
#include <generated/bounds.h>
#endif /* !__GENERATING_BOUNDS_H */

/*
 * Various page->flags bits:
 *
 * PG_reserved is set for special pages. The "struct page" of such a page
 * should in general not be touched (e.g. set dirty) except by its owner.
 * Pages marked as PG_reserved include:
 * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS,
 *   initrd, HW tables)
 * - Pages reserved or allocated early during boot (before the page allocator
 *   was initialized). This includes (depending on the architecture) the
 *   initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much
 *   much more. Once (if ever) freed, PG_reserved is cleared and they will
 *   be given to the page allocator.
 * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying
 *   to read/write these pages might end badly. Don't touch!
 * - The zero page(s)
 * - Pages allocated in the context of kexec/kdump (loaded kernel image,
 *   control pages, vmcoreinfo)
 * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are
 *   not marked PG_reserved (as they might be in use by somebody else who does
 *   not respect the caching strategy).
 * - MCA pages on ia64
 * - Pages holding CPU notes for POWER Firmware Assisted Dump
 * - Device memory (e.g. PMEM, DAX, HMM)
 * Some PG_reserved pages will be excluded from the hibernation image.
 * PG_reserved does in general not hinder anybody from dumping or swapping
 * and is no longer required for remap_pfn_range(). ioremap might require it.
* Consequently, PG_reserved for a page mapped into user space can indicate * the zero page, the vDSO, MMIO pages or device memory. * * The PG_private bitflag is set on pagecache pages if they contain filesystem * specific data (which is normally at page->private). It can be used by * private allocations for its own usage. * * During initiation of disk I/O, PG_locked is set. This bit is set before I/O * and cleared when writeback _starts_ or when read _completes_. PG_writeback * is set before writeback starts and cleared when it finishes. * * PG_locked also pins a page in pagecache, and blocks truncation of the file * while it is held. * * page_waitqueue(page) is a wait queue of all tasks waiting for the page * to become unlocked. * * PG_swapbacked is set when a page uses swap as a backing storage. This are * usually PageAnon or shmem pages but please note that even anonymous pages * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as * a result of MADV_FREE). * * PG_referenced, PG_reclaim are used for page reclaim for anonymous and * file-backed pagecache (see mm/vmscan.c). * * PG_arch_1 is an architecture specific page state bit. The generic code * guarantees that this bit is cleared for a page when it first is entered into * the page cache. * * PG_hwpoison indicates that a page got corrupted in hardware and contains * data with incorrect ECC bits that triggered a machine check. Accessing is * not safe since it may cause another machine check. Don't touch! */ /* * Don't use the pageflags directly. Use the PageFoo macros. * * The page flags field is split into two parts, the main flags area * which extends from the low bits upwards, and the fields area which * extends from the high bits downwards. * * | FIELD | ... | FLAGS | * N-1 ^ 0 * (NR_PAGEFLAGS) * * The fields area is reserved for fields mapping zone, node (for NUMA) and * SPARSEMEM section (for variants of SPARSEMEM that require section ids like * SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP). */ enum pageflags { PG_locked, /* Page is locked. Don't touch. */ PG_writeback, /* Page is under writeback */ PG_referenced, PG_uptodate, PG_dirty, PG_lru, PG_head, /* Must be in bit 6 */ PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_active, PG_workingset, PG_owner_priv_1, /* Owner use. If pagecache, fs may use */ PG_owner_2, /* Owner use. If pagecache, fs may use */ PG_arch_1, PG_reserved, PG_private, /* If pagecache, has fs-private data */ PG_private_2, /* If pagecache, has fs aux data */ PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ PG_dropbehind, /* drop pages on IO completion */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) PG_young, PG_idle, #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_2 PG_arch_2, #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_3 PG_arch_3, #endif __NR_PAGEFLAGS, PG_readahead = PG_reclaim, /* Anonymous memory (and shmem) */ PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ /* Some filesystems */ PG_checked = PG_owner_priv_1, /* * Depending on the way an anonymous folio can be mapped into a page * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped * THP), PG_anon_exclusive may be set only for the head page or for * tail pages of an anonymous folio. 
For now, we only expect it to be * set on tail pages for PTE-mapped THP. */ PG_anon_exclusive = PG_owner_2, /* * Set if all buffer heads in the folio are mapped. * Filesystems which do not use BHs can use it for their own purpose. */ PG_mappedtodisk = PG_owner_2, /* Two page bits are conscripted by FS-Cache to maintain local caching * state. These bits are set on pages belonging to the netfs's inodes * when those inodes are being locally cached. */ PG_fscache = PG_private_2, /* page backed by cache */ /* XEN */ /* Pinned in Xen as a read-only pagetable page. */ PG_pinned = PG_owner_priv_1, /* Pinned as part of domain save (see xen_mm_pin_all()). */ PG_savepinned = PG_dirty, /* Has a grant mapping of another (foreign) domain's page. */ PG_foreign = PG_owner_priv_1, /* Remapped by swiotlb-xen. */ PG_xen_remapped = PG_owner_priv_1, #ifdef CONFIG_MIGRATION /* movable_ops page that is isolated for migration */ PG_movable_ops_isolated = PG_reclaim, /* this is a movable_ops page (for selected typed pages only) */ PG_movable_ops = PG_uptodate, #endif /* Only valid for buddy pages. Used to track pages that are reported */ PG_reported = PG_uptodate, #ifdef CONFIG_MEMORY_HOTPLUG /* For self-hosted memmap pages */ PG_vmemmap_self_hosted = PG_owner_priv_1, #endif /* * Flags only valid for compound pages. Stored in first tail page's * flags word. Cannot use the first 8 flags or any flag marked as * PF_ANY. */ /* At least one page in this folio has the hwpoison flag set */ PG_has_hwpoisoned = PG_active, PG_large_rmappable = PG_workingset, /* anon or file-backed */ PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */ }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) #ifndef __GENERATING_BOUNDS_H #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); /* * Return the real head page struct iff the @page is a fake head page, otherwise * return the @page itself. See Documentation/mm/vmemmap_dedup.rst. */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) return page; /* * Only addresses aligned with PAGE_SIZE of struct page may be fake head * struct page. The alignment check aims to avoid access the fields ( * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly) * cold cacheline in some cases. */ if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && test_bit(PG_head, &page->flags.f)) { /* * We can safely access the field of the @page[1] with PG_head * because the @page is a compound page composed with at least * two contiguous pages. */ unsigned long head = READ_ONCE(page[1].compound_head); if (likely(head & 1)) return (const struct page *)(head - 1); } return page; } static __always_inline bool page_count_writable(const struct page *page, int u) { if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) return true; /* * The refcount check is ordered before the fake-head check to prevent * the following race: * CPU 1 (HVO) CPU 2 (speculative PFN walker) * * page_ref_freeze() * synchronize_rcu() * rcu_read_lock() * page_is_fake_head() is false * vmemmap_remap_pte() * XXX: struct page[] becomes r/o * * page_ref_unfreeze() * page_ref_count() is not zero * * atomic_add_unless(&page->_refcount) * XXX: try to modify r/o struct page[] * * The refcount check also prevents modification attempts to other (r/o) * tail pages that are not fake heads. 
*/ if (atomic_read_acquire(&page->_refcount) == u) return false; return page_fixed_fake_head(page) == page; } #else static inline const struct page *page_fixed_fake_head(const struct page *page) { return page; } static inline bool page_count_writable(const struct page *page, int u) { return true; } #endif static __always_inline int page_is_fake_head(const struct page *page) { return page_fixed_fake_head(page) != page; } static __always_inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); if (unlikely(head & 1)) return head - 1; return (unsigned long)page_fixed_fake_head(page); } #define compound_head(page) ((typeof(page))_compound_head(page)) /** * page_folio - Converts from page to folio. * @p: The page. * * Every page is part of a folio. This function cannot be called on a * NULL pointer. * * Context: No reference, nor lock is required on @page. If the caller * does not hold a reference, this call may race with a folio split, so * it should re-check the folio still contains this page after gaining * a reference on the folio. * Return: The folio which contains this page. */ #define page_folio(p) (_Generic((p), \ const struct page *: (const struct folio *)_compound_head(p), \ struct page *: (struct folio *)_compound_head(p))) /** * folio_page - Return a page from a folio. * @folio: The folio. * @n: The page number to return. * * @n is relative to the start of the folio. This function does not * check that the page number lies within @folio; the caller is presumed * to have a reference to the page. */ #define folio_page(folio, n) (&(folio)->page + (n)) static __always_inline int PageTail(const struct page *page) { return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); } static __always_inline int PageCompound(const struct page *page) { return test_bit(PG_head, &page->flags.f) || READ_ONCE(page->compound_head) & 1; } #define PAGE_POISON_PATTERN -1l static inline int PagePoisoned(const struct page *page) { return READ_ONCE(page->flags.f) == PAGE_POISON_PATTERN; } #ifdef CONFIG_DEBUG_VM void page_init_poison(struct page *page, size_t size); #else static inline void page_init_poison(struct page *page, size_t size) { } #endif static const unsigned long *const_folio_flags(const struct folio *folio, unsigned n) { const struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); return &page[n].flags.f; } static unsigned long *folio_flags(struct folio *folio, unsigned n) { struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); return &page[n].flags.f; } /* * Page flags policies wrt compound pages * * PF_POISONED_CHECK * check if this struct page poisoned/uninitialized * * PF_ANY: * the page flag is relevant for small, head and tail pages. * * PF_HEAD: * for compound page all operations related to the page flag applied to * head page. * * PF_NO_TAIL: * modifications of the page flag must be done on small or head pages, * checks can be done on tail pages too. * * PF_NO_COMPOUND: * the page flag is not relevant for compound pages. * * PF_SECOND: * the page flag is stored in the first tail page. 
*/ #define PF_POISONED_CHECK(page) ({ \ VM_BUG_ON_PGFLAGS(PagePoisoned(page), page); \ page; }) #define PF_ANY(page, enforce) PF_POISONED_CHECK(page) #define PF_HEAD(page, enforce) PF_POISONED_CHECK(compound_head(page)) #define PF_NO_TAIL(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \ PF_POISONED_CHECK(compound_head(page)); }) #define PF_NO_COMPOUND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \ PF_POISONED_CHECK(page); }) #define PF_SECOND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(!PageHead(page), page); \ PF_POISONED_CHECK(&page[1]); }) /* Which page is the flag stored in */ #define FOLIO_PF_ANY 0 #define FOLIO_PF_HEAD 0 #define FOLIO_PF_NO_TAIL 0 #define FOLIO_PF_NO_COMPOUND 0 #define FOLIO_PF_SECOND 1 #define FOLIO_HEAD_PAGE 0 #define FOLIO_SECOND_PAGE 1 /* * Macros to create function definitions for page flags */ #define FOLIO_TEST_FLAG(name, page) \ static __always_inline bool folio_test_##name(const struct folio *folio) \ { return test_bit(PG_##name, const_folio_flags(folio, page)); } #define FOLIO_SET_FLAG(name, page) \ static __always_inline void folio_set_##name(struct folio *folio) \ { set_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_CLEAR_FLAG(name, page) \ static __always_inline void folio_clear_##name(struct folio *folio) \ { clear_bit(PG_##name, folio_flags(folio, page)); } #define __FOLIO_SET_FLAG(name, page) \ static __always_inline void __folio_set_##name(struct folio *folio) \ { __set_bit(PG_##name, folio_flags(folio, page)); } #define __FOLIO_CLEAR_FLAG(name, page) \ static __always_inline void __folio_clear_##name(struct folio *folio) \ { __clear_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_TEST_SET_FLAG(name, page) \ static __always_inline bool folio_test_set_##name(struct folio *folio) \ { return test_and_set_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_TEST_CLEAR_FLAG(name, page) \ static __always_inline bool folio_test_clear_##name(struct folio *folio) \ { return test_and_clear_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_FLAG(name, page) \ FOLIO_TEST_FLAG(name, page) \ FOLIO_SET_FLAG(name, page) \ FOLIO_CLEAR_FLAG(name, page) #define TESTPAGEFLAG(uname, lname, policy) \ FOLIO_TEST_FLAG(lname, FOLIO_##policy) \ static __always_inline int Page##uname(const struct page *page) \ { return test_bit(PG_##lname, &policy(page, 0)->flags.f); } #define SETPAGEFLAG(uname, lname, policy) \ FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void SetPage##uname(struct page *page) \ { set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define CLEARPAGEFLAG(uname, lname, policy) \ FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void ClearPage##uname(struct page *page) \ { clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define __SETPAGEFLAG(uname, lname, policy) \ __FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void __SetPage##uname(struct page *page) \ { __set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define __CLEARPAGEFLAG(uname, lname, policy) \ __FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void __ClearPage##uname(struct page *page) \ { __clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define TESTSETFLAG(uname, lname, policy) \ FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestSetPage##uname(struct page *page) \ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define TESTCLEARFLAG(uname, lname, policy) \ FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \ 
static __always_inline int TestClearPage##uname(struct page *page) \ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ SETPAGEFLAG(uname, lname, policy) \ CLEARPAGEFLAG(uname, lname, policy) #define __PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ __SETPAGEFLAG(uname, lname, policy) \ __CLEARPAGEFLAG(uname, lname, policy) #define TESTSCFLAG(uname, lname, policy) \ TESTSETFLAG(uname, lname, policy) \ TESTCLEARFLAG(uname, lname, policy) #define FOLIO_TEST_FLAG_FALSE(name) \ static inline bool folio_test_##name(const struct folio *folio) \ { return false; } #define FOLIO_SET_FLAG_NOOP(name) \ static inline void folio_set_##name(struct folio *folio) { } #define FOLIO_CLEAR_FLAG_NOOP(name) \ static inline void folio_clear_##name(struct folio *folio) { } #define __FOLIO_SET_FLAG_NOOP(name) \ static inline void __folio_set_##name(struct folio *folio) { } #define __FOLIO_CLEAR_FLAG_NOOP(name) \ static inline void __folio_clear_##name(struct folio *folio) { } #define FOLIO_TEST_SET_FLAG_FALSE(name) \ static inline bool folio_test_set_##name(struct folio *folio) \ { return false; } #define FOLIO_TEST_CLEAR_FLAG_FALSE(name) \ static inline bool folio_test_clear_##name(struct folio *folio) \ { return false; } #define FOLIO_FLAG_FALSE(name) \ FOLIO_TEST_FLAG_FALSE(name) \ FOLIO_SET_FLAG_NOOP(name) \ FOLIO_CLEAR_FLAG_NOOP(name) #define TESTPAGEFLAG_FALSE(uname, lname) \ FOLIO_TEST_FLAG_FALSE(lname) \ static inline int Page##uname(const struct page *page) { return 0; } #define SETPAGEFLAG_NOOP(uname, lname) \ FOLIO_SET_FLAG_NOOP(lname) \ static inline void SetPage##uname(struct page *page) { } #define CLEARPAGEFLAG_NOOP(uname, lname) \ FOLIO_CLEAR_FLAG_NOOP(lname) \ static inline void ClearPage##uname(struct page *page) { } #define __CLEARPAGEFLAG_NOOP(uname, lname) \ __FOLIO_CLEAR_FLAG_NOOP(lname) \ static inline void __ClearPage##uname(struct page *page) { } #define TESTSETFLAG_FALSE(uname, lname) \ FOLIO_TEST_SET_FLAG_FALSE(lname) \ static inline int TestSetPage##uname(struct page *page) { return 0; } #define TESTCLEARFLAG_FALSE(uname, lname) \ FOLIO_TEST_CLEAR_FLAG_FALSE(lname) \ static inline int TestClearPage##uname(struct page *page) { return 0; } #define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname) \ SETPAGEFLAG_NOOP(uname, lname) CLEARPAGEFLAG_NOOP(uname, lname) #define TESTSCFLAG_FALSE(uname, lname) \ TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname) __PAGEFLAG(Locked, locked, PF_NO_TAIL) FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE) FOLIO_FLAG(referenced, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(referenced, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(referenced, FOLIO_HEAD_PAGE) PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) TESTCLEARFLAG(LRU, lru, PF_HEAD) FOLIO_FLAG(active, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(active, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(active, FOLIO_HEAD_PAGE) PAGEFLAG(Workingset, workingset, PF_HEAD) TESTCLEARFLAG(Workingset, workingset, PF_HEAD) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ /* Xen */ PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND) PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND); PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) TESTCLEARFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) 
PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(swapbacked, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(swapbacked, FOLIO_HEAD_PAGE) /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. * - PG_private and PG_private_2 cause release_folio() and co to be invoked */ PAGEFLAG(Private, private, PF_ANY) FOLIO_FLAG(private_2, FOLIO_HEAD_PAGE) /* owner_2 can be set on tail pages for anon memory */ FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE) /* * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. */ TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL) TESTSCFLAG(Writeback, writeback, PF_NO_TAIL) FOLIO_FLAG(mappedtodisk, FOLIO_HEAD_PAGE) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL) FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE) FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(dropbehind, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(dropbehind, FOLIO_HEAD_PAGE) #ifdef CONFIG_HIGHMEM /* * Must use a macro here due to header dependency issues. page_zone() is not * available at this point. */ #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p)) #define folio_test_highmem(__f) is_highmem_idx(folio_zonenum(__f)) #else PAGEFLAG_FALSE(HighMem, highmem) #endif #define PhysHighMem(__p) (PageHighMem(phys_to_page(__p))) /* Does kmap_local_folio() only allow access to one page of the folio? */ #ifdef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP #define folio_test_partial_kmap(f) true #else #define folio_test_partial_kmap(f) folio_test_highmem(f) #endif #ifdef CONFIG_SWAP static __always_inline bool folio_test_swapcache(const struct folio *folio) { return folio_test_swapbacked(folio) && test_bit(PG_swapcache, const_folio_flags(folio, 0)); } FOLIO_SET_FLAG(swapcache, FOLIO_HEAD_PAGE) FOLIO_CLEAR_FLAG(swapcache, FOLIO_HEAD_PAGE) #else FOLIO_FLAG_FALSE(swapcache) #endif FOLIO_FLAG(unevictable, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) #ifdef CONFIG_MMU FOLIO_FLAG(mlocked, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) FOLIO_TEST_SET_FLAG(mlocked, FOLIO_HEAD_PAGE) #else FOLIO_FLAG_FALSE(mlocked) __FOLIO_CLEAR_FLAG_NOOP(mlocked) FOLIO_TEST_CLEAR_FLAG_FALSE(mlocked) FOLIO_TEST_SET_FLAG_FALSE(mlocked) #endif #ifdef CONFIG_MEMORY_FAILURE PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) #else PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 #endif #ifdef CONFIG_PAGE_IDLE_FLAG #ifdef CONFIG_64BIT FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE) FOLIO_SET_FLAG(young, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(young, FOLIO_HEAD_PAGE) FOLIO_FLAG(idle, FOLIO_HEAD_PAGE) #endif /* See page_idle.h for !64BIT workaround */ #else /* !CONFIG_PAGE_IDLE_FLAG */ FOLIO_FLAG_FALSE(young) FOLIO_TEST_CLEAR_FLAG_FALSE(young) FOLIO_FLAG_FALSE(idle) #endif /* * PageReported() is used to track reported free pages within the Buddy * allocator. We can use the non-atomic version of the test and set * operations as both should be shielded with the zone lock to prevent * any possible races on the setting or clearing of the bit. 
*/ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) #ifdef CONFIG_MEMORY_HOTPLUG PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY) #else PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #endif /* * On an anonymous folio mapped into a user virtual memory area, * folio->mapping points to its anon_vma, not to a struct address_space; * with the FOLIO_MAPPING_ANON bit set to distinguish it. See rmap.h. * * On an anonymous folio in a VM_MERGEABLE area, if CONFIG_KSM is enabled, * the FOLIO_MAPPING_ANON_KSM bit may be set along with the FOLIO_MAPPING_ANON * bit; and then folio->mapping points, not to an anon_vma, but to a private * structure which KSM associates with that merged folio. See ksm.h. * * Please note that, confusingly, "folio_mapping" refers to the inode * address_space which maps the folio from disk; whereas "folio_mapped" * refers to user virtual address space into which the folio is mapped. * * For slab pages, since slab reuses the bits in struct page to store its * internal states, the folio->mapping does not exist as such, nor do * these flags below. So in order to avoid testing non-existent bits, * please make sure that folio_test_slab(folio) actually evaluates to * false before calling the following functions (e.g., folio_test_anon). * See mm/slab.h. */ #define FOLIO_MAPPING_ANON 0x1 #define FOLIO_MAPPING_ANON_KSM 0x2 #define FOLIO_MAPPING_KSM (FOLIO_MAPPING_ANON | FOLIO_MAPPING_ANON_KSM) #define FOLIO_MAPPING_FLAGS (FOLIO_MAPPING_ANON | FOLIO_MAPPING_ANON_KSM) static __always_inline bool folio_test_anon(const struct folio *folio) { return ((unsigned long)folio->mapping & FOLIO_MAPPING_ANON) != 0; } static __always_inline bool PageAnonNotKsm(const struct page *page) { unsigned long flags = (unsigned long)page_folio(page)->mapping; return (flags & FOLIO_MAPPING_FLAGS) == FOLIO_MAPPING_ANON; } static __always_inline bool PageAnon(const struct page *page) { return folio_test_anon(page_folio(page)); } #ifdef CONFIG_KSM /* * A KSM page is one of those write-protected "shared pages" or "merged pages" * which KSM maps into multiple mms, wherever identical anonymous page content * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any * anon_vma, but to that page's node of the stable tree. */ static __always_inline bool folio_test_ksm(const struct folio *folio) { return ((unsigned long)folio->mapping & FOLIO_MAPPING_FLAGS) == FOLIO_MAPPING_KSM; } #else FOLIO_TEST_FLAG_FALSE(ksm) #endif u64 stable_page_flags(const struct page *page); /** * folio_xor_flags_has_waiters - Change some folio flags. * @folio: The folio. * @mask: Bits set in this word will be changed. * * This must only be used for flags which are changed with the folio * lock held. For example, it is unsafe to use for PG_dirty as that * can be set without the folio lock held. It can also only be used * on flags which are in the range 0-6 as some of the implementations * only affect those bits. * * Return: Whether there are tasks waiting on the folio. */ static inline bool folio_xor_flags_has_waiters(struct folio *folio, unsigned long mask) { return xor_unlock_is_negative_byte(mask, folio_flags(folio, 0)); } /** * folio_test_uptodate - Is this folio up to date? * @folio: The folio. * * The uptodate flag is set on a folio when every byte in the folio is * at least as new as the corresponding bytes on storage. Anonymous * and CoW folios are always uptodate. If the folio is not uptodate, * some of the bytes in it may be; see the is_partially_uptodate() * address_space operation. 
*/ static inline bool folio_test_uptodate(const struct folio *folio) { bool ret = test_bit(PG_uptodate, const_folio_flags(folio, 0)); /* * Must ensure that the data we read out of the folio is loaded * _after_ we've loaded folio->flags to check the uptodate bit. * We can skip the barrier if the folio is not uptodate, because * we wouldn't be reading anything from it. * * See folio_mark_uptodate() for the other side of the story. */ if (ret) smp_rmb(); return ret; } static inline bool PageUptodate(const struct page *page) { return folio_test_uptodate(page_folio(page)); } static __always_inline void __folio_mark_uptodate(struct folio *folio) { smp_wmb(); __set_bit(PG_uptodate, folio_flags(folio, 0)); } static __always_inline void folio_mark_uptodate(struct folio *folio) { /* * Memory barrier must be issued before setting the PG_uptodate bit, * so that all previous stores issued in order to bring the folio * uptodate are actually visible before folio_test_uptodate becomes true. */ smp_wmb(); set_bit(PG_uptodate, folio_flags(folio, 0)); } static __always_inline void __SetPageUptodate(struct page *page) { __folio_mark_uptodate((struct folio *)page); } static __always_inline void SetPageUptodate(struct page *page) { folio_mark_uptodate((struct folio *)page); } CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) void __folio_start_writeback(struct folio *folio, bool keep_write); void set_page_writeback(struct page *page); #define folio_start_writeback(folio) \ __folio_start_writeback(folio, false) static __always_inline bool folio_test_head(const struct folio *folio) { return test_bit(PG_head, const_folio_flags(folio, FOLIO_PF_ANY)); } static __always_inline int PageHead(const struct page *page) { PF_POISONED_CHECK(page); return test_bit(PG_head, &page->flags.f) && !page_is_fake_head(page); } __SETPAGEFLAG(Head, head, PF_ANY) __CLEARPAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) /** * folio_test_large() - Does this folio contain more than one page? * @folio: The folio to test. * * Return: True if the folio is larger than one page. */ static inline bool folio_test_large(const struct folio *folio) { return folio_test_head(folio); } static __always_inline void set_compound_head(struct page *page, struct page *head) { WRITE_ONCE(page->compound_head, (unsigned long)head + 1); } static __always_inline void clear_compound_head(struct page *page) { WRITE_ONCE(page->compound_head, 0); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void ClearPageCompound(struct page *page) { BUG_ON(!PageHead(page)); ClearPageHead(page); } FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE) FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(large_rmappable) FOLIO_FLAG_FALSE(partially_mapped) #endif #define PG_head_mask ((1UL << PG_head)) #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PageTransCompound returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known * that hugetlbfs pages aren't involved. */ static inline int PageTransCompound(const struct page *page) { return PageCompound(page); } #else TESTPAGEFLAG_FALSE(TransCompound, transcompound) #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) /* * PageHasHWPoisoned indicates that at least one subpage is hwpoisoned in the * compound page. * * This flag is set by hwpoison handler. Cleared by THP split or free page. 
*/ FOLIO_FLAG(has_hwpoisoned, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(has_hwpoisoned) #endif /* * For pages that do not use mapcount, page_type may be used. * The low 24 bits of pagetype may be used for your own purposes, as long * as you are careful to not affect the top 8 bits. The low bits of * pagetype will be overwritten when you clear the page_type from the page. */ enum pagetype { /* 0x00-0x7f are positive numbers, ie mapcount */ /* Reserve 0x80-0xef for mapcount overflow. */ PGTY_buddy = 0xf0, PGTY_offline = 0xf1, PGTY_table = 0xf2, PGTY_guard = 0xf3, PGTY_hugetlb = 0xf4, PGTY_slab = 0xf5, PGTY_zsmalloc = 0xf6, PGTY_unaccepted = 0xf7, PGTY_large_kmalloc = 0xf8, PGTY_mapcount_underflow = 0xff }; static inline bool page_type_has_type(int page_type) { return page_type < (PGTY_mapcount_underflow << 24); } /* This takes a mapcount which is one more than page->_mapcount */ static inline bool page_mapcount_is_type(unsigned int mapcount) { return page_type_has_type(mapcount - 1); } static inline bool page_has_type(const struct page *page) { return page_type_has_type(data_race(page->page_type)); } #define FOLIO_TYPE_OPS(lname, fname) \ static __always_inline bool folio_test_##fname(const struct folio *folio) \ { \ return data_race(folio->page.page_type >> 24) == PGTY_##lname; \ } \ static __always_inline void __folio_set_##fname(struct folio *folio) \ { \ if (folio_test_##fname(folio)) \ return; \ VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \ folio); \ folio->page.page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __folio_clear_##fname(struct folio *folio) \ { \ if (folio->page.page_type == UINT_MAX) \ return; \ VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ folio->page.page_type = UINT_MAX; \ } #define PAGE_TYPE_OPS(uname, lname, fname) \ FOLIO_TYPE_OPS(lname, fname) \ static __always_inline int Page##uname(const struct page *page) \ { \ return data_race(page->page_type >> 24) == PGTY_##lname; \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ if (Page##uname(page)) \ return; \ VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \ page->page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ if (page->page_type == UINT_MAX) \ return; \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ page->page_type = UINT_MAX; \ } /* * PageBuddy() indicates that the page is free and in the buddy system * (see mm/page_alloc.c). */ PAGE_TYPE_OPS(Buddy, buddy, buddy) /* * PageOffline() indicates that the page is logically offline although the * containing section is online. (e.g. inflated in a balloon driver or * not onlined when onlining the section). * The content of these pages is effectively stale. Such pages should not * be touched (read/write/dump/save) except by their owner. * * When a memory block gets onlined, all pages are initialized with a * refcount of 1 and PageOffline(). generic_online_page() will * take care of clearing PageOffline(). * * If a driver wants to allow to offline unmovable PageOffline() pages without * putting them back to the buddy, it can do so via the memory notifier by * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the * reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline() * pages (now with a reference count of zero) are treated like free (unmanaged) * pages, allowing the containing memory block to get offlined. 
A driver that * relies on this feature is aware that re-onlining the memory block will * require not giving them to the buddy via generic_online_page(). * * Memory offlining code will not adjust the managed page count for any * PageOffline() pages, treating them like they were never exposed to the * buddy using generic_online_page(). * * There are drivers that mark a page PageOffline() and expect there won't be * any further access to page content. PFN walkers that read content of random * pages should check PageOffline() and synchronize with such drivers using * page_offline_freeze()/page_offline_thaw(). */ PAGE_TYPE_OPS(Offline, offline, offline) extern void page_offline_freeze(void); extern void page_offline_thaw(void); extern void page_offline_begin(void); extern void page_offline_end(void); /* * Marks pages in use as page tables. */ PAGE_TYPE_OPS(Table, table, pgtable) /* * Marks guardpages used with debug_pagealloc. */ PAGE_TYPE_OPS(Guard, guard, guard) PAGE_TYPE_OPS(Slab, slab, slab) #ifdef CONFIG_HUGETLB_PAGE FOLIO_TYPE_OPS(hugetlb, hugetlb) #else FOLIO_TEST_FLAG_FALSE(hugetlb) #endif PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) /* * Mark pages that has to be accepted before touched for the first time. * * Serialized with zone lock. */ PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc) /** * PageHuge - Determine if the page belongs to hugetlbfs * @page: The page to test. * * Context: Any context. * Return: True for hugetlbfs pages, false for anon pages or pages * belonging to other filesystems. */ static inline bool PageHuge(const struct page *page) { return folio_test_hugetlb(page_folio(page)); } /* * Check if a page is currently marked HWPoisoned. Note that this check is * best effort only and inherently racy: there is no way to synchronize with * failing hardware. */ static inline bool is_page_hwpoison(const struct page *page) { const struct folio *folio; if (PageHWPoison(page)) return true; folio = page_folio(page); return folio_test_hugetlb(folio) && PageHWPoison(&folio->page); } static inline bool folio_contain_hwpoisoned_page(struct folio *folio) { return folio_test_hwpoison(folio) || (folio_test_large(folio) && folio_test_has_hwpoisoned(folio)); } bool is_free_buddy_page(const struct page *page); #ifdef CONFIG_MIGRATION /* * This page is migratable through movable_ops (for selected typed pages * only). * * Page migration of such pages might fail, for example, if the page is * already isolated by somebody else, or if the page is about to get freed. * * While a subsystem might set selected typed pages that support page migration * as being movable through movable_ops, it must never clear this flag. * * This flag is only cleared when the page is freed back to the buddy. * * Only selected page types support this flag (see page_movable_ops()) and * the flag might be used in other context for other pages. Always use * page_has_movable_ops() instead. */ TESTPAGEFLAG(MovableOps, movable_ops, PF_NO_TAIL); SETPAGEFLAG(MovableOps, movable_ops, PF_NO_TAIL); /* * A movable_ops page has this flag set while it is isolated for migration. * This flag primarily protects against concurrent migration attempts. * * Once migration ended (success or failure), the flag is cleared. The * flag is managed by the migration core. 
*/ PAGEFLAG(MovableOpsIsolated, movable_ops_isolated, PF_NO_TAIL); #else /* !CONFIG_MIGRATION */ TESTPAGEFLAG_FALSE(MovableOps, movable_ops); SETPAGEFLAG_NOOP(MovableOps, movable_ops); PAGEFLAG_FALSE(MovableOpsIsolated, movable_ops_isolated); #endif /* CONFIG_MIGRATION */ /** * page_has_movable_ops - test for a movable_ops page * @page: The page to test. * * Test whether this is a movable_ops page. Such pages will stay that * way until freed. * * Returns true if this is a movable_ops page, otherwise false. */ static inline bool page_has_movable_ops(const struct page *page) { return PageMovableOps(page) && (PageOffline(page) || PageZsmalloc(page)); } static __always_inline int PageAnonExclusive(const struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); /* * HugeTLB stores this information on the head page; THP keeps it per * page */ if (PageHuge(page)) page = compound_head(page); return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void SetPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void __ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } #ifdef CONFIG_MMU #define __PG_MLOCKED (1UL << PG_mlocked) #else #define __PG_MLOCKED 0 #endif /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. If they are, there is a problem. */ #define PAGE_FLAGS_CHECK_AT_FREE \ (1UL << PG_lru | 1UL << PG_locked | \ 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ 1UL << PG_active | \ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) /* * Flags checked when a page is prepped for return by the page allocator. * Pages being prepped should not have these flags set. If they are set, * there has been a kernel bug or struct page corruption. * * __PG_HWPOISON is exceptional because it needs to be kept beyond page's * alloc-free cycle to prevent from reusing the page. */ #define PAGE_FLAGS_CHECK_AT_PREP \ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) /* * Flags stored in the second page of a compound page. They may overlap * the CHECK_AT_FREE flags above, so need to be cleared. */ #define PAGE_FLAGS_SECOND \ (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ 1UL << PG_large_rmappable | 1UL << PG_partially_mapped) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) /** * folio_has_private - Determine if folio has private stuff * @folio: The folio to be checked * * Determine if a folio has private stuff, indicating that release routines * should be invoked upon it. */ static inline int folio_has_private(const struct folio *folio) { return !!(folio->flags.f & PAGE_FLAGS_PRIVATE); } #undef PF_ANY #undef PF_HEAD #undef PF_NO_TAIL #undef PF_NO_COMPOUND #undef PF_SECOND #endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */
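Illustrative sketch (not part of the header above): approximately what a single PAGEFLAG(Dirty, dirty, PF_HEAD) invocation generates once the FOLIO_*_FLAG and *PAGEFLAG helper macros are expanded. Simplified for readability; the real expansion routes the policy argument through the FOLIO_PF_* constants and also emits the test-and-set/test-and-clear variants requested elsewhere in the header.

/* Roughly the accessors generated by PAGEFLAG(Dirty, dirty, PF_HEAD): */
static __always_inline bool folio_test_dirty(const struct folio *folio)
{
	return test_bit(PG_dirty, const_folio_flags(folio, FOLIO_PF_HEAD));
}

static __always_inline void folio_set_dirty(struct folio *folio)
{
	set_bit(PG_dirty, folio_flags(folio, FOLIO_PF_HEAD));
}

static __always_inline void folio_clear_dirty(struct folio *folio)
{
	clear_bit(PG_dirty, folio_flags(folio, FOLIO_PF_HEAD));
}

static __always_inline int PageDirty(const struct page *page)
{
	/* PF_HEAD redirects the operation to the compound head page. */
	return test_bit(PG_dirty, &PF_HEAD(page, 0)->flags.f);
}

static __always_inline void SetPageDirty(struct page *page)
{
	set_bit(PG_dirty, &PF_HEAD(page, 1)->flags.f);
}

static __always_inline void ClearPageDirty(struct page *page)
{
	clear_bit(PG_dirty, &PF_HEAD(page, 1)->flags.f);
}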
/* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs * * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 */ /* * Handle hardware traps and faults. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/context_tracking.h> #include <linux/interrupt.h> #include <linux/kallsyms.h> #include <linux/kmsan.h> #include <linux/spinlock.h> #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/kdebug.h> #include <linux/kgdb.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/ptrace.h> #include <linux/uprobes.h> #include <linux/string.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/kexec.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/static_call.h> #include <linux/timer.h> #include <linux/init.h> #include <linux/bug.h> #include <linux/nmi.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/cpu.h> #include <linux/io.h> #include <linux/hardirq.h> #include <linux/atomic.h> #include <linux/iommu.h> #include <linux/ubsan.h> #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> #include <asm/realmode.h> #include <asm/text-patching.h> #include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> #include <asm/fred.h> #include <asm/fpu/api.h> #include <asm/cpu.h> #include <asm/cpu_entry_area.h> #include <asm/mce.h> #include <asm/fixmap.h> #include <asm/mach_traps.h> #include <asm/alternative.h> #include <asm/fpu/xstate.h> #include <asm/vm86.h> #include <asm/umip.h> #include <asm/insn.h> #include <asm/insn-eval.h> #include <asm/vdso.h> #include <asm/tdx.h> #include <asm/cfi.h> #include <asm/msr.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> #else #include <asm/processor-flags.h> #include <asm/setup.h> #endif #include <asm/proto.h> DECLARE_BITMAP(system_vectors, NR_VECTORS); __always_inline int is_valid_bugaddr(unsigned long addr) { if (addr < TASK_SIZE_MAX) return 0; /* * We got #UD, if the text isn't readable we'd have gotten * a different exception. */ return *(unsigned short *)addr == INSN_UD2; } /* * Check for UD1 or UD2, accounting for Address Size Override Prefixes. * If it's a UD1, further decode to determine its use: * * FineIBT: d6 udb * FineIBT: f0 75 f9 lock jne . - 6 * UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax * UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax * static_call: 0f b9 cc ud1 %esp,%ecx * __WARN_trap: 67 48 0f b9 3a ud1 (%edx),%reg * * Notable, since __WARN_trap can use all registers, the distinction between * UD1 users is through R/M. */ __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len) { unsigned long start = addr; u8 v, reg, rm, rex = 0; int type = BUG_UD1; bool lock = false; if (addr < TASK_SIZE_MAX) return BUG_NONE; for (;;) { v = *(u8 *)(addr++); if (v == INSN_ASOP) continue; if (v == INSN_LOCK) { lock = true; continue; } if ((v & 0xf0) == 0x40) { rex = v; continue; } break; } switch (v) { case 0x70 ...
0x7f: /* Jcc.d8 */ addr += 1; /* d8 */ *len = addr - start; WARN_ON_ONCE(!lock); return BUG_LOCK; case 0xd6: *len = addr - start; return BUG_UDB; case OPCODE_ESCAPE: break; default: return BUG_NONE; } v = *(u8 *)(addr++); if (v == SECOND_BYTE_OPCODE_UD2) { *len = addr - start; return BUG_UD2; } if (v != SECOND_BYTE_OPCODE_UD1) return BUG_NONE; *imm = 0; v = *(u8 *)(addr++); /* ModRM */ if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4) addr++; /* SIB */ reg = X86_MODRM_REG(v) + 8*!!X86_REX_R(rex); rm = X86_MODRM_RM(v) + 8*!!X86_REX_B(rex); /* Decode immediate, if present */ switch (X86_MODRM_MOD(v)) { case 0: if (X86_MODRM_RM(v) == 5) addr += 4; /* RIP + disp32 */ if (rm == 0) /* (%eax) */ type = BUG_UD1_UBSAN; if (rm == 2) { /* (%edx) */ *imm = reg; type = BUG_UD1_WARN; } break; case 1: *imm = *(s8 *)addr; addr += 1; if (rm == 0) /* (%eax) */ type = BUG_UD1_UBSAN; break; case 2: *imm = *(s32 *)addr; addr += 4; if (rm == 0) /* (%eax) */ type = BUG_UD1_UBSAN; break; case 3: break; } /* record instruction length */ *len = addr - start; return type; } static inline unsigned long pt_regs_val(struct pt_regs *regs, int nr) { int offset = pt_regs_offset(regs, nr); if (WARN_ON_ONCE(offset < -0)) return 0; return *((unsigned long *)((void *)regs + offset)); } #ifdef HAVE_ARCH_BUG_FORMAT_ARGS DEFINE_STATIC_CALL(WARN_trap, __WARN_trap); EXPORT_STATIC_CALL_TRAMP(WARN_trap); /* * Create a va_list from an exception context. */ void *__warn_args(struct arch_va_list *args, struct pt_regs *regs) { /* * Register save area; populate with function call argument registers */ args->regs[0] = regs->di; args->regs[1] = regs->si; args->regs[2] = regs->dx; args->regs[3] = regs->cx; args->regs[4] = regs->r8; args->regs[5] = regs->r9; /* * From the ABI document: * * @gp_offset - the element holds the offset in bytes from * reg_save_area to the place where the next available general purpose * argument register is saved. In case all argument registers have * been exhausted, it is set to the value 48 (6*8). * * @fp_offset - the element holds the offset in bytes from * reg_save_area to the place where the next available floating point * argument is saved. In case all argument registers have been * exhausted, it is set to the value 176 (6*8 + 8*16) * * @overflow_arg_area - this pointer is used to fetch arguments passed * on the stack. It is initialized with the address of the first * argument passed on the stack, if any, and then always updated to * point to the start of the next argument on the stack. * * @reg_save_area - the element points to the start of the register * save area. * * Notably the vararg starts with the second argument and there are no * floating point arguments in the kernel. */ args->args.gp_offset = 1*8; args->args.fp_offset = 6*8 + 8*16; args->args.reg_save_area = &args->regs; args->args.overflow_arg_area = (void *)regs->sp; /* * If the exception came from __WARN_trap, there is a return * address on the stack, skip that. This is why any __WARN_trap() * caller must inhibit tail-call optimization. */ if ((void *)regs->ip == &__WARN_trap) args->args.overflow_arg_area += 8; return &args->args; } #endif /* HAVE_ARCH_BUG_FORMAT */ static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, struct pt_regs *regs, long error_code) { if (v8086_mode(regs)) { /* * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. * On nmi (interrupt 2), do_trap should not be called. 
*/ if (trapnr < X86_TRAP_UD) { if (!handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr)) return 0; } } else if (!user_mode(regs)) { if (fixup_exception(regs, trapnr, error_code, 0)) return 0; tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; die(str, regs, error_code); } else { if (fixup_vdso_exception(regs, trapnr, error_code, 0)) return 0; } /* * We want error_code and trap_nr set for userspace faults and * kernelspace faults which result in die(), but not * kernelspace faults which are fixed up. die() gives the * process no chance to handle the signal and notice the * kernel fault information, so that won't result in polluting * the information about previously queued, but not yet * delivered, faults. See also exc_general_protection below. */ tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; return -1; } static void show_signal(struct task_struct *tsk, int signr, const char *type, const char *desc, struct pt_regs *regs, long error_code) { if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { pr_info("%s[%d] %s%s ip:%lx sp:%lx error:%lx", tsk->comm, task_pid_nr(tsk), type, desc, regs->ip, regs->sp, error_code); print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } } static void do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, int sicode, void __user *addr) { struct task_struct *tsk = current; if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) return; show_signal(tsk, signr, "trap ", str, regs, error_code); if (!sicode) force_sig(signr); else force_sig_fault(signr, sicode, addr); } NOKPROBE_SYMBOL(do_trap); static void do_error_trap(struct pt_regs *regs, long error_code, char *str, unsigned long trapnr, int signr, int sicode, void __user *addr) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { cond_local_irq_enable(regs); do_trap(trapnr, signr, str, regs, error_code, sicode, addr); cond_local_irq_disable(regs); } } /* * Posix requires to provide the address of the faulting instruction for * SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t. * * This address is usually regs->ip, but when an uprobe moved the code out * of line then regs->ip points to the XOL code which would confuse * anything which analyzes the fault address vs. the unmodified binary. If * a trap happened in XOL code then uprobe maps regs->ip back to the * original instruction address. */ static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs) { return (void __user *)uprobe_get_trap_addr(regs); } DEFINE_IDTENTRY(exc_divide_error) { do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE, FPE_INTDIV, error_get_trap_addr(regs)); } DEFINE_IDTENTRY(exc_overflow) { do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL); } #ifdef CONFIG_X86_F00F_BUG void handle_invalid_op(struct pt_regs *regs) #else static inline void handle_invalid_op(struct pt_regs *regs) #endif { do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL, ILL_ILLOPN, error_get_trap_addr(regs)); } static noinstr bool handle_bug(struct pt_regs *regs) { unsigned long addr = regs->ip; bool handled = false; int ud_type, ud_len; s32 ud_imm; ud_type = decode_bug(addr, &ud_imm, &ud_len); if (ud_type == BUG_NONE) return handled; /* * All lies, just get the WARN/BUG out. 
*/ instrumentation_begin(); /* * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() * is a rare case that uses @regs without passing them to * irqentry_enter(). */ kmsan_unpoison_entry_regs(regs); /* * Since we're emulating a CALL with exceptions, restore the interrupt * state to what it was at the exception site. */ if (regs->flags & X86_EFLAGS_IF) raw_local_irq_enable(); switch (ud_type) { case BUG_UD1_WARN: if (report_bug_entry((void *)pt_regs_val(regs, ud_imm), regs) == BUG_TRAP_TYPE_WARN) handled = true; break; case BUG_UD2: if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { handled = true; break; } fallthrough; case BUG_UDB: case BUG_LOCK: if (handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { handled = true; break; } break; case BUG_UD1_UBSAN: if (IS_ENABLED(CONFIG_UBSAN_TRAP)) { pr_crit("%s at %pS\n", report_ubsan_failure(ud_imm), (void *)regs->ip); } break; default: break; } /* * When continuing, and regs->ip hasn't changed, move it to the next * instruction. When not continuing execution, restore the instruction * pointer. */ if (handled) { if (regs->ip == addr) regs->ip += ud_len; } else { regs->ip = addr; } if (regs->flags & X86_EFLAGS_IF) raw_local_irq_disable(); instrumentation_end(); return handled; } DEFINE_IDTENTRY_RAW(exc_invalid_op) { irqentry_state_t state; /* * We use UD2 as a short encoding for 'CALL __WARN', as such * handle it before exception entry to avoid recursive WARN * in case exception entry is the one triggering WARNs. */ if (!user_mode(regs) && handle_bug(regs)) return; state = irqentry_enter(regs); instrumentation_begin(); handle_invalid_op(regs); instrumentation_end(); irqentry_exit(regs, state); } DEFINE_IDTENTRY(exc_coproc_segment_overrun) { do_error_trap(regs, 0, "coprocessor segment overrun", X86_TRAP_OLD_MF, SIGFPE, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss) { do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present) { do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP, SIGBUS, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment) { do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check) { char *str = "alignment check"; if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP) return; if (!user_mode(regs)) die("Split lock detected\n", regs, error_code); local_irq_enable(); if (handle_user_split_lock(regs, error_code)) goto out; do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs, error_code, BUS_ADRALN, NULL); out: local_irq_disable(); } #ifdef CONFIG_VMAP_STACK __visible void __noreturn handle_stack_overflow(struct pt_regs *regs, unsigned long fault_address, struct stack_info *info) { const char *name = stack_type_name(info->type); printk(KERN_EMERG "BUG: %s stack guard page was hit at %p (stack is %p..%p)\n", name, (void *)fault_address, info->begin, info->end); die("stack guard page", regs, 0); /* Be absolutely certain we don't return. */ panic("%s stack guard hit", name); } #endif /* * Prevent the compiler and/or objtool from marking the !CONFIG_X86_ESPFIX64 * version of exc_double_fault() as noreturn. Otherwise the noreturn mismatch * between configs triggers objtool warnings. * * This is a temporary hack until we have compiler or plugin support for * annotating noreturns. 
*/ #ifdef CONFIG_X86_ESPFIX64 #define always_true() true #else bool always_true(void); bool __weak always_true(void) { return true; } #endif /* * Runs on an IST stack for x86_64 and on a special task stack for x86_32. * * On x86_64, this is more or less a normal kernel entry. Notwithstanding the * SDM's warnings about double faults being unrecoverable, returning works as * expected. Presumably what the SDM actually means is that the CPU may get * the register state wrong on entry, so returning could be a bad idea. * * Various CPU engineers have promised that double faults due to an IRET fault * while the stack is read-only are, in fact, recoverable. * * On x86_32, this is entered through a task gate, and regs are synthesized * from the TSS. Returning is, in principle, okay, but changes to regs will * be lost. If, for some reason, we need to return to a context with modified * regs, the shim code could be adjusted to synchronize the registers. * * The 32bit #DF shim provides CR2 already as an argument. On 64bit it needs * to be read before doing anything else. */ DEFINE_IDTENTRY_DF(exc_double_fault) { static const char str[] = "double fault"; struct task_struct *tsk = current; #ifdef CONFIG_VMAP_STACK unsigned long address = read_cr2(); struct stack_info info; #endif #ifdef CONFIG_X86_ESPFIX64 extern unsigned char native_irq_return_iret[]; /* * If IRET takes a non-IST fault on the espfix64 stack, then we * end up promoting it to a doublefault. In that case, take * advantage of the fact that we're not using the normal (TSS.sp0) * stack right now. We can write a fake #GP(0) frame at TSS.sp0 * and then modify our own IRET frame so that, when we return, * we land directly at the #GP(0) vector with the stack already * set up according to its expectations. * * The net result is that our #GP handler will think that we * entered from usermode with the bad user context. * * No need for nmi_enter() here because we don't use RCU. */ if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY && regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; unsigned long *p = (unsigned long *)regs->sp; /* * regs->sp points to the failing IRET frame on the * ESPFIX64 stack. Copy it to the entry stack. This fills * in gpregs->ss through gpregs->ip. * */ gpregs->ip = p[0]; gpregs->cs = p[1]; gpregs->flags = p[2]; gpregs->sp = p[3]; gpregs->ss = p[4]; gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ /* * Adjust our frame so that we return straight to the #GP * vector with the expected RSP value. This is safe because * we won't enable interrupts or schedule before we invoke * general_protection, so nothing will clobber the stack * frame we just set up. * * We will enter general_protection with kernel GSBASE, * which is what the stub expects, given that the faulting * RIP will be the IRET instruction. */ regs->ip = (unsigned long)asm_exc_general_protection; regs->sp = (unsigned long)&gpregs->orig_ax; return; } #endif irqentry_nmi_enter(regs); instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_DF; #ifdef CONFIG_VMAP_STACK /* * If we overflow the stack into a guard page, the CPU will fail * to deliver #PF and will send #DF instead. 
Similarly, if we * take any non-IST exception while too close to the bottom of * the stack, the processor will get a page fault while * delivering the exception and will generate a double fault. * * According to the SDM (footnote in 6.15 under "Interrupt 14 - * Page-Fault Exception (#PF): * * Processors update CR2 whenever a page fault is detected. If a * second page fault occurs while an earlier page fault is being * delivered, the faulting linear address of the second fault will * overwrite the contents of CR2 (replacing the previous * address). These updates to CR2 occur even if the page fault * results in a double fault or occurs during the delivery of a * double fault. * * The logic below has a small possibility of incorrectly diagnosing * some errors as stack overflows. For example, if the IDT or GDT * gets corrupted such that #GP delivery fails due to a bad descriptor * causing #GP and we hit this condition while CR2 coincidentally * points to the stack guard page, we'll think we overflowed the * stack. Given that we're going to panic one way or another * if this happens, this isn't necessarily worth fixing. * * If necessary, we could improve the test by only diagnosing * a stack overflow if the saved RSP points within 47 bytes of * the bottom of the stack: if RSP == tsk_stack + 48 and we * take an exception, the stack is already aligned and there * will be enough room SS, RSP, RFLAGS, CS, RIP, and a * possible error code, so a stack overflow would *not* double * fault. With any less space left, exception delivery could * fail, and, as a practical matter, we've overflowed the * stack even if the actual trigger for the double fault was * something else. */ if (get_stack_guard_info((void *)address, &info)) handle_stack_overflow(regs, address, &info); #endif pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); die("double fault", regs, error_code); if (always_true()) panic("Machine halted."); instrumentation_end(); } DEFINE_IDTENTRY(exc_bounds) { if (notify_die(DIE_TRAP, "bounds", regs, 0, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) return; cond_local_irq_enable(regs); if (!user_mode(regs)) die("bounds", regs, 0); do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL); cond_local_irq_disable(regs); } enum kernel_gp_hint { GP_NO_HINT, GP_NON_CANONICAL, GP_CANONICAL, GP_LASS_VIOLATION, GP_NULL_POINTER, }; static const char * const kernel_gp_hint_help[] = { [GP_NON_CANONICAL] = "probably for non-canonical address", [GP_CANONICAL] = "maybe for address", [GP_LASS_VIOLATION] = "probably LASS violation for address", [GP_NULL_POINTER] = "kernel NULL pointer dereference", }; /* * When an uncaught #GP occurs, try to determine the memory address accessed by * the instruction and return that address to the caller. Also, try to figure * out whether any part of the access to that address was non-canonical or * across privilege levels. 
*/ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, unsigned long *addr) { u8 insn_buf[MAX_INSN_SIZE]; struct insn insn; int ret; if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) return GP_NO_HINT; ret = insn_decode_kernel(&insn, insn_buf); if (ret < 0) return GP_NO_HINT; *addr = (unsigned long)insn_get_addr_ref(&insn, regs); if (*addr == -1UL) return GP_NO_HINT; #ifdef CONFIG_X86_64 /* Operand is in the kernel half */ if (*addr >= ~__VIRTUAL_MASK) return GP_CANONICAL; /* The last byte of the operand is not in the user canonical half */ if (*addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK) return GP_NON_CANONICAL; /* * A NULL pointer dereference usually causes a #PF. However, it * can result in a #GP when LASS is active. Provide the same * hint in the rare case that the condition is hit without LASS. */ if (*addr < PAGE_SIZE) return GP_NULL_POINTER; /* * Assume that LASS caused the exception, because the address is * canonical and in the user half. */ if (cpu_feature_enabled(X86_FEATURE_LASS)) return GP_LASS_VIOLATION; #endif return GP_CANONICAL; } #define GPFSTR "general protection fault" static bool fixup_iopl_exception(struct pt_regs *regs) { struct thread_struct *t = &current->thread; unsigned char byte; unsigned long ip; if (!IS_ENABLED(CONFIG_X86_IOPL_IOPERM) || t->iopl_emul != 3) return false; if (insn_get_effective_ip(regs, &ip)) return false; if (get_user(byte, (const char __user *)ip)) return false; if (byte != 0xfa && byte != 0xfb) return false; if (!t->iopl_warn && printk_ratelimit()) { pr_err("%s[%d] attempts to use CLI/STI, pretending it's a NOP, ip:%lx", current->comm, task_pid_nr(current), ip); print_vma_addr(KERN_CONT " in ", ip); pr_cont("\n"); t->iopl_warn = 1; } regs->ip += 1; return true; } /* * The unprivileged ENQCMD instruction generates #GPs if the * IA32_PASID MSR has not been populated. If possible, populate * the MSR from a PASID previously allocated to the mm. */ static bool try_fixup_enqcmd_gp(void) { #ifdef CONFIG_ARCH_HAS_CPU_PASID u32 pasid; /* * MSR_IA32_PASID is managed using XSAVE. Directly * writing to the MSR is only possible when fpregs * are valid and the fpstate is not. This is * guaranteed when handling a userspace exception * in *before* interrupts are re-enabled. */ lockdep_assert_irqs_disabled(); /* * Hardware without ENQCMD will not generate * #GPs that can be fixed up here. */ if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) return false; /* * If the mm has not been allocated a * PASID, the #GP can not be fixed up. */ if (!mm_valid_pasid(current->mm)) return false; pasid = mm_get_enqcmd_pasid(current->mm); /* * Did this thread already have its PASID activated? * If so, the #GP must be from something else. */ if (current->pasid_activated) return false; wrmsrq(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); current->pasid_activated = 1; return true; #else return false; #endif } static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr, unsigned long error_code, const char *str, unsigned long address) { if (fixup_exception(regs, trapnr, error_code, address)) return true; current->thread.error_code = error_code; current->thread.trap_nr = trapnr; /* * To be potentially processing a kprobe fault and to trust the result * from kprobe_running(), we have to be non-preemptible. 
*/ if (!preemptible() && kprobe_running() && kprobe_fault_handler(regs, trapnr)) return true; return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP; } static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr, unsigned long error_code, const char *str) { current->thread.error_code = error_code; current->thread.trap_nr = trapnr; show_signal(current, SIGSEGV, "", str, regs, error_code); force_sig(SIGSEGV); } DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) { char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR; enum kernel_gp_hint hint = GP_NO_HINT; unsigned long gp_addr; if (user_mode(regs) && try_fixup_enqcmd_gp()) return; cond_local_irq_enable(regs); if (static_cpu_has(X86_FEATURE_UMIP)) { if (user_mode(regs) && fixup_umip_exception(regs)) goto exit; } if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); local_irq_disable(); return; } if (user_mode(regs)) { if (fixup_iopl_exception(regs)) goto exit; if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0)) goto exit; gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc); goto exit; } if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc, 0)) goto exit; if (error_code) snprintf(desc, sizeof(desc), "segment-related " GPFSTR); else hint = get_kernel_gp_address(regs, &gp_addr); if (hint != GP_NO_HINT) snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx", kernel_gp_hint_help[hint], gp_addr); /* * KASAN is interested only in the non-canonical case, clear it * otherwise. */ if (hint != GP_NON_CANONICAL) gp_addr = 0; die_addr(desc, regs, error_code, gp_addr); exit: cond_local_irq_disable(regs); } static bool do_int3(struct pt_regs *regs) { int res; #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) return true; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ #ifdef CONFIG_KPROBES if (kprobe_int3_handler(regs)) return true; #endif res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP); return res == NOTIFY_STOP; } NOKPROBE_SYMBOL(do_int3); static void do_int3_user(struct pt_regs *regs) { if (do_int3(regs)) return; cond_local_irq_enable(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL); cond_local_irq_disable(regs); } DEFINE_IDTENTRY_RAW(exc_int3) { /* * smp_text_poke_int3_handler() is completely self contained code; it does (and * must) *NOT* call out to anything, lest it hits upon yet another * INT3. */ if (smp_text_poke_int3_handler(regs)) return; /* * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() * and therefore can trigger INT3, hence smp_text_poke_int3_handler() must * be done before. If the entry came from kernel mode, then use * nmi_enter() because the INT3 could have been hit in any context * including NMI. */ if (user_mode(regs)) { irqentry_enter_from_user_mode(regs); instrumentation_begin(); do_int3_user(regs); instrumentation_end(); irqentry_exit_to_user_mode(regs); } else { irqentry_state_t irq_state = irqentry_nmi_enter(regs); instrumentation_begin(); if (!do_int3(regs)) die("int3", regs, 0); instrumentation_end(); irqentry_nmi_exit(regs, irq_state); } } #ifdef CONFIG_X86_64 /* * Help handler running on a per-cpu (IST or entry trampoline) stack * to switch to the normal thread stack if the interrupted code was in * user mode. 
The actual stack switch is done in entry_64.S */ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = (struct pt_regs *)current_top_of_stack() - 1; if (regs != eregs) *regs = *eregs; return regs; } #ifdef CONFIG_AMD_MEM_ENCRYPT asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs) { unsigned long sp, *stack; struct stack_info info; struct pt_regs *regs_ret; /* * In the SYSCALL entry path the RSP value comes from user-space - don't * trust it and switch to the current kernel stack */ if (ip_within_syscall_gap(regs)) { sp = current_top_of_stack(); goto sync; } /* * From here on the RSP value is trusted. Now check whether entry * happened from a safe stack. Not safe are the entry or unknown stacks, * use the fall-back stack instead in this case. */ sp = regs->sp; stack = (unsigned long *)sp; if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY || info.type > STACK_TYPE_EXCEPTION_LAST) sp = __this_cpu_ist_top_va(VC2); sync: /* * Found a safe stack - switch to it as if the entry didn't happen via * IST stack. The code below only copies pt_regs, the real switch happens * in assembly code. */ sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret); regs_ret = (struct pt_regs *)sp; *regs_ret = *regs; return regs_ret; } #endif asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs) { struct pt_regs tmp, *new_stack; /* * This is called from entry_64.S early in handling a fault * caused by a bad iret to user mode. To handle the fault * correctly, we want to move our stack frame to where it would * be had we entered directly on the entry stack (rather than * just below the IRET frame) and we want to pretend that the * exception came from the IRET target. */ new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* Copy the IRET target to the temporary storage. */ __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8); /* Copy the remainder of the stack from the current stack. */ __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip)); /* Update the entry stack */ __memcpy(new_stack, &tmp, sizeof(tmp)); BUG_ON(!user_mode(new_stack)); return new_stack; } #endif static bool is_sysenter_singlestep(struct pt_regs *regs) { /* * We don't try for precision here. If we're anywhere in the region of * code that can be single-stepped in the SYSENTER entry path, then * assume that this is a useless single-step trap due to SYSENTER * being invoked with TF set. (We don't know in advance exactly * which instructions will be hit because BTF could plausibly * be set.) */ #ifdef CONFIG_X86_32 return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) < (unsigned long)__end_SYSENTER_singlestep_region - (unsigned long)__begin_SYSENTER_singlestep_region; #elif defined(CONFIG_IA32_EMULATION) return (regs->ip - (unsigned long)entry_SYSENTER_compat) < (unsigned long)__end_entry_SYSENTER_compat - (unsigned long)entry_SYSENTER_compat; #else return false; #endif } static __always_inline unsigned long debug_read_reset_dr6(void) { unsigned long dr6; get_debugreg(dr6, 6); dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ /* * The Intel SDM says: * * Certain debug exceptions may clear bits 0-3 of DR6. * * BLD induced #DB clears DR6.BLD and any other debug * exception doesn't modify DR6.BLD. * * RTM induced #DB clears DR6.RTM and any other debug * exception sets DR6.RTM. 
* * To avoid confusion in identifying debug exceptions, * debug handlers should set DR6.BLD and DR6.RTM, and * clear other DR6 bits before returning. * * Keep it simple: write DR6 with its architectural reset * value 0xFFFF0FF0, defined as DR6_RESERVED, immediately. */ set_debugreg(DR6_RESERVED, 6); return dr6; } /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. Therefore * it is possible to get a watchpoint trap here from inside the kernel. * However, the code in ./ptrace.c has ensured that the user can * only set watchpoints on userspace addresses. Therefore the in-kernel * watchpoint trap can only occur in code which is reading/writing * from user space. Such code must not hold kernel locks (since it * can equally take a page fault), therefore it is safe to call * force_sig_info even though that claims and releases locks. * * Code in ./signal.c ensures that the debug control register * is restored before we deliver any signal, and therefore that * user code runs with the correct debug control register even though * we clear it here. * * Being careful here means that we don't have to be as careful in a * lot of more complicated places (task switching can be a bit lazy * about restoring all the debug state, and ptrace doesn't have to * find every occurrence of the TF bit that could be saved away even * by user code) * * May run on IST stack. */ static bool notify_debug(struct pt_regs *regs, unsigned long *dr6) { /* * Notifiers will clear bits in @dr6 to indicate the event has been * consumed - hw_breakpoint_handler(), single_stop_cont(). * * Notifiers will set bits in @virtual_dr6 to indicate the desire * for signals - ptrace_triggered(), kgdb_hw_overflow_handler(). */ if (notify_die(DIE_DEBUG, "debug", regs, (long)dr6, 0, SIGTRAP) == NOTIFY_STOP) return true; return false; } static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { /* * Disable breakpoints during exception handling; recursive exceptions * are exceedingly 'fun'. * * Since this function is NOKPROBE, and that also applies to * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a * HW_BREAKPOINT_W on our stack) * * Entry text is excluded for HW_BP_X and cpu_entry_area, which * includes the entry stack is excluded for everything. * * For FRED, nested #DB should just work fine. But when a watchpoint or * breakpoint is set in the code path which is executed by #DB handler, * it results in an endless recursion and stack overflow. Thus we stay * with the IDT approach, i.e., save DR7 and disable #DB. */ unsigned long dr7 = local_db_save(); irqentry_state_t irq_state = irqentry_nmi_enter(regs); instrumentation_begin(); /* * If something gets miswired and we end up here for a user mode * #DB, we will malfunction. */ WARN_ON_ONCE(user_mode(regs)); if (test_thread_flag(TIF_BLOCKSTEP)) { /* * The SDM says "The processor clears the BTF flag when it * generates a debug exception." but PTRACE_BLOCKSTEP requested * it for userspace, but we just took a kernel #DB, so re-set * BTF. */ unsigned long debugctl; rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl |= DEBUGCTLMSR_BTF; wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); } /* * Catch SYSENTER with TF set and clear DR_STEP. If this hit a * watchpoint at the same time then that will still be handled. 
*/ if (!cpu_feature_enabled(X86_FEATURE_FRED) && (dr6 & DR_STEP) && is_sysenter_singlestep(regs)) dr6 &= ~DR_STEP; /* * The kernel doesn't use INT1 */ if (!dr6) goto out; if (notify_debug(regs, &dr6)) goto out; /* * The kernel doesn't use TF single-step outside of: * * - Kprobes, consumed through kprobe_debug_handler() * - KGDB, consumed through notify_debug() * * So if we get here with DR_STEP set, something is wonky. * * A known way to trigger this is through QEMU's GDB stub, * which leaks #DB into the guest and causes IST recursion. */ if (WARN_ON_ONCE(dr6 & DR_STEP)) regs->flags &= ~X86_EFLAGS_TF; out: instrumentation_end(); irqentry_nmi_exit(regs, irq_state); local_db_restore(dr7); } static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { bool icebp; /* * If something gets miswired and we end up here for a kernel mode * #DB, we will malfunction. */ WARN_ON_ONCE(!user_mode(regs)); /* * NB: We can't easily clear DR7 here because * irqentry_exit_to_usermode() can invoke ptrace, schedule, access * user memory, etc. This means that a recursive #DB is possible. If * this happens, that #DB will hit exc_debug_kernel() and clear DR7. * Since we're not on the IST stack right now, everything will be * fine. */ irqentry_enter_from_user_mode(regs); instrumentation_begin(); /* * Start the virtual/ptrace DR6 value with just the DR_STEP mask * of the real DR6. ptrace_triggered() will set the DR_TRAPn bits. * * Userspace expects DR_STEP to be visible in ptrace_get_debugreg(6) * even if it is not the result of PTRACE_SINGLESTEP. */ current->thread.virtual_dr6 = (dr6 & DR_STEP); /* * The SDM says "The processor clears the BTF flag when it * generates a debug exception." Clear TIF_BLOCKSTEP to keep * TIF_BLOCKSTEP in sync with the hardware BTF flag. */ clear_thread_flag(TIF_BLOCKSTEP); /* * If dr6 has no reason to give us about the origin of this trap, * then it's very likely the result of an icebp/int01 trap. * User wants a sigtrap for that. */ icebp = !dr6; if (notify_debug(regs, &dr6)) goto out; /* It's safe to allow irq's after DR6 has been saved */ local_irq_enable(); if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *)regs, 0, X86_TRAP_DB); goto out_irq; } /* #DB for bus lock can only be triggered from userspace. */ if (dr6 & DR_BUS_LOCK) handle_bus_lock(regs); /* Add the virtual_dr6 bits for signals. */ dr6 |= current->thread.virtual_dr6; if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp) send_sigtrap(regs, 0, get_si_code(dr6)); out_irq: local_irq_disable(); out: instrumentation_end(); irqentry_exit_to_user_mode(regs); } #ifdef CONFIG_X86_64 /* IST stack entry */ DEFINE_IDTENTRY_DEBUG(exc_debug) { exc_debug_kernel(regs, debug_read_reset_dr6()); } /* User entry, runs on regular task stack */ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { exc_debug_user(regs, debug_read_reset_dr6()); } #ifdef CONFIG_X86_FRED /* * When occurred on different ring level, i.e., from user or kernel * context, #DB needs to be handled on different stack: User #DB on * current task stack, while kernel #DB on a dedicated stack. * * This is exactly how FRED event delivery invokes an exception * handler: ring 3 event on level 0 stack, i.e., current task stack; * ring 0 event on the #DB dedicated stack specified in the * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception * entry stub doesn't do stack switch. */ DEFINE_FREDENTRY_DEBUG(exc_debug) { /* * FRED #DB stores DR6 on the stack in the format which * debug_read_reset_dr6() returns for the IDT entry points. 
*/ unsigned long dr6 = fred_event_data(regs); if (user_mode(regs)) exc_debug_user(regs, dr6); else exc_debug_kernel(regs, dr6); } #endif /* CONFIG_X86_FRED */ #else /* 32 bit does not have separate entry points. */ DEFINE_IDTENTRY_RAW(exc_debug) { unsigned long dr6 = debug_read_reset_dr6(); if (user_mode(regs)) exc_debug_user(regs, dr6); else exc_debug_kernel(regs, dr6); } #endif /* * Note that we play around with the 'TS' bit in an attempt to get * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ static void math_error(struct pt_regs *regs, int trapnr) { struct task_struct *task = current; struct fpu *fpu = x86_task_fpu(task); int si_code; char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : "simd exception"; cond_local_irq_enable(regs); if (!user_mode(regs)) { if (fixup_exception(regs, trapnr, 0, 0)) goto exit; task->thread.error_code = 0; task->thread.trap_nr = trapnr; if (notify_die(DIE_TRAP, str, regs, 0, trapnr, SIGFPE) != NOTIFY_STOP) die(str, regs, 0); goto exit; } /* * Synchronize the FPU register state to the memory register state * if necessary. This allows the exception handler to inspect it. */ fpu_sync_fpstate(fpu); task->thread.trap_nr = trapnr; task->thread.error_code = 0; si_code = fpu__exception_code(fpu, trapnr); /* Retry when we get spurious exceptions: */ if (!si_code) goto exit; if (fixup_vdso_exception(regs, trapnr, 0, 0)) goto exit; force_sig_fault(SIGFPE, si_code, (void __user *)uprobe_get_trap_addr(regs)); exit: cond_local_irq_disable(regs); } DEFINE_IDTENTRY(exc_coprocessor_error) { math_error(regs, X86_TRAP_MF); } DEFINE_IDTENTRY(exc_simd_coprocessor_error) { if (IS_ENABLED(CONFIG_X86_INVD_BUG)) { /* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */ if (!static_cpu_has(X86_FEATURE_XMM)) { __exc_general_protection(regs, 0); return; } } math_error(regs, X86_TRAP_XF); } DEFINE_IDTENTRY(exc_spurious_interrupt_bug) { /* * This addresses a Pentium Pro Erratum: * * PROBLEM: If the APIC subsystem is configured in mixed mode with * Virtual Wire mode implemented through the local APIC, an * interrupt vector of 0Fh (Intel reserved encoding) may be * generated by the local APIC (Int 15). This vector may be * generated upon receipt of a spurious interrupt (an interrupt * which is removed before the system receives the INTA sequence) * instead of the programmed 8259 spurious interrupt vector. * * IMPLICATION: The spurious interrupt vector programmed in the * 8259 is normally handled by an operating system's spurious * interrupt handler. However, a vector of 0Fh is unknown to some * operating systems, which would crash if this erratum occurred. * * In theory this could be limited to 32bit, but the handler is not * hurting and who knows which other CPUs suffer from this. 
*/ } static bool handle_xfd_event(struct pt_regs *regs) { u64 xfd_err; int err; if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) return false; rdmsrq(MSR_IA32_XFD_ERR, xfd_err); if (!xfd_err) return false; wrmsrq(MSR_IA32_XFD_ERR, 0); /* Die if that happens in kernel space */ if (WARN_ON(!user_mode(regs))) return false; local_irq_enable(); err = xfd_enable_feature(xfd_err); switch (err) { case -EPERM: force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs)); break; case -EFAULT: force_sig(SIGSEGV); break; } local_irq_disable(); return true; } DEFINE_IDTENTRY(exc_device_not_available) { unsigned long cr0 = read_cr0(); if (handle_xfd_event(regs)) return; #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) { struct math_emu_info info = { }; cond_local_irq_enable(regs); info.regs = regs; math_emulate(&info); cond_local_irq_disable(regs); return; } #endif /* This should not happen. */ if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) { /* Try to fix it up and carry on. */ write_cr0(cr0 & ~X86_CR0_TS); } else { /* * Something terrible happened, and we're better off trying * to kill the task than getting stuck in a never-ending * loop of #NM faults. */ die("unexpected #NM exception", regs, 0); } } #ifdef CONFIG_INTEL_TDX_GUEST #define VE_FAULT_STR "VE fault" static void ve_raise_fault(struct pt_regs *regs, long error_code, unsigned long address) { if (user_mode(regs)) { gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR); return; } if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR, address)) { return; } die_addr(VE_FAULT_STR, regs, error_code, address); } /* * Virtualization Exceptions (#VE) are delivered to TDX guests due to * specific guest actions which may happen in either user space or the * kernel: * * * Specific instructions (WBINVD, for example) * * Specific MSR accesses * * Specific CPUID leaf accesses * * Access to specific guest physical addresses * * In the settings that Linux will run in, virtualization exceptions are * never generated on accesses to normal, TD-private memory that has been * accepted (by BIOS or with tdx_enc_status_changed()). * * Syscall entry code has a critical window where the kernel stack is not * yet set up. Any exception in this window leads to hard to debug issues * and can be exploited for privilege escalation. Exceptions in the NMI * entry code also cause issues. Returning from the exception handler with * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack. * * For these reasons, the kernel avoids #VEs during the syscall gap and * the NMI entry code. Entry code paths do not access TD-shared memory, * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves * that might generate #VE. VMM can remove memory from TD at any point, * but access to unaccepted (or missing) private memory leads to VM * termination, not to #VE. * * Similarly to page faults and breakpoints, #VEs are allowed in NMI * handlers once the kernel is ready to deal with nested NMIs. * * During #VE delivery, all interrupts, including NMIs, are blocked until * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads * the VE info. * * If a guest kernel action which would normally cause a #VE occurs in * the interrupt-disabled region before TDGETVEINFO, a #DF (fault * exception) is delivered to the guest which will result in an oops. * * The entry code has been audited carefully for following these expectations. 
* Changes in the entry code have to be audited for correctness vs. this * aspect. Similarly to #PF, #VE in these places will expose kernel to * privilege escalation or may lead to random crashes. */ DEFINE_IDTENTRY(exc_virtualization_exception) { struct ve_info ve; /* * NMIs/Machine-checks/Interrupts will be in a disabled state * till TDGETVEINFO TDCALL is executed. This ensures that VE * info cannot be overwritten by a nested #VE. */ tdx_get_ve_info(&ve); cond_local_irq_enable(regs); /* * If tdx_handle_virt_exception() could not process * it successfully, treat it as #GP(0) and handle it. */ if (!tdx_handle_virt_exception(regs, &ve)) ve_raise_fault(regs, 0, ve.gla); cond_local_irq_disable(regs); } #endif #ifdef CONFIG_X86_32 DEFINE_IDTENTRY_SW(iret_error) { local_irq_enable(); if (notify_die(DIE_TRAP, "iret exception", regs, 0, X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0, ILL_BADSTK, (void __user *)NULL); } local_irq_disable(); } #endif void __init trap_init(void) { /* Init cpu_entry_area before IST entries are set up */ setup_cpu_entry_areas(); /* Init GHCB memory pages when running as an SEV-ES guest */ sev_es_init_vc_handling(); /* Initialize TSS before setting up traps so ISTs work */ cpu_init_exception_handling(true); /* Setup traps as cpu_init() might #GP */ if (!cpu_feature_enabled(X86_FEATURE_FRED)) idt_setup_traps(); cpu_init(); }
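The handlers above ultimately reach user space as signals: exc_divide_error() raises SIGFPE with FPE_INTDIV, handle_invalid_op() raises SIGILL with ILL_ILLOPN and si_addr pointing at the faulting instruction, and do_int3_user() raises SIGTRAP. A minimal, illustrative userspace sketch (not part of traps.c; it assumes an x86-64 Linux host built with gcc, and every name in it is local to the example) that observes those si_code/si_addr values:

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>

static sigjmp_buf resume;

/* Print what the kernel's force_sig_fault() filled in, then skip past the
 * faulting instruction. printf() is not async-signal-safe; fine for a demo. */
static void handler(int sig, siginfo_t *info, void *uctx)
{
	printf("sig=%d si_code=%d si_addr=%p\n", sig, info->si_code, info->si_addr);
	siglongjmp(resume, 1);
}

int main(void)
{
	struct sigaction sa = { .sa_sigaction = handler, .sa_flags = SA_SIGINFO };
	volatile int zero = 0;

	sigaction(SIGFPE, &sa, NULL);
	sigaction(SIGILL, &sa, NULL);
	sigaction(SIGTRAP, &sa, NULL);

	if (!sigsetjmp(resume, 1))
		printf("%d\n", 1 / zero);	/* #DE -> exc_divide_error() -> SIGFPE, FPE_INTDIV */
	if (!sigsetjmp(resume, 1))
		__builtin_trap();		/* UD2 -> exc_invalid_op() -> SIGILL, ILL_ILLOPN */
	if (!sigsetjmp(resume, 1))
		__asm__ volatile("int3");	/* #BP -> exc_int3() -> SIGTRAP */
	return 0;
}

Because each trap is taken in user mode, do_trap_no_signal() returns -1 instead of calling die(), and the do_trap()/force_sig_fault() path delivers the signal with the si_code values noted in the comments.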
// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux IPv6 multicast routing support for BSD pim6sd * Based on net/ipv4/ipmr.c.
* * (c) 2004 Mickael Hoerdt, <hoerdt@clarinet.u-strasbg.fr> * LSIIT Laboratory, Strasbourg, France * (c) 2004 Jean-Philippe Andriot, <jean-philippe.andriot@6WIND.com> * 6WIND, Paris, France * Copyright (C)2007,2008 USAGI/WIDE Project * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> */ #include <linux/uaccess.h> #include <linux/types.h> #include <linux/sched.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/kernel.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/socket.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/rhashtable.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/raw.h> #include <linux/notifier.h> #include <linux/if_arp.h> #include <net/checksum.h> #include <net/netlink.h> #include <net/fib_rules.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <linux/mroute6.h> #include <linux/pim.h> #include <net/addrconf.h> #include <linux/netfilter_ipv6.h> #include <linux/export.h> #include <net/ip6_checksum.h> #include <linux/netconf.h> #include <net/ip_tunnels.h> #include <linux/nospec.h> struct ip6mr_rule { struct fib_rule common; }; struct ip6mr_result { struct mr_table *mrt; }; /* Big lock, protecting vif table, mrt cache and mroute socket state. Note that the changes are semaphored via rtnl_lock. */ static DEFINE_SPINLOCK(mrt_lock); static struct net_device *vif_dev_read(const struct vif_device *vif) { return rcu_dereference(vif->dev); } /* Multicast router control variables */ /* Special spinlock for queue of unresolved entries */ static DEFINE_SPINLOCK(mfc_unres_lock); /* We return to original Alan's scheme. Hash table of resolved entries is changed only in process context and protected with weak lock mrt_lock. Queue of unresolved entries is protected with strong spinlock mfc_unres_lock. In this case data path is free of exclusive locks at all. 
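   (On the data path, resolved entries are looked up under RCU via the
   MFC rhashtable; the entries themselves are added and deleted only
   under RTNL.)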
*/ static struct kmem_cache *mrt_cachep __read_mostly; static struct mr_table *ip6mr_new_table(struct net *net, u32 id); static void ip6mr_free_table(struct mr_table *mrt); static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *cache); static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert); static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd); static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt); static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb); static void mroute_clean_tables(struct mr_table *mrt, int flags); static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES #define ip6mr_for_each_table(mrt, net) \ list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list, \ lockdep_rtnl_is_held() || \ list_empty(&net->ipv6.mr6_tables)) static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { struct mr_table *ret; if (!mrt) ret = list_entry_rcu(net->ipv6.mr6_tables.next, struct mr_table, list); else ret = list_entry_rcu(mrt->list.next, struct mr_table, list); if (&ret->list == &net->ipv6.mr6_tables) return NULL; return ret; } static struct mr_table *__ip6mr_get_table(struct net *net, u32 id) { struct mr_table *mrt; ip6mr_for_each_table(mrt, net) { if (mrt->id == id) return mrt; } return NULL; } static struct mr_table *ip6mr_get_table(struct net *net, u32 id) { struct mr_table *mrt; rcu_read_lock(); mrt = __ip6mr_get_table(net, id); rcu_read_unlock(); return mrt; } static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { int err; struct ip6mr_result res; struct fib_lookup_arg arg = { .result = &res, .flags = FIB_LOOKUP_NOREF, }; /* update flow if oif or iif point to device enslaved to l3mdev */ l3mdev_update_flow(net, flowi6_to_flowi(flp6)); err = fib_rules_lookup(net->ipv6.mr6_rules_ops, flowi6_to_flowi(flp6), 0, &arg); if (err < 0) return err; *mrt = res.mrt; return 0; } static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { struct ip6mr_result *res = arg->result; struct mr_table *mrt; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: return -ENETUNREACH; case FR_ACT_PROHIBIT: return -EACCES; case FR_ACT_BLACKHOLE: default: return -EINVAL; } arg->table = fib_rule_get_table(rule, arg); mrt = __ip6mr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; return 0; } static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags) { return 1; } static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { return 0; } static int ip6mr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { return 1; } static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh) { frh->dst_len = 0; frh->src_len = 0; frh->tos = 0; return 0; } static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = { .family = RTNL_FAMILY_IP6MR, .rule_size = sizeof(struct ip6mr_rule), .addr_size = sizeof(struct in6_addr), .action = ip6mr_rule_action, .match = ip6mr_rule_match, .configure = ip6mr_rule_configure, .compare = 
ip6mr_rule_compare, .fill = ip6mr_rule_fill, .nlgroup = RTNLGRP_IPV6_RULE, .owner = THIS_MODULE, }; static int __net_init ip6mr_rules_init(struct net *net) { struct fib_rules_ops *ops; struct mr_table *mrt; int err; ops = fib_rules_register(&ip6mr_rules_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); INIT_LIST_HEAD(&net->ipv6.mr6_tables); mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); if (IS_ERR(mrt)) { err = PTR_ERR(mrt); goto err1; } err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT); if (err < 0) goto err2; net->ipv6.mr6_rules_ops = ops; return 0; err2: rtnl_lock(); ip6mr_free_table(mrt); rtnl_unlock(); err1: fib_rules_unregister(ops); return err; } static void __net_exit ip6mr_rules_exit(struct net *net) { struct mr_table *mrt, *next; ASSERT_RTNL(); list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) { list_del(&mrt->list); ip6mr_free_table(mrt); } fib_rules_unregister(net->ipv6.mr6_rules_ops); } static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack); } static unsigned int ip6mr_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR); } bool ip6mr_rule_default(const struct fib_rule *rule) { return fib_rule_matchall(rule) && rule->action == FR_ACT_TO_TBL && rule->table == RT6_TABLE_DFLT && !rule->l3mdev; } EXPORT_SYMBOL(ip6mr_rule_default); #else #define ip6mr_for_each_table(mrt, net) \ for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { if (!mrt) return net->ipv6.mrt6; return NULL; } static struct mr_table *ip6mr_get_table(struct net *net, u32 id) { return net->ipv6.mrt6; } #define __ip6mr_get_table ip6mr_get_table static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { *mrt = net->ipv6.mrt6; return 0; } static int __net_init ip6mr_rules_init(struct net *net) { struct mr_table *mrt; mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); if (IS_ERR(mrt)) return PTR_ERR(mrt); net->ipv6.mrt6 = mrt; return 0; } static void __net_exit ip6mr_rules_exit(struct net *net) { ASSERT_RTNL(); ip6mr_free_table(net->ipv6.mrt6); net->ipv6.mrt6 = NULL; } static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static unsigned int ip6mr_rules_seq_read(const struct net *net) { return 0; } #endif static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct mfc6_cache_cmp_arg *cmparg = arg->key; struct mfc6_cache *c = (struct mfc6_cache *)ptr; return !ipv6_addr_equal(&c->mf6c_mcastgrp, &cmparg->mf6c_mcastgrp) || !ipv6_addr_equal(&c->mf6c_origin, &cmparg->mf6c_origin); } static const struct rhashtable_params ip6mr_rht_params = { .head_offset = offsetof(struct mr_mfc, mnode), .key_offset = offsetof(struct mfc6_cache, cmparg), .key_len = sizeof(struct mfc6_cache_cmp_arg), .nelem_hint = 3, .obj_cmpfn = ip6mr_hash_cmp, .automatic_shrinking = true, }; static void ip6mr_new_table_set(struct mr_table *mrt, struct net *net) { #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables); #endif } static struct mfc6_cache_cmp_arg ip6mr_mr_table_ops_cmparg_any = { .mf6c_origin = IN6ADDR_ANY_INIT, .mf6c_mcastgrp = IN6ADDR_ANY_INIT, }; static struct mr_table_ops ip6mr_mr_table_ops = { .rht_params = &ip6mr_rht_params, .cmparg_any = &ip6mr_mr_table_ops_cmparg_any, }; static struct mr_table *ip6mr_new_table(struct net *net, u32 id) { 
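	/* Return the table with this id if one already exists; otherwise
	 * allocate it, wiring up the MFC rhashtable and the unresolved-entry
	 * expiry timer via mr_table_alloc().
	 */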
struct mr_table *mrt; mrt = __ip6mr_get_table(net, id); if (mrt) return mrt; return mr_table_alloc(net, id, &ip6mr_mr_table_ops, ipmr_expire_process, ip6mr_new_table_set); } static void ip6mr_free_table(struct mr_table *mrt) { struct net *net = read_pnet(&mrt->net); WARN_ON_ONCE(!mr_can_free_table(net)); timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC | MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } #ifdef CONFIG_PROC_FS /* The /proc interfaces to multicast routing * /proc/ip6_mr_cache /proc/ip6_mr_vif */ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt; rcu_read_lock(); mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) { rcu_read_unlock(); return ERR_PTR(-ENOENT); } iter->mrt = mrt; return mr_vif_seq_start(seq, pos); } static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) { struct mr_vif_iter *iter = seq->private; struct mr_table *mrt = iter->mrt; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Interface BytesIn PktsIn BytesOut PktsOut Flags\n"); } else { const struct vif_device *vif = v; const struct net_device *vif_dev; const char *name; vif_dev = vif_dev_read(vif); name = vif_dev ? vif_dev->name : "none"; seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X\n", vif - mrt->vif_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, vif->flags); } return 0; } static const struct seq_operations ip6mr_vif_seq_ops = { .start = ip6mr_vif_seq_start, .next = mr_vif_seq_next, .stop = ip6mr_vif_seq_stop, .show = ip6mr_vif_seq_show, }; static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); struct mr_table *mrt; mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) return ERR_PTR(-ENOENT); return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) { int n; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Group " "Origin " "Iif Pkts Bytes Wrong Oifs\n"); } else { const struct mfc6_cache *mfc = v; const struct mr_mfc_iter *it = seq->private; struct mr_table *mrt = it->mrt; seq_printf(seq, "%pI6 %pI6 %-3hd", &mfc->mf6c_mcastgrp, &mfc->mf6c_origin, mfc->_c.mfc_parent); if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", atomic_long_read(&mfc->_c.mfc_un.res.pkt), atomic_long_read(&mfc->_c.mfc_un.res.bytes), atomic_long_read(&mfc->_c.mfc_un.res.wrong_if)); for (n = mfc->_c.mfc_un.res.minvif; n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && mfc->_c.mfc_un.res.ttls[n] < 255) seq_printf(seq, " %2d:%-3d", n, mfc->_c.mfc_un.res.ttls[n]); } } else { /* unresolved mfc_caches don't contain * pkt, bytes and wrong_if values */ seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul); } seq_putc(seq, '\n'); } return 0; } static const struct seq_operations ipmr_mfc_seq_ops = { .start = ipmr_mfc_seq_start, .next = mr_mfc_seq_next, .stop = mr_mfc_seq_stop, .show = ipmr_mfc_seq_show, }; #endif #ifdef CONFIG_IPV6_PIMSM_V2 static int pim6_rcv(struct sk_buff *skb) { struct pimreghdr *pim; struct ipv6hdr *encap; struct net_device *reg_dev = NULL; struct net *net = dev_net(skb->dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; int reg_vif_num; if (!pskb_may_pull(skb, 
sizeof(*pim) + sizeof(*encap))) goto drop; pim = (struct pimreghdr *)skb_transport_header(skb); if (pim->type != ((PIM_VERSION << 4) | PIM_TYPE_REGISTER) || (pim->flags & PIM_NULL_REGISTER) || (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, sizeof(*pim), IPPROTO_PIM, csum_partial((void *)pim, sizeof(*pim), 0)) && csum_fold(skb_checksum(skb, 0, skb->len, 0)))) goto drop; /* check if the inner packet is destined to mcast group */ encap = (struct ipv6hdr *)(skb_transport_header(skb) + sizeof(*pim)); if (!ipv6_addr_is_multicast(&encap->daddr) || encap->payload_len == 0 || ntohs(encap->payload_len) + sizeof(*pim) > skb->len) goto drop; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) goto drop; /* Pairs with WRITE_ONCE() in mif6_add()/mif6_delete() */ reg_vif_num = READ_ONCE(mrt->mroute_reg_vif_num); if (reg_vif_num >= 0) reg_dev = vif_dev_read(&mrt->vif_table[reg_vif_num]); if (!reg_dev) goto drop; skb->mac_header = skb->network_header; skb_pull(skb, (u8 *)encap - skb->data); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = CHECKSUM_NONE; skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); netif_rx(skb); return 0; drop: kfree_skb(skb); return 0; } static const struct inet6_protocol pim6_protocol = { .handler = pim6_rcv, }; /* Service routines creating virtual interfaces: PIMREG */ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_mark = skb->mark, }; if (!pskb_inet_may_pull(skb)) goto tx_err; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) goto tx_err; DEV_STATS_ADD(dev, tx_bytes, skb->len); DEV_STATS_INC(dev, tx_packets); rcu_read_lock(); ip6mr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num), MRT6MSG_WHOLEPKT); rcu_read_unlock(); kfree_skb(skb); return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } static int reg_vif_get_iflink(const struct net_device *dev) { return 0; } static const struct net_device_ops reg_vif_netdev_ops = { .ndo_start_xmit = reg_vif_xmit, .ndo_get_iflink = reg_vif_get_iflink, }; static void reg_vif_setup(struct net_device *dev) { dev->type = ARPHRD_PIMREG; dev->mtu = 1500 - sizeof(struct ipv6hdr) - 8; dev->flags = IFF_NOARP; dev->netdev_ops = &reg_vif_netdev_ops; dev->needs_free_netdev = true; dev->netns_immutable = true; } static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) { struct net_device *dev; char name[IFNAMSIZ]; if (mrt->id == RT6_TABLE_DFLT) sprintf(name, "pim6reg"); else sprintf(name, "pim6reg%u", mrt->id); dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup); if (!dev) return NULL; dev_net_set(dev, net); if (register_netdevice(dev)) { free_netdev(dev); return NULL; } if (dev_open(dev, NULL)) goto failure; dev_hold(dev); return dev; failure: unregister_netdevice(dev); return NULL; } #endif static int call_ip6mr_vif_entry_notifiers(struct net *net, enum fib_event_type event_type, struct vif_device *vif, struct net_device *vif_dev, mifi_t vif_index, u32 tb_id) { return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type, vif, vif_dev, vif_index, tb_id, &net->ipv6.ipmr_seq); } static int call_ip6mr_mfc_entry_notifiers(struct net *net, enum fib_event_type event_type, struct mfc6_cache *mfc, u32 tb_id) { return mr_call_mfc_notifiers(net, RTNL_FAMILY_IP6MR, event_type, &mfc->_c, tb_id, &net->ipv6.ipmr_seq); } /* Delete a VIF entry */ static int 
mif6_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { struct vif_device *v; struct net_device *dev; struct inet6_dev *in6_dev; if (vifi < 0 || vifi >= mrt->maxvif) return -EADDRNOTAVAIL; v = &mrt->vif_table[vifi]; dev = rtnl_dereference(v->dev); if (!dev) return -EADDRNOTAVAIL; call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_VIF_DEL, v, dev, vifi, mrt->id); spin_lock(&mrt_lock); RCU_INIT_POINTER(v->dev, NULL); #ifdef CONFIG_IPV6_PIMSM_V2 if (vifi == mrt->mroute_reg_vif_num) { /* Pairs with READ_ONCE() in ip6mr_cache_report() and reg_vif_xmit() */ WRITE_ONCE(mrt->mroute_reg_vif_num, -1); } #endif if (vifi + 1 == mrt->maxvif) { int tmp; for (tmp = vifi - 1; tmp >= 0; tmp--) { if (VIF_EXISTS(mrt, tmp)) break; } WRITE_ONCE(mrt->maxvif, tmp + 1); } spin_unlock(&mrt_lock); dev_set_allmulti(dev, -1); in6_dev = __in6_dev_get(dev); if (in6_dev) { atomic_dec(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); } if ((v->flags & MIFF_REGISTER) && !notify) unregister_netdevice_queue(dev, head); netdev_put(dev, &v->dev_tracker); return 0; } static inline void ip6mr_cache_free_rcu(struct rcu_head *head) { struct mr_mfc *c = container_of(head, struct mr_mfc, rcu); kmem_cache_free(mrt_cachep, (struct mfc6_cache *)c); } static inline void ip6mr_cache_free(struct mfc6_cache *c) { call_rcu(&c->_c.rcu, ip6mr_cache_free_rcu); } /* Destroy an unresolved cache entry, killing queued skbs and reporting error to netlink readers. */ static void ip6mr_destroy_unres(struct mr_table *mrt, struct mfc6_cache *c) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; atomic_dec(&mrt->cache_resolve_queue_len); while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved)) != NULL) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr *)nlmsg_data(nlh))->error = -ETIMEDOUT; rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else kfree_skb(skb); } ip6mr_cache_free(c); } /* Timer process for all the unresolved queue. */ static void ipmr_do_expire_process(struct mr_table *mrt) { unsigned long now = jiffies; unsigned long expires = 10 * HZ; struct mr_mfc *c, *next; list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { if (time_after(c->mfc_un.unres.expires, now)) { /* not yet... */ unsigned long interval = c->mfc_un.unres.expires - now; if (interval < expires) expires = interval; continue; } list_del(&c->list); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } if (!list_empty(&mrt->mfc_unres_queue)) mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); } static void ipmr_expire_process(struct timer_list *t) { struct mr_table *mrt = timer_container_of(mrt, t, ipmr_expire_timer); if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies + 1); return; } if (!list_empty(&mrt->mfc_unres_queue)) ipmr_do_expire_process(mrt); spin_unlock(&mfc_unres_lock); } /* Fill oifs list. It is called under locked mrt_lock. 
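 *	Slots left at 255 in the ttl array are never forwarded to (a packet's
 *	hop limit can never exceed 255); minvif/maxvif bound the range scanned
 *	on the forwarding path.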
*/ static void ip6mr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, unsigned char *ttls) { int vifi; cache->mfc_un.res.minvif = MAXMIFS; cache->mfc_un.res.maxvif = 0; memset(cache->mfc_un.res.ttls, 255, MAXMIFS); for (vifi = 0; vifi < mrt->maxvif; vifi++) { if (VIF_EXISTS(mrt, vifi) && ttls[vifi] && ttls[vifi] < 255) { cache->mfc_un.res.ttls[vifi] = ttls[vifi]; if (cache->mfc_un.res.minvif > vifi) cache->mfc_un.res.minvif = vifi; if (cache->mfc_un.res.maxvif <= vifi) cache->mfc_un.res.maxvif = vifi + 1; } } WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies); } static int mif6_add(struct net *net, struct mr_table *mrt, struct mif6ctl *vifc, int mrtsock) { int vifi = vifc->mif6c_mifi; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct inet6_dev *in6_dev; int err; /* Is vif busy ? */ if (VIF_EXISTS(mrt, vifi)) return -EADDRINUSE; switch (vifc->mif6c_flags) { #ifdef CONFIG_IPV6_PIMSM_V2 case MIFF_REGISTER: /* * Special Purpose VIF in PIM * All the packets will be sent to the daemon */ if (mrt->mroute_reg_vif_num >= 0) return -EADDRINUSE; dev = ip6mr_reg_vif(net, mrt); if (!dev) return -ENOBUFS; err = dev_set_allmulti(dev, 1); if (err) { unregister_netdevice(dev); dev_put(dev); return err; } break; #endif case 0: dev = dev_get_by_index(net, vifc->mif6c_pifi); if (!dev) return -EADDRNOTAVAIL; err = dev_set_allmulti(dev, 1); if (err) { dev_put(dev); return err; } break; default: return -EINVAL; } in6_dev = __in6_dev_get(dev); if (in6_dev) { atomic_inc(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); } /* Fill in the VIF structures */ vif_device_init(v, dev, vifc->vifc_rate_limit, vifc->vifc_threshold, vifc->mif6c_flags | (!mrtsock ? 
VIFF_STATIC : 0), MIFF_REGISTER); /* And finish update writing critical data */ spin_lock(&mrt_lock); rcu_assign_pointer(v->dev, dev); netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC); #ifdef CONFIG_IPV6_PIMSM_V2 if (v->flags & MIFF_REGISTER) WRITE_ONCE(mrt->mroute_reg_vif_num, vifi); #endif if (vifi + 1 > mrt->maxvif) WRITE_ONCE(mrt->maxvif, vifi + 1); spin_unlock(&mrt_lock); call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev, vifi, mrt->id); return 0; } static struct mfc6_cache *ip6mr_cache_find(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = *origin, .mf6c_mcastgrp = *mcastgrp, }; return mr_mfc_find(mrt, &arg); } /* Look for a (*,G) entry */ static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt, struct in6_addr *mcastgrp, mifi_t mifi) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = in6addr_any, .mf6c_mcastgrp = *mcastgrp, }; if (ipv6_addr_any(mcastgrp)) return mr_mfc_find_any_parent(mrt, mifi); return mr_mfc_find_any(mrt, mifi, &arg); } /* Look for a (S,G,iif) entry if parent != -1 */ static struct mfc6_cache * ip6mr_cache_find_parent(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp, int parent) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = *origin, .mf6c_mcastgrp = *mcastgrp, }; return mr_mfc_find_parent(mrt, &arg, parent); } /* Allocate a multicast cache entry */ static struct mfc6_cache *ip6mr_cache_alloc(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (!c) return NULL; c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->_c.mfc_un.res.minvif = MAXMIFS; c->_c.free = ip6mr_cache_free_rcu; refcount_set(&c->_c.mfc_un.res.refcount, 1); return c; } static struct mfc6_cache *ip6mr_cache_alloc_unres(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (!c) return NULL; skb_queue_head_init(&c->_c.mfc_un.unres.unresolved); c->_c.mfc_un.unres.expires = jiffies + 10 * HZ; return c; } /* * A cache entry has gone into a resolved state from queued */ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, struct mfc6_cache *uc, struct mfc6_cache *c) { struct sk_buff *skb; /* * Play the pending entries through our router */ while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); if (mr_fill_mroute(mrt, skb, &c->_c, nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE; } rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else { rcu_read_lock(); ip6_mr_forward(net, mrt, skb->dev, skb, c); rcu_read_unlock(); } } } /* * Bounce a cache query up to pim6sd and netlink. 
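 * For MRT6MSG_NOCACHE and MRT6MSG_WRONGMIF reports only the IPv6 header of
 * the triggering packet is copied; MRT6MSG_WHOLEPKT and MRT6MSG_WRMIFWHOLE
 * (the PIM register path) carry the whole packet.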
* * Called under rcu_read_lock() */ static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert) { struct sock *mroute6_sk; struct sk_buff *skb; struct mrt6msg *msg; int ret; #ifdef CONFIG_IPV6_PIMSM_V2 if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt) +sizeof(*msg)); else #endif skb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(*msg), GFP_ATOMIC); if (!skb) return -ENOBUFS; /* I suppose that internal messages * do not require checksums */ skb->ip_summed = CHECKSUM_UNNECESSARY; #ifdef CONFIG_IPV6_PIMSM_V2 if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) { /* Ugly, but we have no choice with this interface. Duplicate old header, fix length etc. And all this only to mangle msg->im6_msgtype and to set msg->im6_mbz to "mbz" :-) */ __skb_pull(skb, skb_network_offset(pkt)); skb_push(skb, sizeof(*msg)); skb_reset_transport_header(skb); msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; msg->im6_msgtype = assert; if (assert == MRT6MSG_WRMIFWHOLE) msg->im6_mif = mifi; else msg->im6_mif = READ_ONCE(mrt->mroute_reg_vif_num); msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; skb->ip_summed = CHECKSUM_UNNECESSARY; } else #endif { /* * Copy the IP header */ skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); skb_copy_to_linear_data(skb, ipv6_hdr(pkt), sizeof(struct ipv6hdr)); /* * Add our header */ skb_put(skb, sizeof(*msg)); skb_reset_transport_header(skb); msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; msg->im6_msgtype = assert; msg->im6_mif = mifi; msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; skb_dst_set(skb, dst_clone(skb_dst(pkt))); skb->ip_summed = CHECKSUM_UNNECESSARY; } mroute6_sk = rcu_dereference(mrt->mroute_sk); if (!mroute6_sk) { kfree_skb(skb); return -EINVAL; } mrt6msg_netlink_event(mrt, skb); /* Deliver to user space multicast routing algorithms */ ret = sock_queue_rcv_skb(mroute6_sk, skb); if (ret < 0) { net_warn_ratelimited("mroute6: pending queue full, dropping entries\n"); kfree_skb(skb); } return ret; } /* Queue a packet for resolution. It gets locked cache entry! 
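 * The skb is appended to the matching unresolved (S,G) entry (only a few
 * packets are queued per entry), a MRT6MSG_NOCACHE report is bounced to the
 * daemon, and the entry expires after about ten seconds if never resolved.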
*/ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, struct sk_buff *skb, struct net_device *dev) { struct mfc6_cache *c; bool found = false; int err; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) && ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) { found = true; break; } } if (!found) { /* * Create a new entry if allowable */ c = ip6mr_cache_alloc_unres(); if (!c) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); return -ENOBUFS; } /* Fill in the new cache entry */ c->_c.mfc_parent = -1; c->mf6c_origin = ipv6_hdr(skb)->saddr; c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr; /* * Reflect first query at pim6sd */ err = ip6mr_cache_report(mrt, skb, mifi, MRT6MSG_NOCACHE); if (err < 0) { /* If the report failed throw the cache entry out - Brad Parker */ spin_unlock_bh(&mfc_unres_lock); ip6mr_cache_free(c); kfree_skb(skb); return err; } atomic_inc(&mrt->cache_resolve_queue_len); list_add(&c->_c.list, &mrt->mfc_unres_queue); mr6_netlink_event(mrt, c, RTM_NEWROUTE); ipmr_do_expire_process(mrt); } /* See if we can append the packet */ if (c->_c.mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { if (dev) { skb->dev = dev; skb->skb_iif = dev->ifindex; } skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); err = 0; } spin_unlock_bh(&mfc_unres_lock); return err; } /* * MFC6 cache manipulation by user space */ static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc, int parent) { struct mfc6_cache *c; /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, &mfc->mf6cc_mcastgrp.sin6_addr, parent); rcu_read_unlock(); if (!c) return -ENOENT; rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ip6mr_rht_params); list_del_rcu(&c->_c.list); call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_ENTRY_DEL, c, mrt->id); mr6_netlink_event(mrt, c, RTM_DELROUTE); mr_cache_put(&c->_c); return 0; } static int ip6mr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct mr_table *mrt; struct vif_device *v; int ct; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; ip6mr_for_each_table(mrt, net) { v = &mrt->vif_table[0]; for (ct = 0; ct < mrt->maxvif; ct++, v++) { if (rcu_access_pointer(v->dev) == dev) mif6_delete(mrt, ct, 1, NULL); } } return NOTIFY_DONE; } static unsigned int ip6mr_seq_read(const struct net *net) { return READ_ONCE(net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); } static int ip6mr_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump, ip6mr_mr_table_iter, extack); } static struct notifier_block ip6_mr_notifier = { .notifier_call = ip6mr_device_event }; static const struct fib_notifier_ops ip6mr_notifier_ops_template = { .family = RTNL_FAMILY_IP6MR, .fib_seq_read = ip6mr_seq_read, .fib_dump = ip6mr_dump, .owner = THIS_MODULE, }; static int __net_init ip6mr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; net->ipv6.ipmr_seq = 0; ops = fib_notifier_ops_register(&ip6mr_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv6.ip6mr_notifier_ops = ops; return 0; } static void __net_exit ip6mr_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv6.ip6mr_notifier_ops); net->ipv6.ip6mr_notifier_ops = NULL; } /* 
Setup for IP multicast routing */ static int __net_init ip6mr_net_init(struct net *net) { int err; err = ip6mr_notifier_init(net); if (err) return err; err = ip6mr_rules_init(net); if (err < 0) goto ip6mr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; if (!proc_create_net("ip6_mr_vif", 0, net->proc_net, &ip6mr_vif_seq_ops, sizeof(struct mr_vif_iter))) goto proc_vif_fail; if (!proc_create_net("ip6_mr_cache", 0, net->proc_net, &ipmr_mfc_seq_ops, sizeof(struct mr_mfc_iter))) goto proc_cache_fail; #endif return 0; #ifdef CONFIG_PROC_FS proc_cache_fail: remove_proc_entry("ip6_mr_vif", net->proc_net); proc_vif_fail: rtnl_lock(); ip6mr_rules_exit(net); rtnl_unlock(); #endif ip6mr_rules_fail: ip6mr_notifier_exit(net); return err; } static void __net_exit ip6mr_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("ip6_mr_cache", net->proc_net); remove_proc_entry("ip6_mr_vif", net->proc_net); #endif ip6mr_notifier_exit(net); } static void __net_exit ip6mr_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) ip6mr_rules_exit(net); rtnl_unlock(); } static struct pernet_operations ip6mr_net_ops = { .init = ip6mr_net_init, .exit = ip6mr_net_exit, .exit_batch = ip6mr_net_exit_batch, }; static const struct rtnl_msg_handler ip6mr_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = RTNL_FAMILY_IP6MR, .msgtype = RTM_GETROUTE, .doit = ip6mr_rtm_getroute, .dumpit = ip6mr_rtm_dumproute}, }; int __init ip6_mr_init(void) { int err; mrt_cachep = KMEM_CACHE(mfc6_cache, SLAB_HWCACHE_ALIGN); if (!mrt_cachep) return -ENOMEM; err = register_pernet_subsys(&ip6mr_net_ops); if (err) goto reg_pernet_fail; err = register_netdevice_notifier(&ip6_mr_notifier); if (err) goto reg_notif_fail; #ifdef CONFIG_IPV6_PIMSM_V2 if (inet6_add_protocol(&pim6_protocol, IPPROTO_PIM) < 0) { pr_err("%s: can't add PIM protocol\n", __func__); err = -EAGAIN; goto add_proto_fail; } #endif err = rtnl_register_many(ip6mr_rtnl_msg_handlers); if (!err) return 0; #ifdef CONFIG_IPV6_PIMSM_V2 inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); add_proto_fail: unregister_netdevice_notifier(&ip6_mr_notifier); #endif reg_notif_fail: unregister_pernet_subsys(&ip6mr_net_ops); reg_pernet_fail: kmem_cache_destroy(mrt_cachep); return err; } void __init ip6_mr_cleanup(void) { rtnl_unregister_many(ip6mr_rtnl_msg_handlers); #ifdef CONFIG_IPV6_PIMSM_V2 inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); #endif unregister_netdevice_notifier(&ip6_mr_notifier); unregister_pernet_subsys(&ip6mr_net_ops); kmem_cache_destroy(mrt_cachep); } static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, struct mf6cctl *mfc, int mrtsock, int parent) { unsigned char ttls[MAXMIFS]; struct mfc6_cache *uc, *c; struct mr_mfc *_uc; bool found; int i, err; if (mfc->mf6cc_parent >= MAXMIFS) return -ENFILE; memset(ttls, 255, MAXMIFS); for (i = 0; i < MAXMIFS; i++) { if (IF_ISSET(i, &mfc->mf6cc_ifset)) ttls[i] = 1; } /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, &mfc->mf6cc_mcastgrp.sin6_addr, parent); rcu_read_unlock(); if (c) { spin_lock(&mrt_lock); c->_c.mfc_parent = mfc->mf6cc_parent; ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; spin_unlock(&mrt_lock); call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } if (!ipv6_addr_any(&mfc->mf6cc_mcastgrp.sin6_addr) && 
!ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr)) return -EINVAL; c = ip6mr_cache_alloc(); if (!c) return -ENOMEM; c->mf6c_origin = mfc->mf6cc_origin.sin6_addr; c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr; c->_c.mfc_parent = mfc->mf6cc_parent; ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; err = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode, ip6mr_rht_params); if (err) { pr_err("ip6mr: rhtable insert error %d\n", err); ip6mr_cache_free(c); return err; } list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list); /* Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. */ found = false; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) { uc = (struct mfc6_cache *)_uc; if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) && ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) { list_del(&_uc->list); atomic_dec(&mrt->cache_resolve_queue_len); found = true; break; } } if (list_empty(&mrt->mfc_unres_queue)) timer_delete(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); if (found) { ip6mr_cache_resolve(net, mrt, uc, c); ip6mr_cache_free(uc); } call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } /* * Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, int flags) { struct mr_mfc *c, *tmp; LIST_HEAD(list); int i; /* Shut down all active vif entries */ if (flags & (MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC)) { for (i = 0; i < mrt->maxvif; i++) { if (((mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS_STATIC)) || (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS))) continue; mif6_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); } /* Wipe the cache */ if (flags & (MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC)) { list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC_STATIC)) || (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC))) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params); list_del_rcu(&c->list); call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_ENTRY_DEL, (struct mfc6_cache *)c, mrt->id); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); mr_cache_put(c); } } if (flags & MRT6_FLUSH_MFC) { if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } spin_unlock_bh(&mfc_unres_lock); } } } static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk) { int err = 0; struct net *net = sock_net(sk); rtnl_lock(); spin_lock(&mrt_lock); if (rtnl_dereference(mrt->mroute_sk)) { err = -EADDRINUSE; } else { rcu_assign_pointer(mrt->mroute_sk, sk); sock_set_flag(sk, SOCK_RCU_FREE); atomic_inc(&net->ipv6.devconf_all->mc_forwarding); } spin_unlock(&mrt_lock); if (!err) inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); rtnl_unlock(); return err; } int ip6mr_sk_done(struct sock *sk) { struct net *net = sock_net(sk); struct ipv6_devconf *devconf; struct mr_table *mrt; int err = -EACCES; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return err; devconf = 
net->ipv6.devconf_all; if (!devconf || !atomic_read(&devconf->mc_forwarding)) return err; rtnl_lock(); ip6mr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { spin_lock(&mrt_lock); RCU_INIT_POINTER(mrt->mroute_sk, NULL); /* Note that mroute_sk had SOCK_RCU_FREE set, * so the RCU grace period before sk freeing * is guaranteed by sk_destruct() */ atomic_dec(&devconf->mc_forwarding); spin_unlock(&mrt_lock); inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MFC); err = 0; break; } } rtnl_unlock(); return err; } bool mroute6_is_socket(struct net *net, struct sk_buff *skb) { struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_oif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) return NULL; return rcu_access_pointer(mrt->mroute_sk); } EXPORT_SYMBOL(mroute6_is_socket); /* * Socket options and virtual interface manipulation. The whole * virtual interface system is a complete heap, but unfortunately * that's how BSD mrouted happens to think. Maybe one day with a proper * MOSPF/PIM router set up we can clean this up. */ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { int ret, parent = 0; struct mif6ctl vif; struct mf6cctl mfc; mifi_t mifi; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; if (optname != MRT6_INIT) { if (sk != rcu_access_pointer(mrt->mroute_sk) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EACCES; } switch (optname) { case MRT6_INIT: if (optlen < sizeof(int)) return -EINVAL; return ip6mr_sk_init(mrt, sk); case MRT6_DONE: return ip6mr_sk_done(sk); case MRT6_ADD_MIF: if (optlen < sizeof(vif)) return -EINVAL; if (copy_from_sockptr(&vif, optval, sizeof(vif))) return -EFAULT; if (vif.mif6c_mifi >= MAXMIFS) return -ENFILE; rtnl_lock(); ret = mif6_add(net, mrt, &vif, sk == rtnl_dereference(mrt->mroute_sk)); rtnl_unlock(); return ret; case MRT6_DEL_MIF: if (optlen < sizeof(mifi_t)) return -EINVAL; if (copy_from_sockptr(&mifi, optval, sizeof(mifi_t))) return -EFAULT; rtnl_lock(); ret = mif6_delete(mrt, mifi, 0, NULL); rtnl_unlock(); return ret; /* * Manipulate the forwarding caches. These live * in a sort of kernel/user symbiosis. 
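 *	MRT6_ADD_MFC/MRT6_DEL_MFC look an entry up regardless of its parent
 *	MIF; the *_PROXY variants additionally match on mf6cc_parent, which is
 *	how (*,G) and (*,*) proxy entries are maintained.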
*/ case MRT6_ADD_MFC: case MRT6_DEL_MFC: parent = -1; fallthrough; case MRT6_ADD_MFC_PROXY: case MRT6_DEL_MFC_PROXY: if (optlen < sizeof(mfc)) return -EINVAL; if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) return -EFAULT; if (parent == 0) parent = mfc.mf6cc_parent; rtnl_lock(); if (optname == MRT6_DEL_MFC || optname == MRT6_DEL_MFC_PROXY) ret = ip6mr_mfc_delete(mrt, &mfc, parent); else ret = ip6mr_mfc_add(net, mrt, &mfc, sk == rtnl_dereference(mrt->mroute_sk), parent); rtnl_unlock(); return ret; case MRT6_FLUSH: { int flags; if (optlen != sizeof(flags)) return -EINVAL; if (copy_from_sockptr(&flags, optval, sizeof(flags))) return -EFAULT; rtnl_lock(); mroute_clean_tables(mrt, flags); rtnl_unlock(); return 0; } /* * Control PIM assert (to activate pim will activate assert) */ case MRT6_ASSERT: { int v; if (optlen != sizeof(v)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; mrt->mroute_do_assert = v; return 0; } #ifdef CONFIG_IPV6_PIMSM_V2 case MRT6_PIM: { bool do_wrmifwhole; int v; if (optlen != sizeof(v)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; do_wrmifwhole = (v == MRT6MSG_WRMIFWHOLE); v = !!v; rtnl_lock(); ret = 0; if (v != mrt->mroute_do_pim) { mrt->mroute_do_pim = v; mrt->mroute_do_assert = v; mrt->mroute_do_wrvifwhole = do_wrmifwhole; } rtnl_unlock(); return ret; } #endif #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES case MRT6_TABLE: { u32 v; if (optlen != sizeof(u32)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; /* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */ if (v != RT_TABLE_DEFAULT && v >= 100000000) return -EINVAL; if (sk == rcu_access_pointer(mrt->mroute_sk)) return -EBUSY; rtnl_lock(); ret = 0; mrt = ip6mr_new_table(net, v); if (IS_ERR(mrt)) ret = PTR_ERR(mrt); else raw6_sk(sk)->ip6mr_table = v; rtnl_unlock(); return ret; } #endif /* * Spurious command, or MRT6_VERSION which you cannot * set. */ default: return -ENOPROTOOPT; } } /* * Getsock opt support for the multicast routing system. */ int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, sockptr_t optlen) { int olr; int val; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (optname) { case MRT6_VERSION: val = 0x0305; break; #ifdef CONFIG_IPV6_PIMSM_V2 case MRT6_PIM: val = mrt->mroute_do_pim; break; #endif case MRT6_ASSERT: val = mrt->mroute_do_assert; break; default: return -ENOPROTOOPT; } if (copy_from_sockptr(&olr, optlen, sizeof(int))) return -EFAULT; olr = min_t(int, olr, sizeof(int)); if (olr < 0) return -EINVAL; if (copy_to_sockptr(optlen, &olr, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, olr)) return -EFAULT; return 0; } /* * The IP multicast ioctl support routines. */ int ip6mr_ioctl(struct sock *sk, int cmd, void *arg) { struct sioc_sg_req6 *sr; struct sioc_mif_req6 *vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? 
: RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETMIFCNT_IN6: vr = (struct sioc_mif_req6 *)arg; if (vr->mifi >= mrt->maxvif) return -EINVAL; vr->mifi = array_index_nospec(vr->mifi, mrt->maxvif); rcu_read_lock(); vif = &mrt->vif_table[vr->mifi]; if (VIF_EXISTS(mrt, vr->mifi)) { vr->icount = READ_ONCE(vif->pkt_in); vr->ocount = READ_ONCE(vif->pkt_out); vr->ibytes = READ_ONCE(vif->bytes_in); vr->obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: sr = (struct sioc_sg_req6 *)arg; rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr->src.sin6_addr, &sr->grp.sin6_addr); if (c) { sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #ifdef CONFIG_COMPAT struct compat_sioc_sg_req6 { struct sockaddr_in6 src; struct sockaddr_in6 grp; compat_ulong_t pktcnt; compat_ulong_t bytecnt; compat_ulong_t wrong_if; }; struct compat_sioc_mif_req6 { mifi_t mifi; compat_ulong_t icount; compat_ulong_t ocount; compat_ulong_t ibytes; compat_ulong_t obytes; }; int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { struct compat_sioc_sg_req6 sr; struct compat_sioc_mif_req6 vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETMIFCNT_IN6: if (copy_from_user(&vr, arg, sizeof(vr))) return -EFAULT; if (vr.mifi >= mrt->maxvif) return -EINVAL; vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif); rcu_read_lock(); vif = &mrt->vif_table[vr.mifi]; if (VIF_EXISTS(mrt, vr.mifi)) { vr.icount = READ_ONCE(vif->pkt_in); vr.ocount = READ_ONCE(vif->pkt_out); vr.ibytes = READ_ONCE(vif->bytes_in); vr.obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); if (copy_to_user(arg, &vr, sizeof(vr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: if (copy_from_user(&sr, arg, sizeof(sr))) return -EFAULT; rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { sr.pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #endif static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTFORWDATAGRAMS); return dst_output(net, sk, skb); } /* * Processing handlers for ip6mr_forward */ static int ip6mr_prepare_xmit(struct net *net, struct mr_table *mrt, struct sk_buff *skb, int vifi) { struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *vif_dev; struct ipv6hdr *ipv6h; struct dst_entry *dst; struct flowi6 fl6; vif_dev = vif_dev_read(vif); if (!vif_dev) return -1; #ifdef CONFIG_IPV6_PIMSM_V2 if (vif->flags & MIFF_REGISTER) { WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); DEV_STATS_INC(vif_dev, tx_packets); ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT); return -1; } #endif ipv6h = 
ipv6_hdr(skb); fl6 = (struct flowi6) { .flowi6_oif = vif->link, .daddr = ipv6h->daddr, }; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); return -1; } skb_dst_drop(skb); skb_dst_set(skb, dst); /* * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output * interfaces. It is clear, if mrouter runs a multicasting * program, it should receive packets not depending to what interface * program is joined. * If we will not make it, the program will have to join on all * interfaces. On the other hand, multihoming host (or router, but * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ skb->dev = vif_dev; WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); /* We are about to write */ /* XXX: extension headers? */ if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(vif_dev))) return -1; ipv6h = ipv6_hdr(skb); ipv6h->hop_limit--; return 0; } static void ip6mr_forward2(struct net *net, struct mr_table *mrt, struct sk_buff *skb, int vifi) { struct net_device *indev = skb->dev; if (ip6mr_prepare_xmit(net, mrt, skb, vifi)) goto out_free; IP6CB(skb)->flags |= IP6SKB_FORWARDED; NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, net, NULL, skb, indev, skb->dev, ip6mr_forward2_finish); return; out_free: kfree_skb(skb); } static void ip6mr_output2(struct net *net, struct mr_table *mrt, struct sk_buff *skb, int vifi) { if (ip6mr_prepare_xmit(net, mrt, skb, vifi)) goto out_free; ip6_output(net, NULL, skb); return; out_free: kfree_skb(skb); } /* Called with rcu_read_lock() */ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev) { int ct; /* Pairs with WRITE_ONCE() in mif6_delete()/mif6_add() */ for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) { if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev) break; } return ct; } /* Called under rcu_read_lock() */ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *c) { int psend = -1; int vif, ct; int true_vifi = ip6mr_find_vif(mrt, dev); vif = c->_c.mfc_parent; atomic_long_inc(&c->_c.mfc_un.res.pkt); atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); if (ipv6_addr_any(&c->mf6c_origin) && true_vifi >= 0) { struct mfc6_cache *cache_proxy; /* For an (*,G) entry, we only check that the incoming * interface is part of the static tree. */ cache_proxy = mr_mfc_find_any_parent(mrt, vif); if (cache_proxy && cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) goto forward; } /* * Wrong interface: drop packet and (maybe) send PIM assert. */ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) { atomic_long_inc(&c->_c.mfc_un.res.wrong_if); if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, so that we cannot check that packet arrived on an oif. It is bad, but otherwise we would need to move pretty large chunk of pimd to kernel. Ough... 
--ANK */ (mrt->mroute_do_pim || c->_c.mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, c->_c.mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF); if (mrt->mroute_do_wrvifwhole) ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRMIFWHOLE); } goto dont_forward; } forward: WRITE_ONCE(mrt->vif_table[vif].pkt_in, mrt->vif_table[vif].pkt_in + 1); WRITE_ONCE(mrt->vif_table[vif].bytes_in, mrt->vif_table[vif].bytes_in + skb->len); /* * Forward the frame */ if (ipv6_addr_any(&c->mf6c_origin) && ipv6_addr_any(&c->mf6c_mcastgrp)) { if (true_vifi >= 0 && true_vifi != c->_c.mfc_parent && ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { /* It's an (*,*) entry and the packet is not coming from * the upstream: forward the packet to the upstream * only. */ psend = c->_c.mfc_parent; goto last_forward; } goto dont_forward; } for (ct = c->_c.mfc_un.res.maxvif - 1; ct >= c->_c.mfc_un.res.minvif; ct--) { /* For (*,G) entry, don't forward to the incoming interface */ if ((!ipv6_addr_any(&c->mf6c_origin) || ct != true_vifi) && ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) ip6mr_forward2(net, mrt, skb2, psend); } psend = ct; } } last_forward: if (psend != -1) { ip6mr_forward2(net, mrt, skb, psend); return; } dont_forward: kfree_skb(skb); } /* Called under rcu_read_lock() */ static void ip6_mr_output_finish(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *c) { int psend = -1; int ct; WARN_ON_ONCE(!rcu_read_lock_held()); atomic_long_inc(&c->_c.mfc_un.res.pkt); atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); /* Forward the frame */ if (ipv6_addr_any(&c->mf6c_origin) && ipv6_addr_any(&c->mf6c_mcastgrp)) { if (ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { /* It's an (*,*) entry and the packet is not coming from * the upstream: forward the packet to the upstream * only. */ psend = c->_c.mfc_parent; goto last_forward; } goto dont_forward; } for (ct = c->_c.mfc_un.res.maxvif - 1; ct >= c->_c.mfc_un.res.minvif; ct--) { if (ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2; skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) ip6mr_output2(net, mrt, skb2, psend); } psend = ct; } } last_forward: if (psend != -1) { ip6mr_output2(net, mrt, skb, psend); return; } dont_forward: kfree_skb(skb); } /* * Multicast packets for forwarding arrive here */ int ip6_mr_input(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct net *net = dev_net_rcu(dev); struct mfc6_cache *cache; struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = dev->ifindex, .flowi6_mark = skb->mark, }; int err; /* skb->dev passed in is the master dev for vrfs. * Get the proper interface that does have a vif associated with it. 
*/ if (netif_is_l3_master(dev)) { dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); if (!dev) { kfree_skb(skb); return -ENODEV; } } err = ip6mr_fib_lookup(net, &fl6, &mrt); if (err < 0) { kfree_skb(skb); return err; } cache = ip6mr_cache_find(mrt, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); if (!cache) { int vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &ipv6_hdr(skb)->daddr, vif); } /* * No usable cache entry */ if (!cache) { int vif; vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) { int err = ip6mr_cache_unresolved(mrt, vif, skb, dev); return err; } kfree_skb(skb); return -ENODEV; } ip6_mr_forward(net, mrt, dev, skb, cache); return 0; } int ip6_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; struct flowi6 fl6 = (struct flowi6) { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_mark = skb->mark, }; struct mfc6_cache *cache; struct mr_table *mrt; int err; int vif; guard(rcu)(); if (IP6CB(skb)->flags & IP6SKB_FORWARDED) goto ip6_output; if (!(IP6CB(skb)->flags & IP6SKB_MCROUTE)) goto ip6_output; err = ip6mr_fib_lookup(net, &fl6, &mrt); if (err < 0) { kfree_skb(skb); return err; } cache = ip6mr_cache_find(mrt, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); if (!cache) { vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &ipv6_hdr(skb)->daddr, vif); } /* No usable cache entry */ if (!cache) { vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) return ip6mr_cache_unresolved(mrt, vif, skb, dev); goto ip6_output; } /* Wrong interface */ vif = cache->_c.mfc_parent; if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) goto ip6_output; ip6_mr_output_finish(net, mrt, dev, skb, cache); return 0; ip6_output: return ip6_output(net, sk, skb); } int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, u32 portid) { int err; struct mr_table *mrt; struct mfc6_cache *cache; struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); rcu_read_lock(); mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) { rcu_read_unlock(); return -ENOENT; } cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); if (!cache && skb->dev) { int vif = ip6mr_find_vif(mrt, skb->dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &rt->rt6i_dst.addr, vif); } if (!cache) { struct sk_buff *skb2; struct ipv6hdr *iph; struct net_device *dev; int vif; dev = skb->dev; if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) { rcu_read_unlock(); return -ENODEV; } /* really correct? 
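	   A dummy IPv6 header with version 0 is built below so that, when the
	   route resolves or times out, ip6mr_cache_resolve() and
	   ip6mr_destroy_unres() recognise this skb as a netlink request and
	   answer it via rtnl_unicast() instead of forwarding it.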
*/ skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb2) { rcu_read_unlock(); return -ENOMEM; } NETLINK_CB(skb2).portid = portid; skb_reset_transport_header(skb2); skb_put(skb2, sizeof(struct ipv6hdr)); skb_reset_network_header(skb2); iph = ipv6_hdr(skb2); iph->version = 0; iph->priority = 0; iph->flow_lbl[0] = 0; iph->flow_lbl[1] = 0; iph->flow_lbl[2] = 0; iph->payload_len = 0; iph->nexthdr = IPPROTO_NONE; iph->hop_limit = 0; iph->saddr = rt->rt6i_src.addr; iph->daddr = rt->rt6i_dst.addr; err = ip6mr_cache_unresolved(mrt, vif, skb2, dev); rcu_read_unlock(); return err; } err = mr_fill_mroute(mrt, skb, &cache->_c, rtm); rcu_read_unlock(); return err; } static int ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mfc6_cache *c, int cmd, int flags) { struct nlmsghdr *nlh; struct rtmsg *rtm; int err; nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = RTNL_FAMILY_IP6MR; rtm->rtm_dst_len = 128; rtm->rtm_src_len = 128; rtm->rtm_tos = 0; rtm->rtm_table = mrt->id; if (nla_put_u32(skb, RTA_TABLE, mrt->id)) goto nla_put_failure; rtm->rtm_type = RTN_MULTICAST; rtm->rtm_scope = RT_SCOPE_UNIVERSE; if (c->_c.mfc_flags & MFC_STATIC) rtm->rtm_protocol = RTPROT_STATIC; else rtm->rtm_protocol = RTPROT_MROUTED; rtm->rtm_flags = 0; if (nla_put_in6_addr(skb, RTA_SRC, &c->mf6c_origin) || nla_put_in6_addr(skb, RTA_DST, &c->mf6c_mcastgrp)) goto nla_put_failure; err = mr_fill_mroute(mrt, skb, &c->_c, rtm); /* do not break the dump if cache is unresolved */ if (err < 0 && err != -ENOENT) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int _ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags) { return ip6mr_fill_mroute(mrt, skb, portid, seq, (struct mfc6_cache *)c, cmd, flags); } static int mr6_msgsize(bool unresolved, int maxvif) { size_t len = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(sizeof(struct in6_addr)) /* RTA_SRC */ + nla_total_size(sizeof(struct in6_addr)) /* RTA_DST */ ; if (!unresolved) len = len + nla_total_size(4) /* RTA_IIF */ + nla_total_size(0) /* RTA_MULTIPATH */ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) /* RTA_MFC_STATS */ + nla_total_size_64bit(sizeof(struct rta_mfc_stats)) ; return len; } static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS, mrt->maxvif), GFP_ATOMIC); if (!skb) goto errout; err = ip6mr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); if (err < 0) goto errout; rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE, NULL, GFP_ATOMIC); return; errout: kfree_skb(skb); rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err); } static size_t mrt6msg_netlink_msgsize(size_t payloadlen) { size_t len = NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(1) /* IP6MRA_CREPORT_MSGTYPE */ + nla_total_size(4) /* IP6MRA_CREPORT_MIF_ID */ /* IP6MRA_CREPORT_SRC_ADDR */ + nla_total_size(sizeof(struct in6_addr)) /* IP6MRA_CREPORT_DST_ADDR */ + nla_total_size(sizeof(struct in6_addr)) /* IP6MRA_CREPORT_PKT */ + nla_total_size(payloadlen) ; return len; } static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt) { struct net *net = read_pnet(&mrt->net); struct nlmsghdr *nlh; struct rtgenmsg *rtgenm; struct mrt6msg 
*msg; struct sk_buff *skb; struct nlattr *nla; int payloadlen; payloadlen = pkt->len - sizeof(struct mrt6msg); msg = (struct mrt6msg *)skb_transport_header(pkt); skb = nlmsg_new(mrt6msg_netlink_msgsize(payloadlen), GFP_ATOMIC); if (!skb) goto errout; nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT, sizeof(struct rtgenmsg), 0); if (!nlh) goto errout; rtgenm = nlmsg_data(nlh); rtgenm->rtgen_family = RTNL_FAMILY_IP6MR; if (nla_put_u8(skb, IP6MRA_CREPORT_MSGTYPE, msg->im6_msgtype) || nla_put_u32(skb, IP6MRA_CREPORT_MIF_ID, msg->im6_mif) || nla_put_in6_addr(skb, IP6MRA_CREPORT_SRC_ADDR, &msg->im6_src) || nla_put_in6_addr(skb, IP6MRA_CREPORT_DST_ADDR, &msg->im6_dst)) goto nla_put_failure; nla = nla_reserve(skb, IP6MRA_CREPORT_PKT, payloadlen); if (!nla || skb_copy_bits(pkt, sizeof(struct mrt6msg), nla_data(nla), payloadlen)) goto nla_put_failure; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE_R, NULL, GFP_ATOMIC); return; nla_put_failure: nlmsg_cancel(skb, nlh); errout: kfree_skb(skb); rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE_R, -ENOBUFS); } static const struct nla_policy ip6mr_getroute_policy[RTA_MAX + 1] = { [RTA_SRC] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [RTA_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [RTA_TABLE] = { .type = NLA_U32 }, }; static int ip6mr_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int err; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, ip6mr_getroute_policy, extack); if (err) return err; rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for multicast route get request"); return -EINVAL; } if ((tb[RTA_SRC] && !rtm->rtm_src_len) || (tb[RTA_DST] && !rtm->rtm_dst_len)) { NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); return -EINVAL; } return 0; } static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct in6_addr src = {}, grp = {}; struct nlattr *tb[RTA_MAX + 1]; struct mfc6_cache *cache; struct mr_table *mrt; struct sk_buff *skb; u32 tableid; int err; err = ip6mr_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) return err; if (tb[RTA_SRC]) src = nla_get_in6_addr(tb[RTA_SRC]); if (tb[RTA_DST]) grp = nla_get_in6_addr(tb[RTA_DST]); tableid = nla_get_u32_default(tb[RTA_TABLE], 0); mrt = __ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT); if (!mrt) { NL_SET_ERR_MSG_MOD(extack, "MR table does not exist"); return -ENOENT; } /* entries are added/deleted only under RTNL */ rcu_read_lock(); cache = ip6mr_cache_find(mrt, &src, &grp); rcu_read_unlock(); if (!cache) { NL_SET_ERR_MSG_MOD(extack, "MR cache entry not found"); return -ENOENT; } skb = nlmsg_new(mr6_msgsize(false, mrt->maxvif), GFP_KERNEL); if (!skb) return -ENOBUFS; err = ip6mr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0); if (err < 0) { kfree_skb(skb); return err; } return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); } static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct fib_dump_filter filter = { .rtnl_held = true, }; int err; if (cb->strict_check) { err = 
ip_valid_fib_dump_req(sock_net(skb->sk), nlh, &filter, cb); if (err < 0) return err; } if (filter.table_id) { struct mr_table *mrt; mrt = __ip6mr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR) return skb->len; NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist"); return -ENOENT; } err = mr_table_dump(mrt, skb, cb, _ip6mr_fill_mroute, &mfc_unres_lock, &filter); return skb->len ? : err; } return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter, _ip6mr_fill_mroute, &mfc_unres_lock, &filter); }
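/*
 * Hedged illustration (not part of the kernel source above): a minimal
 * userspace-style sketch of the RTM_GETROUTE request that
 * ip6mr_rtm_valid_getroute_req() accepts.  rtm_src_len and rtm_dst_len must
 * both be 128, every other rtmsg field must be zero, RTA_SRC/RTA_DST carry a
 * struct in6_addr each, and RTA_TABLE is optional.  The struct layout, the
 * 64-byte attribute buffer and build_ip6mr_getroute() are assumptions made
 * only for this sketch; the reply is a single RTM_NEWROUTE message filled in
 * by ip6mr_fill_mroute().
 */
#include <string.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <netinet/in.h>

struct ip6mr_getroute_req {
	struct nlmsghdr nlh;
	struct rtmsg rtm;
	char attrs[64];			/* room for RTA_SRC + RTA_DST */
};

static void build_ip6mr_getroute(struct ip6mr_getroute_req *req,
				 const struct in6_addr *src,
				 const struct in6_addr *grp)
{
	struct rtattr *rta;

	memset(req, 0, sizeof(*req));
	req->nlh.nlmsg_type = RTM_GETROUTE;
	req->nlh.nlmsg_flags = NLM_F_REQUEST;
	req->nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));

	req->rtm.rtm_family = RTNL_FAMILY_IP6MR;
	req->rtm.rtm_src_len = 128;	/* anything else is rejected */
	req->rtm.rtm_dst_len = 128;

	/* RTA_SRC: multicast source address */
	rta = (struct rtattr *)((char *)req + NLMSG_ALIGN(req->nlh.nlmsg_len));
	rta->rta_type = RTA_SRC;
	rta->rta_len = RTA_LENGTH(sizeof(*src));
	memcpy(RTA_DATA(rta), src, sizeof(*src));
	req->nlh.nlmsg_len = NLMSG_ALIGN(req->nlh.nlmsg_len) + RTA_ALIGN(rta->rta_len);

	/* RTA_DST: multicast group address */
	rta = (struct rtattr *)((char *)req + NLMSG_ALIGN(req->nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(sizeof(*grp));
	memcpy(RTA_DATA(rta), grp, sizeof(*grp));
	req->nlh.nlmsg_len = NLMSG_ALIGN(req->nlh.nlmsg_len) + RTA_ALIGN(rta->rta_len);
}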
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMU_NOTIFIER_H #define _LINUX_MMU_NOTIFIER_H #include <linux/list.h> #include <linux/spinlock.h> #include <linux/mm_types.h> #include <linux/mmap_lock.h> #include <linux/srcu.h> #include <linux/interval_tree.h> struct mmu_notifier_subscriptions; struct mmu_notifier; struct mmu_notifier_range; struct mmu_interval_notifier; /** * enum mmu_notifier_event - reason for the mmu notifier callback * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that * move the range * * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like * madvise() or replacing a page by another one, ...). 
* * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range * ie using the vma access permission (vm_page_prot) to update the whole range * is enough no need to inspect changes to the CPU page table (mprotect() * syscall) * * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for * pages in the range so to mirror those changes the user must inspect the CPU * page table (from the end callback). * * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same * access flags). User should soft dirty the page in the end callback to make * sure that anyone relying on soft dirtiness catch pages that might be written * through non CPU mappings. * * @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal * that the mm refcount is zero and the range is no longer accessible. * * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal * a device driver to possibly ignore the invalidation if the * owner field matches the driver's device private pgmap owner. * * @MMU_NOTIFY_EXCLUSIVE: conversion of a page table entry to device-exclusive. * The owner is initialized to the value provided by the caller of * make_device_exclusive(), such that this caller can filter out these * events. */ enum mmu_notifier_event { MMU_NOTIFY_UNMAP = 0, MMU_NOTIFY_CLEAR, MMU_NOTIFY_PROTECTION_VMA, MMU_NOTIFY_PROTECTION_PAGE, MMU_NOTIFY_SOFT_DIRTY, MMU_NOTIFY_RELEASE, MMU_NOTIFY_MIGRATE, MMU_NOTIFY_EXCLUSIVE, }; #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) struct mmu_notifier_ops { /* * Called either by mmu_notifier_unregister or when the mm is * being destroyed by exit_mmap, always before all pages are * freed. This can run concurrently with other mmu notifier * methods (the ones invoked outside the mm context) and it * should tear down all secondary mmu mappings and freeze the * secondary mmu. If this method isn't implemented you've to * be sure that nothing could possibly write to the pages * through the secondary mmu by the time the last thread with * tsk->mm == mm exits. * * As side note: the pages freed after ->release returns could * be immediately reallocated by the gart at an alias physical * address with a different cache model, so if ->release isn't * implemented because all _software_ driven memory accesses * through the secondary mmu are terminated by the time the * last thread of this mm quits, you've also to be sure that * speculative _hardware_ operations can't allocate dirty * cachelines in the cpu that could not be snooped and made * coherent with the other read and write operations happening * through the gart alias address, so leading to memory * corruption. */ void (*release)(struct mmu_notifier *subscription, struct mm_struct *mm); /* * clear_flush_young is called after the VM is * test-and-clearing the young/accessed bitflag in the * pte. This way the VM will provide proper aging to the * accesses to the page through the secondary MMUs and not * only to the ones through the Linux pte. * Start-end is necessary in case the secondary MMU is mapping the page * at a smaller granularity than the primary MMU. */ int (*clear_flush_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * clear_young is a lightweight version of clear_flush_young. Like the * latter, it is supposed to test-and-clear the young/accessed bitflag * in the secondary pte, but it may omit flushing the secondary tlb. 
*/ int (*clear_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * test_young is called to check the young/accessed bitflag in * the secondary pte. This is used to know if the page is * frequently used without actually clearing the flag or tearing * down the secondary mapping on the page. */ int (*test_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long address); /* * invalidate_range_start() and invalidate_range_end() must be * paired and are called only when the mmap_lock and/or the * locks protecting the reverse maps are held. If the subsystem * can't guarantee that no additional references are taken to * the pages in the range, it has to implement the * invalidate_range() notifier to remove any references taken * after invalidate_range_start(). * * Invalidation of multiple concurrent ranges may be * optionally permitted by the driver. Either way the * establishment of sptes is forbidden in the range passed to * invalidate_range_begin/end for the whole duration of the * invalidate_range_begin/end critical section. * * invalidate_range_start() is called when all pages in the * range are still mapped and have at least a refcount of one. * * invalidate_range_end() is called when all pages in the * range have been unmapped and the pages have been freed by * the VM. * * The VM will remove the page table entries and potentially * the page between invalidate_range_start() and * invalidate_range_end(). If the page must not be freed * because of pending I/O or other circumstances then the * invalidate_range_start() callback (or the initial mapping * by the driver) must make sure that the refcount is kept * elevated. * * If the driver increases the refcount when the pages are * initially mapped into an address space then either * invalidate_range_start() or invalidate_range_end() may * decrease the refcount. If the refcount is decreased on * invalidate_range_start() then the VM can free pages as page * table entries are removed. If the refcount is only * dropped on invalidate_range_end() then the driver itself * will drop the last refcount but it must take care to flush * any secondary tlb before doing the final free on the * page. Pages will no longer be referenced by the linux * address space but may still be referenced by sptes until * the last refcount is dropped. * * If blockable argument is set to false then the callback cannot * sleep and has to return with -EAGAIN if sleeping would be required. * 0 should be returned otherwise. Please note that notifiers that can * fail invalidate_range_start are not allowed to implement * invalidate_range_end, as there is no mechanism for informing the * notifier that its start failed. */ int (*invalidate_range_start)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range); void (*invalidate_range_end)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range); /* * arch_invalidate_secondary_tlbs() is used to manage a non-CPU TLB * which shares page-tables with the CPU. The * invalidate_range_start()/end() callbacks should not be implemented as * invalidate_secondary_tlbs() already catches the points in time when * an external TLB needs to be flushed. * * This requires arch_invalidate_secondary_tlbs() to be called while * holding the ptl spin-lock and therefore this callback is not allowed * to sleep. * * This is called by architecture code whenever invalidating a TLB * entry. 
It is assumed that any secondary TLB has the same rules for * when invalidations are required. If this is not the case architecture * code will need to call this explicitly when required for secondary * TLB invalidation. */ void (*arch_invalidate_secondary_tlbs)( struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * These callbacks are used with the get/put interface to manage the * lifetime of the mmu_notifier memory. alloc_notifier() returns a new * notifier for use with the mm. * * free_notifier() is only called after the mmu_notifier has been * fully put, calls to any ops callback are prevented and no ops * callbacks are currently running. It is called from a SRCU callback * and cannot sleep. */ struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm); void (*free_notifier)(struct mmu_notifier *subscription); }; /* * The notifier chains are protected by mmap_lock and/or the reverse map * semaphores. Notifier chains are only changed when all reverse maps and * the mmap_lock locks are taken. * * Therefore notifier chains can only be traversed when either * * 1. mmap_lock is held. * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem). * 3. No other concurrent thread can access the list (release) */ struct mmu_notifier { struct hlist_node hlist; const struct mmu_notifier_ops *ops; struct mm_struct *mm; struct rcu_head rcu; unsigned int users; }; /** * struct mmu_interval_notifier_ops * @invalidate: Upon return the caller must stop using any SPTEs within this * range. This function can sleep. Return false only if sleeping * was required but mmu_notifier_range_blockable(range) is false. */ struct mmu_interval_notifier_ops { bool (*invalidate)(struct mmu_interval_notifier *interval_sub, const struct mmu_notifier_range *range, unsigned long cur_seq); }; struct mmu_interval_notifier { struct interval_tree_node interval_tree; const struct mmu_interval_notifier_ops *ops; struct mm_struct *mm; struct hlist_node deferred_item; unsigned long invalidate_seq; }; #ifdef CONFIG_MMU_NOTIFIER #ifdef CONFIG_LOCKDEP extern struct lockdep_map __mmu_notifier_invalidate_range_start_map; #endif struct mmu_notifier_range { struct mm_struct *mm; unsigned long start; unsigned long end; unsigned flags; enum mmu_notifier_event event; void *owner; }; static inline int mm_has_notifiers(struct mm_struct *mm) { return unlikely(mm->notifier_subscriptions); } struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, struct mm_struct *mm); static inline struct mmu_notifier * mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm) { struct mmu_notifier *ret; mmap_write_lock(mm); ret = mmu_notifier_get_locked(ops, mm); mmap_write_unlock(mm); return ret; } void mmu_notifier_put(struct mmu_notifier *subscription); void mmu_notifier_synchronize(void); extern int mmu_notifier_register(struct mmu_notifier *subscription, struct mm_struct *mm); extern int __mmu_notifier_register(struct mmu_notifier *subscription, struct mm_struct *mm); extern void mmu_notifier_unregister(struct mmu_notifier *subscription, struct mm_struct *mm); unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub); int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops); int mmu_interval_notifier_insert_locked( struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long 
start, unsigned long length, const struct mmu_interval_notifier_ops *ops); void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub); /** * mmu_interval_set_seq - Save the invalidation sequence * @interval_sub - The subscription passed to invalidate * @cur_seq - The cur_seq passed to the invalidate() callback * * This must be called unconditionally from the invalidate callback of a * struct mmu_interval_notifier_ops under the same lock that is used to call * mmu_interval_read_retry(). It updates the sequence number for later use by * mmu_interval_read_retry(). The provided cur_seq will always be odd. * * If the caller does not call mmu_interval_read_begin() or * mmu_interval_read_retry() then this call is not required. */ static inline void mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub, unsigned long cur_seq) { WRITE_ONCE(interval_sub->invalidate_seq, cur_seq); } /** * mmu_interval_read_retry - End a read side critical section against a VA range * interval_sub: The subscription * seq: The return of the paired mmu_interval_read_begin() * * This MUST be called under a user provided lock that is also held * unconditionally by op->invalidate() when it calls mmu_interval_set_seq(). * * Each call should be paired with a single mmu_interval_read_begin() and * should be used to conclude the read side. * * Returns true if an invalidation collided with this critical section, and * the caller should retry. */ static inline bool mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub, unsigned long seq) { return interval_sub->invalidate_seq != seq; } /** * mmu_interval_check_retry - Test if a collision has occurred * interval_sub: The subscription * seq: The return of the matching mmu_interval_read_begin() * * This can be used in the critical section between mmu_interval_read_begin() * and mmu_interval_read_retry(). A return of true indicates an invalidation * has collided with this critical region and a future * mmu_interval_read_retry() will return true. * * False is not reliable and only suggests a collision may not have * occurred. It can be called many times and does not have to hold the user * provided lock. * * This call can be used as part of loops and other expensive operations to * expedite a retry. 
*/ static inline bool mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub, unsigned long seq) { /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */ return READ_ONCE(interval_sub->invalidate_seq) != seq; } extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end); extern int __mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end); extern int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end); extern bool mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE); } static inline void mmu_notifier_release(struct mm_struct *mm) { if (mm_has_notifiers(mm)) __mmu_notifier_release(mm); } static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_flush_young(mm, start, end); return 0; } static inline int mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_young(mm, start, end); return 0; } static inline int mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { if (mm_has_notifiers(mm)) return __mmu_notifier_test_young(mm, address); return 0; } static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { might_sleep(); lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); if (mm_has_notifiers(range->mm)) { range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; __mmu_notifier_invalidate_range_start(range); } lock_map_release(&__mmu_notifier_invalidate_range_start_map); } /* * This version of mmu_notifier_invalidate_range_start() avoids blocking, but it * can return an error if a notifier can't proceed without blocking, in which * case you're not allowed to modify PTEs in the specified range. * * This is mainly intended for OOM handling. 
*/ static inline int __must_check mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { int ret = 0; lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); if (mm_has_notifiers(range->mm)) { range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; ret = __mmu_notifier_invalidate_range_start(range); } lock_map_release(&__mmu_notifier_invalidate_range_start_map); return ret; } static inline void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { if (mmu_notifier_range_blockable(range)) might_sleep(); if (mm_has_notifiers(range->mm)) __mmu_notifier_invalidate_range_end(range); } static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) __mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) { mm->notifier_subscriptions = NULL; } static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { if (mm_has_notifiers(mm)) __mmu_notifier_subscriptions_destroy(mm); } static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned flags, struct mm_struct *mm, unsigned long start, unsigned long end) { range->event = event; range->mm = mm; range->start = start; range->end = end; range->flags = flags; } static inline void mmu_notifier_range_init_owner( struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned int flags, struct mm_struct *mm, unsigned long start, unsigned long end, void *owner) { mmu_notifier_range_init(range, event, flags, mm, start, end); range->owner = owner; } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = ptep_clear_flush_young(___vma, ___address, __ptep); \ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ ___address, \ ___address + \ PAGE_SIZE); \ __young; \ }) #define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ ___address, \ ___address + \ PMD_SIZE); \ __young; \ }) #define ptep_clear_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\ __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ ___address + PAGE_SIZE); \ __young; \ }) #define pmdp_clear_young_notify(__vma, __address, __pmdp) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\ __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ ___address + PMD_SIZE); \ __young; \ }) #else /* CONFIG_MMU_NOTIFIER */ struct mmu_notifier_range { unsigned long start; unsigned long end; }; static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, unsigned long start, unsigned long end) { range->start = start; range->end = end; } #define mmu_notifier_range_init(range,event,flags,mm,start,end) \ _mmu_notifier_range_init(range, start, end) #define mmu_notifier_range_init_owner(range, event, flags, mm, start, \ end, owner) \ _mmu_notifier_range_init(range, start, end) 
static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { return true; } static inline int mm_has_notifiers(struct mm_struct *mm) { return 0; } static inline void mmu_notifier_release(struct mm_struct *mm) { } static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end) { return 0; } static inline int mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end) { return 0; } static inline int mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { return 0; } static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { } static inline int mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { return 0; } static inline void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { } static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end) { } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) { } static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { } #define mmu_notifier_range_update_to_read_only(r) false #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young #define pmdp_clear_young_notify pmdp_test_and_clear_young static inline void mmu_notifier_synchronize(void) { } #endif /* CONFIG_MMU_NOTIFIER */ #endif /* _LINUX_MMU_NOTIFIER_H */
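/*
 * Hedged illustration (not part of the header above): the read-side
 * handshake that the mmu_interval_* kernel-doc describes, sketched for a
 * hypothetical driver.  my_driver_lock, my_zap_sptes() and
 * my_program_sptes() are made up; the sequence-number protocol around them
 * is the point being shown.
 */
#include <linux/mmu_notifier.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(my_driver_lock);

static void my_zap_sptes(unsigned long start, unsigned long end) { }	/* stub */
static void my_program_sptes(void) { }					/* stub */

static bool my_invalidate(struct mmu_interval_notifier *interval_sub,
			  const struct mmu_notifier_range *range,
			  unsigned long cur_seq)
{
	if (mmu_notifier_range_blockable(range))
		mutex_lock(&my_driver_lock);
	else if (!mutex_trylock(&my_driver_lock))
		return false;	/* sleeping required but not allowed */

	/* Must be called under the same lock used with read_retry(). */
	mmu_interval_set_seq(interval_sub, cur_seq);
	my_zap_sptes(range->start, range->end);
	mutex_unlock(&my_driver_lock);
	return true;
}

/* Would be passed to mmu_interval_notifier_insert() at setup time. */
static const struct mmu_interval_notifier_ops my_interval_ops = {
	.invalidate = my_invalidate,
};

/* Fault/mapping path: retry until no invalidation raced with us. */
static int my_map_range(struct mmu_interval_notifier *interval_sub)
{
	unsigned long seq;

	do {
		seq = mmu_interval_read_begin(interval_sub);
		/* ... walk the CPU page tables, stage the SPTEs ... */
		mutex_lock(&my_driver_lock);
		if (mmu_interval_read_retry(interval_sub, seq)) {
			mutex_unlock(&my_driver_lock);
			continue;	/* collided, start over */
		}
		my_program_sptes();
		mutex_unlock(&my_driver_lock);
		return 0;
	} while (1);
}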
// SPDX-License-Identifier: GPL-2.0 /* Lock down the kernel * * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public Licence * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. */ #include <linux/security.h> #include <linux/export.h> #include <linux/lsm_hooks.h> #include <uapi/linux/lsm.h> static enum lockdown_reason kernel_locked_down; static const enum lockdown_reason lockdown_levels[] = {LOCKDOWN_NONE, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_CONFIDENTIALITY_MAX}; /* * Put the kernel into lock-down mode. */ static int lock_kernel_down(const char *where, enum lockdown_reason level) { if (kernel_locked_down >= level) return -EPERM; kernel_locked_down = level; pr_notice("Kernel is locked down from %s; see man kernel_lockdown.7\n", where); return 0; } static int __init lockdown_param(char *level) { if (!level) return -EINVAL; if (strcmp(level, "integrity") == 0) lock_kernel_down("command line", LOCKDOWN_INTEGRITY_MAX); else if (strcmp(level, "confidentiality") == 0) lock_kernel_down("command line", LOCKDOWN_CONFIDENTIALITY_MAX); else return -EINVAL; return 0; } early_param("lockdown", lockdown_param); /** * lockdown_is_locked_down - Find out if the kernel is locked down * @what: Tag to use in notice generated if lockdown is in effect */ static int lockdown_is_locked_down(enum lockdown_reason what) { if (WARN(what >= LOCKDOWN_CONFIDENTIALITY_MAX, "Invalid lockdown reason")) return -EPERM; if (kernel_locked_down >= what) { if (lockdown_reasons[what]) pr_notice_ratelimited("Lockdown: %s: %s is restricted; see man kernel_lockdown.7\n", current->comm, lockdown_reasons[what]); return -EPERM; } return 0; } static struct security_hook_list lockdown_hooks[] __ro_after_init = { LSM_HOOK_INIT(locked_down, lockdown_is_locked_down), }; static const struct lsm_id lockdown_lsmid = { .name = "lockdown", .id = LSM_ID_LOCKDOWN, }; static int __init lockdown_lsm_init(void) { #if defined(CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY) lock_kernel_down("Kernel configuration", LOCKDOWN_INTEGRITY_MAX); #elif defined(CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY) lock_kernel_down("Kernel configuration", LOCKDOWN_CONFIDENTIALITY_MAX); #endif security_add_hooks(lockdown_hooks, ARRAY_SIZE(lockdown_hooks), &lockdown_lsmid); return 0; } static ssize_t lockdown_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { char temp[80] = ""; int i, offset = 0; for (i = 0; i < ARRAY_SIZE(lockdown_levels); i++) { enum lockdown_reason level = lockdown_levels[i]; if (lockdown_reasons[level]) { const char *label = lockdown_reasons[level]; if (kernel_locked_down == level) offset += sprintf(temp+offset, "[%s] ", label); else offset += sprintf(temp+offset, "%s ", label); } } /* Convert the last space to a newline if 
needed. */ if (offset > 0) temp[offset-1] = '\n'; return simple_read_from_buffer(buf, count, ppos, temp, strlen(temp)); } static ssize_t lockdown_write(struct file *file, const char __user *buf, size_t n, loff_t *ppos) { char *state; int i, len, err = -EINVAL; state = memdup_user_nul(buf, n); if (IS_ERR(state)) return PTR_ERR(state); len = strlen(state); if (len && state[len-1] == '\n') { state[len-1] = '\0'; len--; } for (i = 0; i < ARRAY_SIZE(lockdown_levels); i++) { enum lockdown_reason level = lockdown_levels[i]; const char *label = lockdown_reasons[level]; if (label && !strcmp(state, label)) err = lock_kernel_down("securityfs", level); } kfree(state); return err ? err : n; } static const struct file_operations lockdown_ops = { .read = lockdown_read, .write = lockdown_write, }; static int __init lockdown_secfs_init(void) { struct dentry *dentry; dentry = securityfs_create_file("lockdown", 0644, NULL, NULL, &lockdown_ops); return PTR_ERR_OR_ZERO(dentry); } #ifdef CONFIG_SECURITY_LOCKDOWN_LSM_EARLY DEFINE_EARLY_LSM(lockdown) = { #else DEFINE_LSM(lockdown) = { #endif .id = &lockdown_lsmid, .init = lockdown_lsm_init, .initcall_core = lockdown_secfs_init, };
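/*
 * Hedged illustration (not part of the file above): how other kernel code
 * consumes this LSM.  A subsystem asks security_locked_down() whether a
 * specific lockdown reason applies; with this module registered the call
 * lands in lockdown_is_locked_down() above and returns -EPERM once
 * kernel_locked_down covers that reason.  my_open_raw_memory() is a
 * hypothetical caller.
 */
#include <linux/security.h>
#include <linux/errno.h>

static int my_open_raw_memory(void)
{
	int err;

	/* Refused once lockdown=integrity (or stricter) is in effect. */
	err = security_locked_down(LOCKDOWN_DEV_MEM);
	if (err)
		return err;	/* -EPERM when locked down */

	/* ... proceed with the privileged operation ... */
	return 0;
}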
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * x86-optimized SHA-512 block function
 *
 * Copyright 2025 Google LLC
 */

#include <asm/fpu/api.h>
#include <linux/static_call.h>

DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic);

#define DEFINE_X86_SHA512_FN(c_fn, asm_fn)				\
	asmlinkage void asm_fn(struct sha512_block_state *state,	\
			       const u8 *data, size_t nblocks);		\
	static void c_fn(struct sha512_block_state *state, const u8 *data, \
			 size_t nblocks)				\
	{								\
		if (likely(irq_fpu_usable())) {				\
			kernel_fpu_begin();				\
			asm_fn(state, data, nblocks);			\
			kernel_fpu_end();				\
		} else {						\
			sha512_blocks_generic(state, data, nblocks);	\
		}							\
	}

DEFINE_X86_SHA512_FN(sha512_blocks_ssse3, sha512_transform_ssse3);
DEFINE_X86_SHA512_FN(sha512_blocks_avx, sha512_transform_avx);
DEFINE_X86_SHA512_FN(sha512_blocks_avx2, sha512_transform_rorx);

static void sha512_blocks(struct sha512_block_state *state,
			  const u8 *data, size_t nblocks)
{
	static_call(sha512_blocks_x86)(state, data, nblocks);
}

#define sha512_mod_init_arch sha512_mod_init_arch
static void sha512_mod_init_arch(void)
{
	if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) &&
	    boot_cpu_has(X86_FEATURE_AVX)) {
		if (boot_cpu_has(X86_FEATURE_AVX2) &&
		    boot_cpu_has(X86_FEATURE_BMI2))
			static_call_update(sha512_blocks_x86, sha512_blocks_avx2);
		else
			static_call_update(sha512_blocks_x86, sha512_blocks_avx);
	} else if (boot_cpu_has(X86_FEATURE_SSSE3)) {
		static_call_update(sha512_blocks_x86, sha512_blocks_ssse3);
	}
}
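/*
 * Hedged illustration (not part of the file above): the static_call
 * dispatch pattern used there, reduced to a toy example.  The default
 * target is the portable routine; an __init probe may retarget the key
 * once, after which every call site branches directly to the chosen
 * implementation instead of loading a function pointer.  All names below
 * are made up.
 */
#include <linux/static_call.h>
#include <linux/types.h>
#include <linux/init.h>

static void blit_generic(void *dst, const void *src, size_t len) { }	/* stub */
static void blit_fast(void *dst, const void *src, size_t len) { }	/* stub */
static bool have_fast_blit(void) { return false; }	/* hypothetical CPU probe */

DEFINE_STATIC_CALL(my_blit, blit_generic);

static void blit(void *dst, const void *src, size_t len)
{
	/* Patched at runtime to call blit_generic or blit_fast directly. */
	static_call(my_blit)(dst, src, len);
}

static int __init my_blit_init(void)
{
	if (have_fast_blit())
		static_call_update(my_blit, blit_fast);
	return 0;
}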
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/base.c * * Copyright (C) 1991, 1992 Linus Torvalds * * proc base directory handling functions * * 1999, Al Viro. Rewritten. Now it covers the whole per-process part. * Instead of using magical inumbers to determine the kind of object * we allocate and fill in-core inodes upon lookup. They don't even * go into icache. We cache the reference to task_struct upon lookup too. * Eventually it should become a filesystem in its own. We don't use the * rest of procfs anymore. * * * Changelog: * 17-Jan-2005 * Allan Bezerra * Bruna Moreira <bruna.moreira@indt.org.br> * Edjard Mota <edjard.mota@indt.org.br> * Ilias Biris <ilias.biris@indt.org.br> * Mauricio Lin <mauricio.lin@indt.org.br> * * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT * * A new process specific entry (smaps) included in /proc. It shows the * size of rss for each memory area. The maps entry lacks information * about physical memory size (rss) for each mapped file, i.e., * rss information for executables and library files. * This additional information is useful for any tools that need to know * about physical memory consumption for a process specific library. * * Changelog: * 21-Feb-2005 * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT * Pud inclusion in the page table walking. * * ChangeLog: * 10-Mar-2005 * 10LE Instituto Nokia de Tecnologia - INdT: * A better way to walks through the page table as suggested by Hugh Dickins. * * Simo Piiroinen <simo.piiroinen@nokia.com>: * Smaps information related to shared, private, clean and dirty pages. * * Paul Mundt <paul.mundt@nokia.com>: * Overall revision about smaps. 
*/ #include <linux/uaccess.h> #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/task_io_accounting_ops.h> #include <linux/init.h> #include <linux/capability.h> #include <linux/file.h> #include <linux/generic-radix-tree.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/namei.h> #include <linux/mnt_namespace.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/rcupdate.h> #include <linux/kallsyms.h> #include <linux/stacktrace.h> #include <linux/resource.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/ptrace.h> #include <linux/printk.h> #include <linux/cache.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/audit.h> #include <linux/poll.h> #include <linux/nsproxy.h> #include <linux/oom.h> #include <linux/elf.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/fs_parser.h> #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/debug.h> #include <linux/sched/stat.h> #include <linux/posix-timers.h> #include <linux/time_namespace.h> #include <linux/resctrl.h> #include <linux/cn_proc.h> #include <linux/ksm.h> #include <uapi/linux/lsm.h> #include <trace/events/oom.h> #include "internal.h" #include "fd.h" #include "../../lib/kstrtox.h" /* NOTE: * Implementing inode permission operations in /proc is almost * certainly an error. Permission checks need to happen during * each system call not at open time. The reason is that most of * what we wish to check for permissions in /proc varies at runtime. * * The classic example of a problem is opening file descriptors * in /proc for a task before it execs a suid executable. */ static u8 nlink_tid __ro_after_init; static u8 nlink_tgid __ro_after_init; enum proc_mem_force { PROC_MEM_FORCE_ALWAYS, PROC_MEM_FORCE_PTRACE, PROC_MEM_FORCE_NEVER }; static enum proc_mem_force proc_mem_force_override __ro_after_init = IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER : IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? PROC_MEM_FORCE_PTRACE : PROC_MEM_FORCE_ALWAYS; static const struct constant_table proc_mem_force_table[] __initconst = { { "always", PROC_MEM_FORCE_ALWAYS }, { "ptrace", PROC_MEM_FORCE_PTRACE }, { "never", PROC_MEM_FORCE_NEVER }, { } }; static int __init early_proc_mem_force_override(char *buf) { if (!buf) return -EINVAL; /* * lookup_constant() defaults to proc_mem_force_override to preseve * the initial Kconfig choice in case an invalid param gets passed. 
*/ proc_mem_force_override = lookup_constant(proc_mem_force_table, buf, proc_mem_force_override); return 0; } early_param("proc_mem.force_override", early_proc_mem_force_override); struct pid_entry { const char *name; unsigned int len; umode_t mode; const struct inode_operations *iop; const struct file_operations *fop; union proc_op op; }; #define NOD(NAME, MODE, IOP, FOP, OP) { \ .name = (NAME), \ .len = sizeof(NAME) - 1, \ .mode = MODE, \ .iop = IOP, \ .fop = FOP, \ .op = OP, \ } #define DIR(NAME, MODE, iops, fops) \ NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} ) #define LNK(NAME, get_link) \ NOD(NAME, (S_IFLNK|S_IRWXUGO), \ &proc_pid_link_inode_operations, NULL, \ { .proc_get_link = get_link } ) #define REG(NAME, MODE, fops) \ NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) #define ONE(NAME, MODE, show) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_single_file_operations, \ { .proc_show = show } ) #define ATTR(LSMID, NAME, MODE) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_pid_attr_operations, \ { .lsmid = LSMID }) /* * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. */ static unsigned int __init pid_entry_nlink(const struct pid_entry *entries, unsigned int n) { unsigned int i; unsigned int count; count = 2; for (i = 0; i < n; ++i) { if (S_ISDIR(entries[i].mode)) ++count; } return count; } static int get_task_root(struct task_struct *task, struct path *root) { int result = -ENOENT; task_lock(task); if (task->fs) { get_fs_root(task->fs, root); result = 0; } task_unlock(task); return result; } static int proc_cwd_link(struct dentry *dentry, struct path *path) { struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; if (task) { task_lock(task); if (task->fs) { get_fs_pwd(task->fs, path); result = 0; } task_unlock(task); put_task_struct(task); } return result; } static int proc_root_link(struct dentry *dentry, struct path *path) { struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; if (task) { result = get_task_root(task, path); put_task_struct(task); } return result; } /* * If the user used setproctitle(), we just get the string from * user space at arg_start, and limit it to a maximum of one page. */ static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf, size_t count, unsigned long pos, unsigned long arg_start) { char *page; int ret, got; if (pos >= PAGE_SIZE) return 0; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; ret = 0; got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON); if (got > 0) { int len = strnlen(page, got); /* Include the NUL character if it was found */ if (len < got) len++; if (len > pos) { len -= pos; if (len > count) len = count; len -= copy_to_user(buf, page+pos, len); if (!len) len = -EFAULT; ret = len; } } free_page((unsigned long)page); return ret; } static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, size_t count, loff_t *ppos) { unsigned long arg_start, arg_end, env_start, env_end; unsigned long pos, len; char *page, c; /* Check if process spawned far enough to have cmdline. */ if (!mm->env_end) return 0; spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; spin_unlock(&mm->arg_lock); if (arg_start >= arg_end) return 0; /* * We allow setproctitle() to overwrite the argument * strings, and overflow past the original end. But * only when it overflows into the environment area. 
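 *
 * Rough sketch of the layout assumed here (addresses purely illustrative):
 *
 *   arg_start               arg_end == env_start                 env_end
 *      |  argv[] strings ...     |  environment strings ...         |
 *
 * When the environment block directly follows the arguments
 * (env_start == arg_end), the readable window computed below is
 * env_end - arg_start and spans both ranges; otherwise env_start and
 * env_end are collapsed to arg_end and it degrades to the original
 * [arg_start, arg_end) range.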
*/ if (env_start != arg_end || env_end < env_start) env_start = env_end = arg_end; len = env_end - arg_start; /* We're not going to care if "*ppos" has high bits set */ pos = *ppos; if (pos >= len) return 0; if (count > len - pos) count = len - pos; if (!count) return 0; /* * Magical special case: if the argv[] end byte is not * zero, the user has overwritten it with setproctitle(3). * * Possible future enhancement: do this only once when * pos is 0, and set a flag in the 'struct file'. */ if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c) return get_mm_proctitle(mm, buf, count, pos, arg_start); /* * For the non-setproctitle() case we limit things strictly * to the [arg_start, arg_end[ range. */ pos += arg_start; if (pos < arg_start || pos >= arg_end) return 0; if (count > arg_end - pos) count = arg_end - pos; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; len = 0; while (count) { int got; size_t size = min_t(size_t, PAGE_SIZE, count); got = access_remote_vm(mm, pos, page, size, FOLL_ANON); if (got <= 0) break; got -= copy_to_user(buf, page, got); if (unlikely(!got)) { if (!len) len = -EFAULT; break; } pos += got; buf += got; len += got; count -= got; } free_page((unsigned long)page); return len; } static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf, size_t count, loff_t *pos) { struct mm_struct *mm; ssize_t ret; mm = get_task_mm(tsk); if (!mm) return 0; ret = get_mm_cmdline(mm, buf, count, pos); mmput(mm); return ret; } static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { struct task_struct *tsk; ssize_t ret; BUG_ON(*pos < 0); tsk = get_proc_task(file_inode(file)); if (!tsk) return -ESRCH; ret = get_task_cmdline(tsk, buf, count, pos); put_task_struct(tsk); if (ret > 0) *pos += ret; return ret; } static const struct file_operations proc_pid_cmdline_ops = { .read = proc_pid_cmdline_read, .llseek = generic_file_llseek, }; #ifdef CONFIG_KALLSYMS /* * Provides a wchan file via kallsyms in a proper one-value-per-file format. * Returns the resolved symbol to user space. */ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long wchan; char symname[KSYM_NAME_LEN]; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto print0; wchan = get_wchan(task); if (wchan && !lookup_symbol_name(wchan, symname)) { seq_puts(m, symname); return 0; } print0: seq_putc(m, '0'); return 0; } #endif /* CONFIG_KALLSYMS */ static int lock_trace(struct task_struct *task) { int err = down_read_killable(&task->signal->exec_update_lock); if (err) return err; if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { up_read(&task->signal->exec_update_lock); return -EPERM; } return 0; } static void unlock_trace(struct task_struct *task) { up_read(&task->signal->exec_update_lock); } #ifdef CONFIG_STACKTRACE #define MAX_STACK_TRACE_DEPTH 64 static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long *entries; int err; /* * The ability to racily run the kernel stack unwinder on a running task * and then observe the unwinder output is scary; while it is useful for * debugging kernel issues, it can also allow an attacker to leak kernel * stack contents. * Doing this in a manner that is at least safe from races would require * some work to ensure that the remote task can not be scheduled; and * even then, this would still expose the unwinder as local attack * surface. 
* Therefore, this interface is restricted to root. */ if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) return -EACCES; entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries), GFP_KERNEL); if (!entries) return -ENOMEM; err = lock_trace(task); if (!err) { unsigned int i, nr_entries; nr_entries = stack_trace_save_tsk(task, entries, MAX_STACK_TRACE_DEPTH, 0); for (i = 0; i < nr_entries; i++) { seq_printf(m, "[<0>] %pB\n", (void *)entries[i]); } unlock_trace(task); } kfree(entries); return err; } #endif #ifdef CONFIG_SCHED_INFO /* * Provides /proc/PID/schedstat */ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { if (unlikely(!sched_info_on())) seq_puts(m, "0 0 0\n"); else seq_printf(m, "%llu %llu %lu\n", (unsigned long long)task->se.sum_exec_runtime, (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); return 0; } #endif #ifdef CONFIG_LATENCYTOP static int lstats_show_proc(struct seq_file *m, void *v) { int i; struct inode *inode = m->private; struct task_struct *task = get_proc_task(inode); if (!task) return -ESRCH; seq_puts(m, "Latency Top version : v0.1\n"); for (i = 0; i < LT_SAVECOUNT; i++) { struct latency_record *lr = &task->latency_record[i]; if (lr->backtrace[0]) { int q; seq_printf(m, "%i %li %li", lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long bt = lr->backtrace[q]; if (!bt) break; seq_printf(m, " %ps", (void *)bt); } seq_putc(m, '\n'); } } put_task_struct(task); return 0; } static int lstats_open(struct inode *inode, struct file *file) { return single_open(file, lstats_show_proc, inode); } static ssize_t lstats_write(struct file *file, const char __user *buf, size_t count, loff_t *offs) { struct task_struct *task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; clear_tsk_latency_tracing(task); put_task_struct(task); return count; } static const struct file_operations proc_lstats_operations = { .open = lstats_open, .read = seq_read, .write = lstats_write, .llseek = seq_lseek, .release = single_release, }; #endif static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; long badness; badness = oom_badness(task, totalpages); /* * Special case OOM_SCORE_ADJ_MIN for all others scale the * badness value into [0, 2000] range which we have been * exporting for a long time so userspace might depend on it. 
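 *
 * Worked example of the scaling below (integer arithmetic, values
 * illustrative): badness == 0 gives (1000 + 0) * 2 / 3 = 666,
 * badness == -totalpages gives (1000 - 1000) * 2 / 3 = 0 and
 * badness == 2 * totalpages gives (1000 + 2000) * 2 / 3 = 2000, so
 * roughly the full range of oom_badness() maps onto the exported
 * [0, 2000] interval.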
*/ if (badness != LONG_MIN) points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3; seq_printf(m, "%lu\n", points); return 0; } struct limit_names { const char *name; const char *unit; }; static const struct limit_names lnames[RLIM_NLIMITS] = { [RLIMIT_CPU] = {"Max cpu time", "seconds"}, [RLIMIT_FSIZE] = {"Max file size", "bytes"}, [RLIMIT_DATA] = {"Max data size", "bytes"}, [RLIMIT_STACK] = {"Max stack size", "bytes"}, [RLIMIT_CORE] = {"Max core file size", "bytes"}, [RLIMIT_RSS] = {"Max resident set", "bytes"}, [RLIMIT_NPROC] = {"Max processes", "processes"}, [RLIMIT_NOFILE] = {"Max open files", "files"}, [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"}, [RLIMIT_AS] = {"Max address space", "bytes"}, [RLIMIT_LOCKS] = {"Max file locks", "locks"}, [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"}, [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"}, [RLIMIT_NICE] = {"Max nice priority", NULL}, [RLIMIT_RTPRIO] = {"Max realtime priority", NULL}, [RLIMIT_RTTIME] = {"Max realtime timeout", "us"}, }; /* Display limits for a process */ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned int i; unsigned long flags; struct rlimit rlim[RLIM_NLIMITS]; if (!lock_task_sighand(task, &flags)) return 0; memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS); unlock_task_sighand(task, &flags); /* * print the file header */ seq_puts(m, "Limit " "Soft Limit " "Hard Limit " "Units \n"); for (i = 0; i < RLIM_NLIMITS; i++) { if (rlim[i].rlim_cur == RLIM_INFINITY) seq_printf(m, "%-25s %-20s ", lnames[i].name, "unlimited"); else seq_printf(m, "%-25s %-20lu ", lnames[i].name, rlim[i].rlim_cur); if (rlim[i].rlim_max == RLIM_INFINITY) seq_printf(m, "%-20s ", "unlimited"); else seq_printf(m, "%-20lu ", rlim[i].rlim_max); if (lnames[i].unit) seq_printf(m, "%-10s\n", lnames[i].unit); else seq_putc(m, '\n'); } return 0; } #ifdef CONFIG_HAVE_ARCH_TRACEHOOK static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct syscall_info info; u64 *args = &info.data.args[0]; int res; res = lock_trace(task); if (res) return res; if (task_current_syscall(task, &info)) seq_puts(m, "running\n"); else if (info.data.nr < 0) seq_printf(m, "%d 0x%llx 0x%llx\n", info.data.nr, info.sp, info.data.instruction_pointer); else seq_printf(m, "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n", info.data.nr, args[0], args[1], args[2], args[3], args[4], args[5], info.sp, info.data.instruction_pointer); unlock_trace(task); return 0; } #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ /************************************************************************/ /* Here the fs part begins */ /************************************************************************/ /* permission checks */ static bool proc_fd_access_allowed(struct inode *inode) { struct task_struct *task; bool allowed = false; /* Allow access to a task's file descriptors if it is us or we * may use ptrace attach to the process and find out that * information. 
*/ task = get_proc_task(inode); if (task) { allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); put_task_struct(task); } return allowed; } int proc_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int error; struct inode *inode = d_inode(dentry); if (attr->ia_valid & ATTR_MODE) return -EPERM; error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; setattr_copy(&nop_mnt_idmap, inode, attr); return 0; } /* * May current process learn task's sched/cmdline info (for hide_pid_min=1) * or euid/egid (for hide_pid_min=2)? */ static bool has_pid_permissions(struct proc_fs_info *fs_info, struct task_struct *task, enum proc_hidepid hide_pid_min) { /* * If 'hidpid' mount option is set force a ptrace check, * we indicate that we are using a filesystem syscall * by passing PTRACE_MODE_READ_FSCREDS */ if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); if (fs_info->hide_pid < hide_pid_min) return true; if (in_group_p(fs_info->pid_gid)) return true; return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); } static int proc_pid_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; bool has_perms; task = get_proc_task(inode); if (!task) return -ESRCH; has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS); put_task_struct(task); if (!has_perms) { if (fs_info->hide_pid == HIDEPID_INVISIBLE) { /* * Let's make getdents(), stat(), and open() * consistent with each other. If a process * may not stat() a file, it shouldn't be seen * in procfs at all. */ return -ENOENT; } return -EPERM; } return generic_permission(&nop_mnt_idmap, inode, mask); } static const struct inode_operations proc_def_inode_operations = { .setattr = proc_setattr, }; static int proc_single_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct pid_namespace *ns = proc_pid_ns(inode->i_sb); struct pid *pid = proc_pid(inode); struct task_struct *task; int ret; task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; ret = PROC_I(inode)->op.proc_show(m, ns, pid, task); put_task_struct(task); return ret; } static int proc_single_open(struct inode *inode, struct file *filp) { return single_open(filp, proc_single_show, inode); } static const struct file_operations proc_single_file_operations = { .open = proc_single_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; /* * proc_mem_open() can return errno, NULL or mm_struct*. * * - Returns NULL if the task has no mm (PF_KTHREAD or PF_EXITING) * - Returns mm_struct* on success * - Returns error code on failure */ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) { struct task_struct *task = get_proc_task(inode); struct mm_struct *mm; if (!task) return ERR_PTR(-ESRCH); mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); put_task_struct(task); if (IS_ERR(mm)) return mm == ERR_PTR(-ESRCH) ? NULL : mm; /* ensure this mm_struct can't be freed */ mmgrab(mm); /* but do not pin its memory */ mmput(mm); return mm; } static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) { struct mm_struct *mm = proc_mem_open(inode, mode); if (IS_ERR_OR_NULL(mm)) return mm ? 
PTR_ERR(mm) : -ESRCH; file->private_data = mm; return 0; } static int mem_open(struct inode *inode, struct file *file) { if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET))) return -EINVAL; return __mem_open(inode, file, PTRACE_MODE_ATTACH); } static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm) { struct task_struct *task; bool ptrace_active = false; switch (proc_mem_force_override) { case PROC_MEM_FORCE_NEVER: return false; case PROC_MEM_FORCE_PTRACE: task = get_proc_task(file_inode(file)); if (task) { ptrace_active = READ_ONCE(task->ptrace) && READ_ONCE(task->mm) == mm && READ_ONCE(task->parent) == current; put_task_struct(task); } return ptrace_active; default: return true; } } static ssize_t mem_rw(struct file *file, char __user *buf, size_t count, loff_t *ppos, int write) { struct mm_struct *mm = file->private_data; unsigned long addr = *ppos; ssize_t copied; char *page; unsigned int flags; if (!mm) return 0; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; copied = 0; if (!mmget_not_zero(mm)) goto free; flags = write ? FOLL_WRITE : 0; if (proc_mem_foll_force(file, mm)) flags |= FOLL_FORCE; while (count > 0) { size_t this_len = min_t(size_t, count, PAGE_SIZE); if (write && copy_from_user(page, buf, this_len)) { copied = -EFAULT; break; } this_len = access_remote_vm(mm, addr, page, this_len, flags); if (!this_len) { if (!copied) copied = -EIO; break; } if (!write && copy_to_user(buf, page, this_len)) { copied = -EFAULT; break; } buf += this_len; addr += this_len; copied += this_len; count -= this_len; } *ppos = addr; mmput(mm); free: free_page((unsigned long) page); return copied; } static ssize_t mem_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { return mem_rw(file, buf, count, ppos, 0); } static ssize_t mem_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { return mem_rw(file, (char __user*)buf, count, ppos, 1); } loff_t mem_lseek(struct file *file, loff_t offset, int orig) { switch (orig) { case 0: file->f_pos = offset; break; case 1: file->f_pos += offset; break; default: return -EINVAL; } force_successful_syscall_return(); return file->f_pos; } static int mem_release(struct inode *inode, struct file *file) { struct mm_struct *mm = file->private_data; if (mm) mmdrop(mm); return 0; } static const struct file_operations proc_mem_operations = { .llseek = mem_lseek, .read = mem_read, .write = mem_write, .open = mem_open, .release = mem_release, .fop_flags = FOP_UNSIGNED_OFFSET, }; static int environ_open(struct inode *inode, struct file *file) { return __mem_open(inode, file, PTRACE_MODE_READ); } static ssize_t environ_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { char *page; unsigned long src = *ppos; int ret = 0; struct mm_struct *mm = file->private_data; unsigned long env_start, env_end; /* Ensure the process spawned far enough to have an environment. 
*/ if (!mm || !mm->env_end) return 0; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; ret = 0; if (!mmget_not_zero(mm)) goto free; spin_lock(&mm->arg_lock); env_start = mm->env_start; env_end = mm->env_end; spin_unlock(&mm->arg_lock); while (count > 0) { size_t this_len, max_len; int retval; if (src >= (env_end - env_start)) break; this_len = env_end - (env_start + src); max_len = min_t(size_t, PAGE_SIZE, count); this_len = min(max_len, this_len); retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON); if (retval <= 0) { ret = retval; break; } if (copy_to_user(buf, page, retval)) { ret = -EFAULT; break; } ret += retval; src += retval; buf += retval; count -= retval; } *ppos = src; mmput(mm); free: free_page((unsigned long) page); return ret; } static const struct file_operations proc_environ_operations = { .open = environ_open, .read = environ_read, .llseek = generic_file_llseek, .release = mem_release, }; static int auxv_open(struct inode *inode, struct file *file) { return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); } static ssize_t auxv_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct mm_struct *mm = file->private_data; unsigned int nwords = 0; if (!mm) return 0; do { nwords += 2; } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0])); } static const struct file_operations proc_auxv_operations = { .open = auxv_open, .read = auxv_read, .llseek = generic_file_llseek, .release = mem_release, }; static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; int oom_adj = OOM_ADJUST_MIN; size_t len; if (!task) return -ESRCH; if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) oom_adj = OOM_ADJUST_MAX; else oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / OOM_SCORE_ADJ_MAX; put_task_struct(task); if (oom_adj > OOM_ADJUST_MAX) oom_adj = OOM_ADJUST_MAX; len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) { struct mm_struct *mm = NULL; struct task_struct *task; int err = 0; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; mutex_lock(&oom_adj_mutex); if (legacy) { if (oom_adj < task->signal->oom_score_adj && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_unlock; } /* * /proc/pid/oom_adj is provided for legacy purposes, ask users to use * /proc/pid/oom_score_adj instead. */ pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", current->comm, task_pid_nr(current), task_pid_nr(task), task_pid_nr(task)); } else { if ((short)oom_adj < task->signal->oom_score_adj_min && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_unlock; } } /* * Make sure we will check other processes sharing the mm if this is * not vfrok which wants its own oom_score_adj. 
* pin the mm so it doesn't go away and get reused after task_unlock */ if (!task->vfork_done) { struct task_struct *p = find_lock_task_mm(task); if (p) { if (mm_flags_test(MMF_MULTIPROCESS, p->mm)) { mm = p->mm; mmgrab(mm); } task_unlock(p); } } task->signal->oom_score_adj = oom_adj; if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = (short)oom_adj; trace_oom_score_adj_update(task); if (mm) { struct task_struct *p; rcu_read_lock(); for_each_process(p) { if (same_thread_group(task, p)) continue; /* do not touch kernel threads or the global init */ if (p->flags & PF_KTHREAD || is_global_init(p)) continue; task_lock(p); if (!p->vfork_done && process_shares_mm(p, mm)) { p->signal->oom_score_adj = oom_adj; if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) p->signal->oom_score_adj_min = (short)oom_adj; } task_unlock(p); } rcu_read_unlock(); mmdrop(mm); } err_unlock: mutex_unlock(&oom_adj_mutex); put_task_struct(task); return err; } /* * /proc/pid/oom_adj exists solely for backwards compatibility with previous * kernels. The effective policy is defined by oom_score_adj, which has a * different scale: oom_adj grew exponentially and oom_score_adj grows linearly. * Values written to oom_adj are simply mapped linearly to oom_score_adj. * Processes that become oom disabled via oom_adj will still be oom disabled * with this implementation. * * oom_adj cannot be removed since existing userspace binaries use it. */ static ssize_t oom_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char buffer[PROC_NUMBUF] = {}; int oom_adj; int err; if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) { err = -EFAULT; goto out; } err = kstrtoint(strstrip(buffer), 0, &oom_adj); if (err) goto out; if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) && oom_adj != OOM_DISABLE) { err = -EINVAL; goto out; } /* * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum * value is always attainable. */ if (oom_adj == OOM_ADJUST_MAX) oom_adj = OOM_SCORE_ADJ_MAX; else oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; err = __set_oom_adj(file, oom_adj, true); out: return err < 0 ? err : count; } static const struct file_operations proc_oom_adj_operations = { .read = oom_adj_read, .write = oom_adj_write, .llseek = generic_file_llseek, }; static ssize_t oom_score_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; short oom_score_adj = OOM_SCORE_ADJ_MIN; size_t len; if (!task) return -ESRCH; oom_score_adj = task->signal->oom_score_adj; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char buffer[PROC_NUMBUF] = {}; int oom_score_adj; int err; if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) { err = -EFAULT; goto out; } err = kstrtoint(strstrip(buffer), 0, &oom_score_adj); if (err) goto out; if (oom_score_adj < OOM_SCORE_ADJ_MIN || oom_score_adj > OOM_SCORE_ADJ_MAX) { err = -EINVAL; goto out; } err = __set_oom_adj(file, oom_score_adj, false); out: return err < 0 ? 
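/*
 * Illustrative numbers for the legacy scaling done in oom_adj_write()
 * above (not part of this file): oom_adj is mapped linearly as
 * oom_adj * OOM_SCORE_ADJ_MAX / -OOM_DISABLE, i.e. oom_adj * 1000 / 17.
 * Writing 8 to /proc/<pid>/oom_adj therefore stores an oom_score_adj of
 * 8000 / 17 = 470, writing -17 (OOM_DISABLE) stores -1000, and the
 * maximum value 15 (OOM_ADJUST_MAX) is special-cased straight to 1000.
 */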
err : count; } static const struct file_operations proc_oom_score_adj_operations = { .read = oom_score_adj_read, .write = oom_score_adj_write, .llseek = default_llseek, }; #ifdef CONFIG_AUDIT #define TMPBUFLEN 11 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; if (!task) return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", from_kuid(file->f_cred->user_ns, audit_get_loginuid(task))); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); uid_t loginuid; kuid_t kloginuid; int rv; /* Don't let kthreads write their own loginuid */ if (current->flags & PF_KTHREAD) return -EPERM; rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { rcu_read_unlock(); return -EPERM; } rcu_read_unlock(); if (*ppos != 0) { /* No partial writes. */ return -EINVAL; } rv = kstrtou32_from_user(buf, count, 10, &loginuid); if (rv < 0) return rv; /* is userspace tring to explicitly UNSET the loginuid? */ if (loginuid == AUDIT_UID_UNSET) { kloginuid = INVALID_UID; } else { kloginuid = make_kuid(file->f_cred->user_ns, loginuid); if (!uid_valid(kloginuid)) return -EINVAL; } rv = audit_set_loginuid(kloginuid); if (rv < 0) return rv; return count; } static const struct file_operations proc_loginuid_operations = { .read = proc_loginuid_read, .write = proc_loginuid_write, .llseek = generic_file_llseek, }; static ssize_t proc_sessionid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; if (!task) return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", audit_get_sessionid(task)); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } static const struct file_operations proc_sessionid_operations = { .read = proc_sessionid_read, .llseek = generic_file_llseek, }; #endif #ifdef CONFIG_FAULT_INJECTION static ssize_t proc_fault_inject_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; size_t len; int make_it_fail; if (!task) return -ESRCH; make_it_fail = task->make_it_fail; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail); return simple_read_from_buffer(buf, count, ppos, buffer, len); } static ssize_t proc_fault_inject_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct task_struct *task; char buffer[PROC_NUMBUF] = {}; int make_it_fail; int rv; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; rv = kstrtoint(strstrip(buffer), 0, &make_it_fail); if (rv < 0) return rv; if (make_it_fail < 0 || make_it_fail > 1) return -EINVAL; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; task->make_it_fail = make_it_fail; put_task_struct(task); return count; } static const struct file_operations proc_fault_inject_operations = { .read = proc_fault_inject_read, .write = proc_fault_inject_write, .llseek = generic_file_llseek, }; static ssize_t proc_fail_nth_write(struct 
file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; int err; unsigned int n; err = kstrtouint_from_user(buf, count, 0, &n); if (err) return err; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; task->fail_nth = n; put_task_struct(task); return count; } static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; char numbuf[PROC_NUMBUF]; ssize_t len; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, numbuf, len); } static const struct file_operations proc_fail_nth_operations = { .read = proc_fail_nth_read, .write = proc_fail_nth_write, }; #endif /* * Print out various scheduling related per-task fields: */ static int sched_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct pid_namespace *ns = proc_pid_ns(inode->i_sb); struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_sched_show_task(p, ns, m); put_task_struct(p); return 0; } static ssize_t sched_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_sched_set_task(p); put_task_struct(p); return count; } static int sched_open(struct inode *inode, struct file *filp) { return single_open(filp, sched_show, inode); } static const struct file_operations proc_pid_sched_operations = { .open = sched_open, .read = seq_read, .write = sched_write, .llseek = seq_lseek, .release = single_release, }; #ifdef CONFIG_SCHED_AUTOGROUP /* * Print out autogroup related information: */ static int sched_autogroup_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_sched_autogroup_show_task(p, m); put_task_struct(p); return 0; } static ssize_t sched_autogroup_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; char buffer[PROC_NUMBUF] = {}; int nice; int err; if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; err = kstrtoint(strstrip(buffer), 0, &nice); if (err < 0) return err; p = get_proc_task(inode); if (!p) return -ESRCH; err = proc_sched_autogroup_set_nice(p, nice); if (err) count = err; put_task_struct(p); return count; } static int sched_autogroup_open(struct inode *inode, struct file *filp) { int ret; ret = single_open(filp, sched_autogroup_show, NULL); if (!ret) { struct seq_file *m = filp->private_data; m->private = inode; } return ret; } static const struct file_operations proc_pid_sched_autogroup_operations = { .open = sched_autogroup_open, .read = seq_read, .write = sched_autogroup_write, .llseek = seq_lseek, .release = single_release, }; #endif /* CONFIG_SCHED_AUTOGROUP */ #ifdef CONFIG_TIME_NS static int timens_offsets_show(struct seq_file *m, void *v) { struct task_struct *p; p = get_proc_task(file_inode(m->file)); if (!p) return -ESRCH; proc_timens_show_offsets(p, m); put_task_struct(p); return 0; } static ssize_t timens_offsets_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct inode *inode = file_inode(file); struct proc_timens_offset offsets[2]; char *kbuf = NULL, *pos, *next_line; struct task_struct 
*p; int ret, noffsets; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* Parse the user data */ ret = -EINVAL; noffsets = 0; for (pos = kbuf; pos; pos = next_line) { struct proc_timens_offset *off = &offsets[noffsets]; char clock[10]; int err; /* Find the end of line and ensure we don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } err = sscanf(pos, "%9s %lld %lu", clock, &off->val.tv_sec, &off->val.tv_nsec); if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC) goto out; clock[sizeof(clock) - 1] = 0; if (strcmp(clock, "monotonic") == 0 || strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0) off->clockid = CLOCK_MONOTONIC; else if (strcmp(clock, "boottime") == 0 || strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0) off->clockid = CLOCK_BOOTTIME; else goto out; noffsets++; if (noffsets == ARRAY_SIZE(offsets)) { if (next_line) count = next_line - kbuf; break; } } ret = -ESRCH; p = get_proc_task(inode); if (!p) goto out; ret = proc_timens_set_offset(file, p, offsets, noffsets); put_task_struct(p); if (ret) goto out; ret = count; out: kfree(kbuf); return ret; } static int timens_offsets_open(struct inode *inode, struct file *filp) { return single_open(filp, timens_offsets_show, inode); } static const struct file_operations proc_timens_offsets_operations = { .open = timens_offsets_open, .read = seq_read, .write = timens_offsets_write, .llseek = seq_lseek, .release = single_release, }; #endif /* CONFIG_TIME_NS */ static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; char buffer[TASK_COMM_LEN] = {}; const size_t maxlen = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count)) return -EFAULT; p = get_proc_task(inode); if (!p) return -ESRCH; if (same_thread_group(current, p)) { set_task_comm(p, buffer); proc_comm_connector(p); } else count = -EINVAL; put_task_struct(p); return count; } static int comm_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_task_name(m, p, false); seq_putc(m, '\n'); put_task_struct(p); return 0; } static int comm_open(struct inode *inode, struct file *filp) { return single_open(filp, comm_show, inode); } static const struct file_operations proc_pid_set_comm_operations = { .open = comm_open, .read = seq_read, .write = comm_write, .llseek = seq_lseek, .release = single_release, }; static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; struct file *exe_file; task = get_proc_task(d_inode(dentry)); if (!task) return -ENOENT; exe_file = get_task_exe_file(task); put_task_struct(task); if (exe_file) { *exe_path = exe_file->f_path; path_get(&exe_file->f_path); fput(exe_file); return 0; } else return -ENOENT; } static const char *proc_pid_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct path path; int error = -EACCES; if (!dentry) return ERR_PTR(-ECHILD); /* Are we allowed to snoop on the tasks file descriptors? 
*/ if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(dentry, &path); if (error) goto out; error = nd_jump_link(&path); out: return ERR_PTR(error); } static int do_proc_readlink(const struct path *path, char __user *buffer, int buflen) { char *tmp = kmalloc(PATH_MAX, GFP_KERNEL); char *pathname; int len; if (!tmp) return -ENOMEM; pathname = d_path(path, tmp, PATH_MAX); len = PTR_ERR(pathname); if (IS_ERR(pathname)) goto out; len = tmp + PATH_MAX - 1 - pathname; if (len > buflen) len = buflen; if (copy_to_user(buffer, pathname, len)) len = -EFAULT; out: kfree(tmp); return len; } static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) { int error = -EACCES; struct inode *inode = d_inode(dentry); struct path path; /* Are we allowed to snoop on the tasks file descriptors? */ if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(dentry, &path); if (error) goto out; error = do_proc_readlink(&path, buffer, buflen); path_put(&path); out: return error; } const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, .get_link = proc_pid_get_link, .setattr = proc_setattr, }; /* building an inode */ void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid) { /* Depending on the state of dumpable compute who should own a * proc file for a task. */ const struct cred *cred; kuid_t uid; kgid_t gid; if (unlikely(task->flags & PF_KTHREAD)) { *ruid = GLOBAL_ROOT_UID; *rgid = GLOBAL_ROOT_GID; return; } /* Default to the tasks effective ownership */ rcu_read_lock(); cred = __task_cred(task); uid = cred->euid; gid = cred->egid; rcu_read_unlock(); /* * Before the /proc/pid/status file was created the only way to read * the effective uid of a /process was to stat /proc/pid. Reading * /proc/pid/status is slow enough that procps and other packages * kept stating /proc/pid. To keep the rules in /proc simple I have * made this apply to all per process world readable and executable * directories. */ if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) { struct mm_struct *mm; task_lock(task); mm = task->mm; /* Make non-dumpable tasks owned by some root */ if (mm) { if (get_dumpable(mm) != SUID_DUMP_USER) { struct user_namespace *user_ns = mm->user_ns; uid = make_kuid(user_ns, 0); if (!uid_valid(uid)) uid = GLOBAL_ROOT_UID; gid = make_kgid(user_ns, 0); if (!gid_valid(gid)) gid = GLOBAL_ROOT_GID; } } else { uid = GLOBAL_ROOT_UID; gid = GLOBAL_ROOT_GID; } task_unlock(task); } *ruid = uid; *rgid = gid; } void proc_pid_evict_inode(struct proc_inode *ei) { struct pid *pid = ei->pid; if (S_ISDIR(ei->vfs_inode.i_mode)) { spin_lock(&pid->lock); hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(&pid->lock); } } struct inode *proc_pid_make_inode(struct super_block *sb, struct task_struct *task, umode_t mode) { struct inode * inode; struct proc_inode *ei; struct pid *pid; /* We need a new inode */ inode = new_inode(sb); if (!inode) goto out; /* Common stuff */ ei = PROC_I(inode); inode->i_mode = mode; inode->i_ino = get_next_ino(); simple_inode_init_ts(inode); inode->i_op = &proc_def_inode_operations; /* * grab the reference to task. 
*/ pid = get_task_pid(task, PIDTYPE_PID); if (!pid) goto out_unlock; /* Let the pid remember us for quick removal */ ei->pid = pid; task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); out: return inode; out_unlock: iput(inode); return NULL; } /* * Generating an inode and adding it into @pid->inodes, so that task will * invalidate inode's dentry before being released. * * This helper is used for creating dir-type entries under '/proc' and * '/proc/<tgid>/task'. Other entries(eg. fd, stat) under '/proc/<tgid>' * can be released by invalidating '/proc/<tgid>' dentry. * In theory, dentries under '/proc/<tgid>/task' can also be released by * invalidating '/proc/<tgid>' dentry, we reserve it to handle single * thread exiting situation: Any one of threads should invalidate its * '/proc/<tgid>/task/<pid>' dentry before released. */ static struct inode *proc_pid_make_base_inode(struct super_block *sb, struct task_struct *task, umode_t mode) { struct inode *inode; struct proc_inode *ei; struct pid *pid; inode = proc_pid_make_inode(sb, task, mode); if (!inode) return NULL; /* Let proc_flush_pid find this directory inode */ ei = PROC_I(inode); pid = ei->pid; spin_lock(&pid->lock); hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes); spin_unlock(&pid->lock); return inode; } int pid_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) { rcu_read_unlock(); /* * This doesn't prevent learning whether PID exists, * it only makes getattr() consistent with readdir(). */ return -ENOENT; } task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid); } rcu_read_unlock(); return 0; } /* dentry stuff */ /* * Set <pid>/... inode ownership (can change due to setuid(), etc.) */ void pid_update_inode(struct task_struct *task, struct inode *inode) { task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid); inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); } /* * Rewrite the inode's ownerships here because the owning task may have * performed a setuid(), etc. * */ static int pid_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; int ret = 0; rcu_read_lock(); inode = d_inode_rcu(dentry); if (!inode) goto out; task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { pid_update_inode(task, inode); ret = 1; } out: rcu_read_unlock(); return ret; } static inline bool proc_inode_is_dead(struct inode *inode) { return !proc_pid(inode)->tasks[PIDTYPE_PID].first; } int pid_delete_dentry(const struct dentry *dentry) { /* Is the task we represent dead? * If so, then don't put the dentry on the lru list, * kill it immediately. */ return proc_inode_is_dead(d_inode(dentry)); } const struct dentry_operations pid_dentry_operations = { .d_revalidate = pid_revalidate, .d_delete = pid_delete_dentry, }; /* Lookups */ /* * Fill a directory entry. * * If possible create the dcache entry and derive our inode number and * file type from dcache entry. 
* * Since all of the proc inode numbers are dynamically generated, the inode * numbers do not exist until the inode is cache. This means creating * the dcache entry in readdir is necessary to keep the inode numbers * reported by readdir in sync with the inode numbers reported * by stat. */ bool proc_fill_cache(struct file *file, struct dir_context *ctx, const char *name, unsigned int len, instantiate_t instantiate, struct task_struct *task, const void *ptr) { struct dentry *child, *dir = file->f_path.dentry; struct qstr qname = QSTR_INIT(name, len); struct inode *inode; unsigned type = DT_UNKNOWN; ino_t ino = 1; child = try_lookup_noperm(&qname, dir); if (!child) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); if (IS_ERR(child)) goto end_instantiate; if (d_in_lookup(child)) { struct dentry *res; res = instantiate(child, task, ptr); d_lookup_done(child); if (unlikely(res)) { dput(child); child = res; if (IS_ERR(child)) goto end_instantiate; } } } inode = d_inode(child); ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); end_instantiate: return dir_emit(ctx, name, len, ino, type); } /* * dname_to_vma_addr - maps a dentry name into two unsigned longs * which represent vma start and end addresses. */ static int dname_to_vma_addr(struct dentry *dentry, unsigned long *start, unsigned long *end) { const char *str = dentry->d_name.name; unsigned long long sval, eval; unsigned int len; if (str[0] == '0' && str[1] != '-') return -EINVAL; len = _parse_integer(str, 16, &sval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; if (sval != (unsigned long)sval) return -EINVAL; str += len; if (*str != '-') return -EINVAL; str++; if (str[0] == '0' && str[1]) return -EINVAL; len = _parse_integer(str, 16, &eval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; if (eval != (unsigned long)eval) return -EINVAL; str += len; if (*str != '\0') return -EINVAL; *start = sval; *end = eval; return 0; } static int map_files_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; bool exact_vma_exists = false; struct mm_struct *mm = NULL; struct task_struct *task; struct inode *inode; int status = 0; if (flags & LOOKUP_RCU) return -ECHILD; inode = d_inode(dentry); task = get_proc_task(inode); if (!task) goto out_notask; mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR(mm)) goto out; if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { status = mmap_read_lock_killable(mm); if (!status) { exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); mmap_read_unlock(mm); } } mmput(mm); if (exact_vma_exists) { task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); status = 1; } out: put_task_struct(task); out_notask: return status; } static const struct dentry_operations tid_map_files_dentry_operations = { .d_revalidate = map_files_d_revalidate, .d_delete = pid_delete_dentry, }; static int map_files_get_link(struct dentry *dentry, struct path *path) { unsigned long vm_start, vm_end; struct vm_area_struct *vma; struct task_struct *task; struct mm_struct *mm; int rc; rc = -ENOENT; task = get_proc_task(d_inode(dentry)); if (!task) goto out; mm = get_task_mm(task); put_task_struct(task); if (!mm) goto out; rc = dname_to_vma_addr(dentry, &vm_start, &vm_end); if (rc) goto out_mmput; rc = mmap_read_lock_killable(mm); if (rc) goto out_mmput; rc = -ENOENT; vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { *path = *file_user_path(vma->vm_file); 
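/*
 * Naming convention resolved here, sketched with an illustrative address
 * range: each /proc/<pid>/map_files entry is called "<vm_start>-<vm_end>"
 * in lower-case hex without a 0x prefix, e.g. "7f1c2d400000-7f1c2d421000",
 * which is exactly what dname_to_vma_addr() above parses back into the
 * vm_start/vm_end pair used for the find_exact_vma() lookup.
 */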
path_get(path); rc = 0; } mmap_read_unlock(mm); out_mmput: mmput(mm); out: return rc; } struct map_files_info { unsigned long start; unsigned long end; fmode_t mode; }; /* * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links, due * to concerns about how the symlinks may be used to bypass permissions on * ancestor directories in the path to the file in question. */ static const char * proc_map_files_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { if (!checkpoint_restore_ns_capable(&init_user_ns)) return ERR_PTR(-EPERM); return proc_pid_get_link(dentry, inode, done); } /* * Identical to proc_pid_link_inode_operations except for get_link() */ static const struct inode_operations proc_map_files_link_inode_operations = { .readlink = proc_pid_readlink, .get_link = proc_map_files_get_link, .setattr = proc_setattr, }; static struct dentry * proc_map_files_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { fmode_t mode = (fmode_t)(unsigned long)ptr; struct proc_inode *ei; struct inode *inode; inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | ((mode & FMODE_READ ) ? S_IRUSR : 0) | ((mode & FMODE_WRITE) ? S_IWUSR : 0)); if (!inode) return ERR_PTR(-ENOENT); ei = PROC_I(inode); ei->op.proc_get_link = map_files_get_link; inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64; return proc_splice_unmountable(inode, dentry, &tid_map_files_dentry_operations); } static struct dentry *proc_map_files_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; struct vm_area_struct *vma; struct task_struct *task; struct dentry *result; struct mm_struct *mm; result = ERR_PTR(-ENOENT); task = get_proc_task(dir); if (!task) goto out; result = ERR_PTR(-EACCES); if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; result = ERR_PTR(-ENOENT); if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) goto out_put_task; mm = get_task_mm(task); if (!mm) goto out_put_task; result = ERR_PTR(-EINTR); if (mmap_read_lock_killable(mm)) goto out_put_mm; result = ERR_PTR(-ENOENT); vma = find_exact_vma(mm, vm_start, vm_end); if (!vma) goto out_no_vma; if (vma->vm_file) result = proc_map_files_instantiate(dentry, task, (void *)(unsigned long)vma->vm_file->f_mode); out_no_vma: mmap_read_unlock(mm); out_put_mm: mmput(mm); out_put_task: put_task_struct(task); out: return result; } static const struct inode_operations proc_map_files_inode_operations = { .lookup = proc_map_files_lookup, .permission = proc_fd_permission, .setattr = proc_setattr, }; static int proc_map_files_readdir(struct file *file, struct dir_context *ctx) { struct vm_area_struct *vma; struct task_struct *task; struct mm_struct *mm; unsigned long nr_files, pos, i; GENRADIX(struct map_files_info) fa; struct map_files_info *p; int ret; struct vma_iterator vmi; genradix_init(&fa); ret = -ENOENT; task = get_proc_task(file_inode(file)); if (!task) goto out; ret = -EACCES; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; ret = 0; if (!dir_emit_dots(file, ctx)) goto out_put_task; mm = get_task_mm(task); if (!mm) goto out_put_task; ret = mmap_read_lock_killable(mm); if (ret) { mmput(mm); goto out_put_task; } nr_files = 0; /* * We need two passes here: * * 1) Collect vmas of mapped files with mmap_lock taken * 2) Release mmap_lock and instantiate entries * * otherwise we get lockdep complained, since filldir() * routine might require mmap_lock taken in might_fault(). 
*/ pos = 2; vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (++pos <= ctx->pos) continue; p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL); if (!p) { ret = -ENOMEM; mmap_read_unlock(mm); mmput(mm); goto out_put_task; } p->start = vma->vm_start; p->end = vma->vm_end; p->mode = vma->vm_file->f_mode; } mmap_read_unlock(mm); mmput(mm); for (i = 0; i < nr_files; i++) { char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ unsigned int len; p = genradix_ptr(&fa, i); len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end); if (!proc_fill_cache(file, ctx, buf, len, proc_map_files_instantiate, task, (void *)(unsigned long)p->mode)) break; ctx->pos++; } out_put_task: put_task_struct(task); out: genradix_free(&fa); return ret; } static const struct file_operations proc_map_files_operations = { .read = generic_read_dir, .iterate_shared = proc_map_files_readdir, .llseek = generic_file_llseek, }; #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) struct timers_private { struct pid *pid; struct task_struct *task; struct pid_namespace *ns; }; static void *timers_start(struct seq_file *m, loff_t *pos) { struct timers_private *tp = m->private; tp->task = get_pid_task(tp->pid, PIDTYPE_PID); if (!tp->task) return ERR_PTR(-ESRCH); rcu_read_lock(); return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos); } static void *timers_next(struct seq_file *m, void *v, loff_t *pos) { struct timers_private *tp = m->private; return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos); } static void timers_stop(struct seq_file *m, void *v) { struct timers_private *tp = m->private; if (tp->task) { put_task_struct(tp->task); tp->task = NULL; rcu_read_unlock(); } } static int show_timer(struct seq_file *m, void *v) { static const char * const nstr[] = { [SIGEV_SIGNAL] = "signal", [SIGEV_NONE] = "none", [SIGEV_THREAD] = "thread", }; struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); struct timers_private *tp = m->private; int notify = timer->it_sigev_notify; guard(spinlock_irq)(&timer->it_lock); if (!posixtimer_valid(timer)) return 0; seq_printf(m, "ID: %d\n", timer->it_id); seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo, timer->sigq.info.si_value.sival_ptr); seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID], (notify & SIGEV_THREAD_ID) ? 
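/*
 * Illustrative /proc/<pid>/timers record as produced by the seq_printf()
 * calls around this point (all values made up):
 *
 *   ID: 1
 *   signal: 14/0000000000000000
 *   notify: signal/pid.4012
 *   ClockID: 0
 */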
"tid" : "pid", pid_nr_ns(timer->it_pid, tp->ns)); seq_printf(m, "ClockID: %d\n", timer->it_clock); return 0; } static const struct seq_operations proc_timers_seq_ops = { .start = timers_start, .next = timers_next, .stop = timers_stop, .show = show_timer, }; static int proc_timers_open(struct inode *inode, struct file *file) { struct timers_private *tp; tp = __seq_open_private(file, &proc_timers_seq_ops, sizeof(struct timers_private)); if (!tp) return -ENOMEM; tp->pid = proc_pid(inode); tp->ns = proc_pid_ns(inode->i_sb); return 0; } static const struct file_operations proc_timers_operations = { .open = proc_timers_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, }; #endif static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; u64 slack_ns; int err; err = kstrtoull_from_user(buf, count, 10, &slack_ns); if (err < 0) return err; p = get_proc_task(inode); if (!p) return -ESRCH; if (p != current) { rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); count = -EPERM; goto out; } rcu_read_unlock(); err = security_task_setscheduler(p); if (err) { count = err; goto out; } } task_lock(p); if (rt_or_dl_task_policy(p)) slack_ns = 0; else if (slack_ns == 0) slack_ns = p->default_timer_slack_ns; p->timer_slack_ns = slack_ns; task_unlock(p); out: put_task_struct(p); return count; } static int timerslack_ns_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; int err = 0; p = get_proc_task(inode); if (!p) return -ESRCH; if (p != current) { rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); err = -EPERM; goto out; } rcu_read_unlock(); err = security_task_getscheduler(p); if (err) goto out; } task_lock(p); seq_printf(m, "%llu\n", p->timer_slack_ns); task_unlock(p); out: put_task_struct(p); return err; } static int timerslack_ns_open(struct inode *inode, struct file *filp) { return single_open(filp, timerslack_ns_show, inode); } static const struct file_operations proc_pid_set_timerslack_ns_operations = { .open = timerslack_ns_open, .read = seq_read, .write = timerslack_ns_write, .llseek = seq_lseek, .release = single_release, }; static struct dentry *proc_pident_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { const struct pid_entry *p = ptr; struct inode *inode; struct proc_inode *ei; inode = proc_pid_make_inode(dentry->d_sb, task, p->mode); if (!inode) return ERR_PTR(-ENOENT); ei = PROC_I(inode); if (S_ISDIR(inode->i_mode)) set_nlink(inode, 2); /* Use getattr to fix if necessary */ if (p->iop) inode->i_op = p->iop; if (p->fop) inode->i_fop = p->fop; ei->op = p->op; pid_update_inode(task, inode); return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } static struct dentry *proc_pident_lookup(struct inode *dir, struct dentry *dentry, const struct pid_entry *p, const struct pid_entry *end) { struct task_struct *task = get_proc_task(dir); struct dentry *res = ERR_PTR(-ENOENT); if (!task) goto out_no_task; /* * Yes, it does not scale. And it should not. Don't add * new entries into /proc/<tgid>/ without very good reasons. 
*/ for (; p < end; p++) { if (p->len != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, p->name, p->len)) { res = proc_pident_instantiate(dentry, task, p); break; } } put_task_struct(task); out_no_task: return res; } static int proc_pident_readdir(struct file *file, struct dir_context *ctx, const struct pid_entry *ents, unsigned int nents) { struct task_struct *task = get_proc_task(file_inode(file)); const struct pid_entry *p; if (!task) return -ENOENT; if (!dir_emit_dots(file, ctx)) goto out; if (ctx->pos >= nents + 2) goto out; for (p = ents + (ctx->pos - 2); p < ents + nents; p++) { if (!proc_fill_cache(file, ctx, p->name, p->len, proc_pident_instantiate, task, p)) break; ctx->pos++; } out: put_task_struct(task); return 0; } #ifdef CONFIG_SECURITY static int proc_pid_attr_open(struct inode *inode, struct file *file) { file->private_data = NULL; __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); return 0; } static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); char *p = NULL; ssize_t length; struct task_struct *task = get_proc_task(inode); if (!task) return -ESRCH; length = security_getprocattr(task, PROC_I(inode)->op.lsmid, file->f_path.dentry->d_name.name, &p); put_task_struct(task); if (length > 0) length = simple_read_from_buffer(buf, count, ppos, p, length); kfree(p); return length; } static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); struct task_struct *task; void *page; int rv; /* A task may only write when it was the opener. */ if (file->private_data != current->mm) return -EPERM; rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (!task) { rcu_read_unlock(); return -ESRCH; } /* A task may only write its own attributes. */ if (current != task) { rcu_read_unlock(); return -EACCES; } /* Prevent changes to overridden credentials. */ if (current_cred() != current_real_cred()) { rcu_read_unlock(); return -EBUSY; } rcu_read_unlock(); if (count > PAGE_SIZE) count = PAGE_SIZE; /* No partial writes. 
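 *
 * (The attribute must be supplied in a single write starting at offset 0;
 * an offset write such as pwrite(fd, buf, len, 4) is rejected with -EINVAL
 * just below. The pwrite() call is illustrative, not part of this file.)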
*/ if (*ppos != 0) return -EINVAL; page = memdup_user(buf, count); if (IS_ERR(page)) { rv = PTR_ERR(page); goto out; } /* Guard against adverse ptrace interaction */ rv = mutex_lock_interruptible(&current->signal->cred_guard_mutex); if (rv < 0) goto out_free; rv = security_setprocattr(PROC_I(inode)->op.lsmid, file->f_path.dentry->d_name.name, page, count); mutex_unlock(&current->signal->cred_guard_mutex); out_free: kfree(page); out: return rv; } static const struct file_operations proc_pid_attr_operations = { .open = proc_pid_attr_open, .read = proc_pid_attr_read, .write = proc_pid_attr_write, .llseek = generic_file_llseek, .release = mem_release, }; #define LSM_DIR_OPS(LSM) \ static int proc_##LSM##_attr_dir_iterate(struct file *filp, \ struct dir_context *ctx) \ { \ return proc_pident_readdir(filp, ctx, \ LSM##_attr_dir_stuff, \ ARRAY_SIZE(LSM##_attr_dir_stuff)); \ } \ \ static const struct file_operations proc_##LSM##_attr_dir_ops = { \ .read = generic_read_dir, \ .iterate_shared = proc_##LSM##_attr_dir_iterate, \ .llseek = default_llseek, \ }; \ \ static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \ struct dentry *dentry, unsigned int flags) \ { \ return proc_pident_lookup(dir, dentry, \ LSM##_attr_dir_stuff, \ LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \ } \ \ static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \ .lookup = proc_##LSM##_attr_dir_lookup, \ .getattr = pid_getattr, \ .setattr = proc_setattr, \ } #ifdef CONFIG_SECURITY_SMACK static const struct pid_entry smack_attr_dir_stuff[] = { ATTR(LSM_ID_SMACK, "current", 0666), }; LSM_DIR_OPS(smack); #endif #ifdef CONFIG_SECURITY_APPARMOR static const struct pid_entry apparmor_attr_dir_stuff[] = { ATTR(LSM_ID_APPARMOR, "current", 0666), ATTR(LSM_ID_APPARMOR, "prev", 0444), ATTR(LSM_ID_APPARMOR, "exec", 0666), }; LSM_DIR_OPS(apparmor); #endif static const struct pid_entry attr_dir_stuff[] = { ATTR(LSM_ID_UNDEF, "current", 0666), ATTR(LSM_ID_UNDEF, "prev", 0444), ATTR(LSM_ID_UNDEF, "exec", 0666), ATTR(LSM_ID_UNDEF, "fscreate", 0666), ATTR(LSM_ID_UNDEF, "keycreate", 0666), ATTR(LSM_ID_UNDEF, "sockcreate", 0666), #ifdef CONFIG_SECURITY_SMACK DIR("smack", 0555, proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops), #endif #ifdef CONFIG_SECURITY_APPARMOR DIR("apparmor", 0555, proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops), #endif }; static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx) { return proc_pident_readdir(file, ctx, attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); } static const struct file_operations proc_attr_dir_operations = { .read = generic_read_dir, .iterate_shared = proc_attr_dir_readdir, .llseek = generic_file_llseek, }; static struct dentry *proc_attr_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, attr_dir_stuff, attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff)); } static const struct inode_operations proc_attr_dir_inode_operations = { .lookup = proc_attr_dir_lookup, .getattr = pid_getattr, .setattr = proc_setattr, }; #endif #ifdef CONFIG_ELF_CORE static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); struct mm_struct *mm; char buffer[PROC_NUMBUF]; size_t len; int ret; if (!task) return -ESRCH; ret = 0; mm = get_task_mm(task); if (mm) { unsigned long flags = __mm_flags_get_dumpable(mm); len = snprintf(buffer, sizeof(buffer), "%08lx\n", ((flags & 
MMF_DUMP_FILTER_MASK) >> MMF_DUMP_FILTER_SHIFT)); mmput(mm); ret = simple_read_from_buffer(buf, count, ppos, buffer, len); } put_task_struct(task); return ret; } static ssize_t proc_coredump_filter_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; struct mm_struct *mm; unsigned int val; int ret; int i; unsigned long mask; ret = kstrtouint_from_user(buf, count, 0, &val); if (ret < 0) return ret; ret = -ESRCH; task = get_proc_task(file_inode(file)); if (!task) goto out_no_task; mm = get_task_mm(task); if (!mm) goto out_no_mm; ret = 0; for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) { if (val & mask) mm_flags_set(i + MMF_DUMP_FILTER_SHIFT, mm); else mm_flags_clear(i + MMF_DUMP_FILTER_SHIFT, mm); } mmput(mm); out_no_mm: put_task_struct(task); out_no_task: if (ret < 0) return ret; return count; } static const struct file_operations proc_coredump_filter_operations = { .read = proc_coredump_filter_read, .write = proc_coredump_filter_write, .llseek = generic_file_llseek, }; #endif #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole) { struct task_io_accounting acct; int result; result = down_read_killable(&task->signal->exec_update_lock); if (result) return result; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { result = -EACCES; goto out_unlock; } if (whole) { struct signal_struct *sig = task->signal; struct task_struct *t; guard(rcu)(); scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) { acct = sig->ioac; __for_each_thread(sig, t) task_io_accounting_add(&acct, &t->ioac); } } else { acct = task->ioac; } seq_printf(m, "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" "syscw: %llu\n" "read_bytes: %llu\n" "write_bytes: %llu\n" "cancelled_write_bytes: %llu\n", (unsigned long long)acct.rchar, (unsigned long long)acct.wchar, (unsigned long long)acct.syscr, (unsigned long long)acct.syscw, (unsigned long long)acct.read_bytes, (unsigned long long)acct.write_bytes, (unsigned long long)acct.cancelled_write_bytes); result = 0; out_unlock: up_read(&task->signal->exec_update_lock); return result; } static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { return do_io_accounting(task, m, 0); } static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { return do_io_accounting(task, m, 1); } #endif /* CONFIG_TASK_IO_ACCOUNTING */ #ifdef CONFIG_USER_NS static int proc_id_map_open(struct inode *inode, struct file *file, const struct seq_operations *seq_ops) { struct user_namespace *ns = NULL; struct task_struct *task; struct seq_file *seq; int ret = -EINVAL; task = get_proc_task(inode); if (task) { rcu_read_lock(); ns = get_user_ns(task_cred_xxx(task, user_ns)); rcu_read_unlock(); put_task_struct(task); } if (!ns) goto err; ret = seq_open(file, seq_ops); if (ret) goto err_put_ns; seq = file->private_data; seq->private = ns; return 0; err_put_ns: put_user_ns(ns); err: return ret; } static int proc_id_map_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; put_user_ns(ns); return seq_release(inode, file); } static int proc_uid_map_open(struct inode *inode, struct file *file) { return proc_id_map_open(inode, file, &proc_uid_seq_operations); } static int proc_gid_map_open(struct inode *inode, struct file *file) { return proc_id_map_open(inode, file, 
&proc_gid_seq_operations); } static int proc_projid_map_open(struct inode *inode, struct file *file) { return proc_id_map_open(inode, file, &proc_projid_seq_operations); } static const struct file_operations proc_uid_map_operations = { .open = proc_uid_map_open, .write = proc_uid_map_write, .read = seq_read, .llseek = seq_lseek, .release = proc_id_map_release, }; static const struct file_operations proc_gid_map_operations = { .open = proc_gid_map_open, .write = proc_gid_map_write, .read = seq_read, .llseek = seq_lseek, .release = proc_id_map_release, }; static const struct file_operations proc_projid_map_operations = { .open = proc_projid_map_open, .write = proc_projid_map_write, .read = seq_read, .llseek = seq_lseek, .release = proc_id_map_release, }; static int proc_setgroups_open(struct inode *inode, struct file *file) { struct user_namespace *ns = NULL; struct task_struct *task; int ret; ret = -ESRCH; task = get_proc_task(inode); if (task) { rcu_read_lock(); ns = get_user_ns(task_cred_xxx(task, user_ns)); rcu_read_unlock(); put_task_struct(task); } if (!ns) goto err; if (file->f_mode & FMODE_WRITE) { ret = -EACCES; if (!ns_capable(ns, CAP_SYS_ADMIN)) goto err_put_ns; } ret = single_open(file, &proc_setgroups_show, ns); if (ret) goto err_put_ns; return 0; err_put_ns: put_user_ns(ns); err: return ret; } static int proc_setgroups_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; int ret = single_release(inode, file); put_user_ns(ns); return ret; } static const struct file_operations proc_setgroups_operations = { .open = proc_setgroups_open, .write = proc_setgroups_write, .read = seq_read, .llseek = seq_lseek, .release = proc_setgroups_release, }; #endif /* CONFIG_USER_NS */ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { int err = lock_trace(task); if (!err) { seq_printf(m, "%08x\n", task->personality); unlock_trace(task); } return err; } #ifdef CONFIG_LIVEPATCH static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { seq_printf(m, "%d\n", task->patch_state); return 0; } #endif /* CONFIG_LIVEPATCH */ #ifdef CONFIG_KSM static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm; mm = get_task_mm(task); if (mm) { seq_printf(m, "%lu\n", mm->ksm_merging_pages); mmput(mm); } return 0; } static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm; int ret = 0; mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm)); seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); seq_printf(m, "ksm_merge_any: %s\n", mm_flags_test(MMF_VM_MERGE_ANY, mm) ? "yes" : "no"); ret = mmap_read_lock_killable(mm); if (ret) { mmput(mm); return ret; } seq_printf(m, "ksm_mergeable: %s\n", ksm_process_mergeable(mm) ? 
"yes" : "no"); mmap_read_unlock(mm); mmput(mm); } return 0; } #endif /* CONFIG_KSM */ #ifdef CONFIG_KSTACK_ERASE_METRICS static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long prev_depth = THREAD_SIZE - (task->prev_lowest_stack & (THREAD_SIZE - 1)); unsigned long depth = THREAD_SIZE - (task->lowest_stack & (THREAD_SIZE - 1)); seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n", prev_depth, depth); return 0; } #endif /* CONFIG_KSTACK_ERASE_METRICS */ /* * Thread groups */ static const struct file_operations proc_task_operations; static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif REG("environ", S_IRUSR, proc_environ_operations), REG("auxv", S_IRUSR, proc_auxv_operations), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif #ifdef CONFIG_TIME_NS REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), ONE("stat", S_IRUGO, proc_tgid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_pid_maps_operations), #ifdef CONFIG_NUMA REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), #endif REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), LNK("cwd", proc_cwd_link), LNK("root", proc_root_link), LNK("exe", proc_exe_link), REG("mounts", S_IRUGO, proc_mounts_operations), REG("mountinfo", S_IRUGO, proc_mountinfo_operations), REG("mountstats", S_IRUSR, proc_mountstats_operations), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS ONE("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHED_INFO ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET ONE("cpuset", S_IRUGO, proc_cpuset_show), #endif #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif #ifdef CONFIG_PROC_CPU_RESCTRL ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), #endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDIT 
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), REG("fail-nth", 0644, proc_fail_nth_operations), #endif #ifdef CONFIG_ELF_CORE REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tgid_io_accounting), #endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) REG("timers", S_IRUGO, proc_timers_operations), #endif REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations), #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif #ifdef CONFIG_KSTACK_ERASE_METRICS ONE("stack_depth", S_IRUGO, proc_stack_depth), #endif #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) { return proc_pident_readdir(file, ctx, tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); } static const struct file_operations proc_tgid_base_operations = { .read = generic_read_dir, .iterate_shared = proc_tgid_base_readdir, .llseek = generic_file_llseek, }; struct pid *tgid_pidfd_to_pid(const struct file *file) { if (file->f_op != &proc_tgid_base_operations) return ERR_PTR(-EBADF); return proc_pid(file_inode(file)); } static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, tgid_base_stuff, tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff)); } static const struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, .setattr = proc_setattr, .permission = proc_pid_permission, }; /** * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache. * @pid: pid that should be flushed. * * This function walks a list of inodes (that belong to any proc * filesystem) that are attached to the pid and flushes them from * the dentry cache. * * It is safe and reasonable to cache /proc entries for a task until * that task exits. After that they just clog up the dcache with * useless entries, possibly causing useful dcache entries to be * flushed instead. This routine is provided to flush those useless * dcache entries when a process is reaped. * * NOTE: This routine is just an optimization so it does not guarantee * that no dcache entries will exist after a process is reaped * it just makes it very unlikely that any will persist. 
*/ void proc_flush_pid(struct pid *pid) { proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock); } static struct dentry *proc_pid_instantiate(struct dentry * dentry, struct task_struct *task, const void *ptr) { struct inode *inode; inode = proc_pid_make_base_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) return ERR_PTR(-ENOENT); inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; set_nlink(inode, nlink_tgid); pid_update_inode(task, inode); return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) { struct task_struct *task; unsigned tgid; struct proc_fs_info *fs_info; struct pid_namespace *ns; struct dentry *result = ERR_PTR(-ENOENT); tgid = name_to_int(&dentry->d_name); if (tgid == ~0U) goto out; fs_info = proc_sb_info(dentry->d_sb); ns = fs_info->pid_ns; rcu_read_lock(); task = find_task_by_pid_ns(tgid, ns); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) goto out; /* Limit procfs to only ptraceable tasks */ if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) { if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS)) goto out_put_task; } result = proc_pid_instantiate(dentry, task, NULL); out_put_task: put_task_struct(task); out: return result; } /* * Find the first task with tgid >= tgid * */ struct tgid_iter { unsigned int tgid; struct task_struct *task; }; static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter) { struct pid *pid; if (iter.task) put_task_struct(iter.task); rcu_read_lock(); retry: iter.task = NULL; pid = find_ge_pid(iter.tgid, ns); if (pid) { iter.tgid = pid_nr_ns(pid, ns); iter.task = pid_task(pid, PIDTYPE_TGID); if (!iter.task) { iter.tgid += 1; goto retry; } get_task_struct(iter.task); } rcu_read_unlock(); return iter; } #define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2) /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file *file, struct dir_context *ctx) { struct tgid_iter iter; struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb); struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb); loff_t pos = ctx->pos; if (pos >= PID_MAX_LIMIT + TGID_OFFSET) return 0; if (pos == TGID_OFFSET - 2) { struct inode *inode = d_inode(fs_info->proc_self); if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } if (pos == TGID_OFFSET - 1) { struct inode *inode = d_inode(fs_info->proc_thread_self); if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } iter.tgid = pos - TGID_OFFSET; iter.task = NULL; for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { char name[10 + 1]; unsigned int len; cond_resched(); if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE)) continue; len = snprintf(name, sizeof(name), "%u", iter.tgid); ctx->pos = iter.tgid + TGID_OFFSET; if (!proc_fill_cache(file, ctx, name, len, proc_pid_instantiate, iter.task, NULL)) { put_task_struct(iter.task); return 0; } } ctx->pos = PID_MAX_LIMIT + TGID_OFFSET; return 0; } /* * proc_tid_comm_permission is a special permission function exclusively * used for the node /proc/<pid>/task/<tid>/comm. * It bypasses generic permission checks in the case where a task of the same * task group attempts to access the node. 
* The rationale behind this is that glibc and bionic access this node for * cross thread naming (pthread_set/getname_np(!self)). However, if * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0, * which locks out the cross thread naming implementation. * This function makes sure that the node is always accessible for members of * same thread group. */ static int proc_tid_comm_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { bool is_same_tgroup; struct task_struct *task; task = get_proc_task(inode); if (!task) return -ESRCH; is_same_tgroup = same_thread_group(current, task); put_task_struct(task); if (likely(is_same_tgroup && !(mask & MAY_EXEC))) { /* This file (/proc/<pid>/task/<tid>/comm) can always be * read or written by the members of the corresponding * thread group. */ return 0; } return generic_permission(&nop_mnt_idmap, inode, mask); } static const struct inode_operations proc_tid_comm_inode_operations = { .setattr = proc_setattr, .permission = proc_tid_comm_permission, }; /* * Tasks */ static const struct pid_entry tid_base_stuff[] = { DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif REG("environ", S_IRUSR, proc_environ_operations), REG("auxv", S_IRUSR, proc_auxv_operations), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), NOD("comm", S_IFREG|S_IRUGO|S_IWUSR, &proc_tid_comm_inode_operations, &proc_pid_set_comm_operations, {}), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), ONE("stat", S_IRUGO, proc_tid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_pid_maps_operations), #ifdef CONFIG_PROC_CHILDREN REG("children", S_IRUGO, proc_tid_children_operations), #endif #ifdef CONFIG_NUMA REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), #endif REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), LNK("cwd", proc_cwd_link), LNK("root", proc_root_link), LNK("exe", proc_exe_link), REG("mounts", S_IRUGO, proc_mounts_operations), REG("mountinfo", S_IRUGO, proc_mountinfo_operations), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS ONE("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHED_INFO ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET ONE("cpuset", S_IRUGO, proc_cpuset_show), #endif #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif #ifdef CONFIG_PROC_CPU_RESCTRL ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), #endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", 
S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDIT REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), REG("fail-nth", 0644, proc_fail_nth_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tid_io_accounting), #endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) { return proc_pident_readdir(file, ctx, tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); } static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, tid_base_stuff, tid_base_stuff + ARRAY_SIZE(tid_base_stuff)); } static const struct file_operations proc_tid_base_operations = { .read = generic_read_dir, .iterate_shared = proc_tid_base_readdir, .llseek = generic_file_llseek, }; static const struct inode_operations proc_tid_base_inode_operations = { .lookup = proc_tid_base_lookup, .getattr = pid_getattr, .setattr = proc_setattr, }; static struct dentry *proc_task_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { struct inode *inode; inode = proc_pid_make_base_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) return ERR_PTR(-ENOENT); inode->i_op = &proc_tid_base_inode_operations; inode->i_fop = &proc_tid_base_operations; inode->i_flags |= S_IMMUTABLE; set_nlink(inode, nlink_tid); pid_update_inode(task, inode); return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { struct task_struct *task; struct task_struct *leader = get_proc_task(dir); unsigned tid; struct proc_fs_info *fs_info; struct pid_namespace *ns; struct dentry *result = ERR_PTR(-ENOENT); if (!leader) goto out_no_task; tid = name_to_int(&dentry->d_name); if (tid == ~0U) goto out; fs_info = proc_sb_info(dentry->d_sb); ns = fs_info->pid_ns; rcu_read_lock(); task = find_task_by_pid_ns(tid, ns); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) goto out; if (!same_thread_group(leader, task)) goto out_drop_task; result = proc_task_instantiate(dentry, task, NULL); out_drop_task: put_task_struct(task); out: put_task_struct(leader); out_no_task: return result; } /* * Find the first tid of a thread group to return to user space. * * Usually this is just the thread group leader, but if the users * buffer was too small or there was a seek into the middle of the * directory we have more work todo. * * In the case of a short read we start with find_task_by_pid. * * In the case of a seek we start with the leader and walk nr * threads past it. 
*/ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos, struct pid_namespace *ns) { struct task_struct *pos, *task; unsigned long nr = f_pos; if (nr != f_pos) /* 32bit overflow? */ return NULL; rcu_read_lock(); task = pid_task(pid, PIDTYPE_PID); if (!task) goto fail; /* Attempt to start with the tid of a thread */ if (tid && nr) { pos = find_task_by_pid_ns(tid, ns); if (pos && same_thread_group(pos, task)) goto found; } /* If nr exceeds the number of threads there is nothing todo */ if (nr >= get_nr_threads(task)) goto fail; /* If we haven't found our starting place yet start * with the leader and walk nr threads forward. */ for_each_thread(task, pos) { if (!nr--) goto found; } fail: pos = NULL; goto out; found: get_task_struct(pos); out: rcu_read_unlock(); return pos; } /* * Find the next thread in the thread list. * Return NULL if there is an error or no next thread. * * The reference to the input task_struct is released. */ static struct task_struct *next_tid(struct task_struct *start) { struct task_struct *pos = NULL; rcu_read_lock(); if (pid_alive(start)) { pos = __next_thread(start); if (pos) get_task_struct(pos); } rcu_read_unlock(); put_task_struct(start); return pos; } /* for the /proc/TGID/task/ directories */ static int proc_task_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct task_struct *task; struct pid_namespace *ns; int tid; if (proc_inode_is_dead(inode)) return -ENOENT; if (!dir_emit_dots(file, ctx)) return 0; /* We cache the tgid value that the last readdir call couldn't * return and lseek resets it to 0. */ ns = proc_pid_ns(inode->i_sb); tid = (int)(intptr_t)file->private_data; file->private_data = NULL; for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { char name[10 + 1]; unsigned int len; tid = task_pid_nr_ns(task, ns); if (!tid) continue; /* The task has just exited. */ len = snprintf(name, sizeof(name), "%d", tid); if (!proc_fill_cache(file, ctx, name, len, proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first * pid for the next readir call */ file->private_data = (void *)(intptr_t)tid; put_task_struct(task); break; } } return 0; } static int proc_task_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct task_struct *p = get_proc_task(inode); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (p) { stat->nlink += get_nr_threads(p); put_task_struct(p); } return 0; } /* * proc_task_readdir() set @file->private_data to a positive integer * value, so casting that to u64 is safe. generic_llseek_cookie() will * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is * here to catch any unexpected change in behavior either in * proc_task_readdir() or generic_llseek_cookie(). 
*/ static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence) { u64 cookie = (u64)(intptr_t)file->private_data; loff_t off; off = generic_llseek_cookie(file, offset, whence, &cookie); WARN_ON_ONCE(cookie > INT_MAX); file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */ return off; } static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, .setattr = proc_setattr, .permission = proc_pid_permission, }; static const struct file_operations proc_task_operations = { .read = generic_read_dir, .iterate_shared = proc_task_readdir, .llseek = proc_dir_llseek, }; void __init set_proc_pid_nlink(void) { nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); }
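/*
 * Illustrative sketch, not part of fs/proc/base.c: how a new per-process
 * entry would hook into the pid_entry tables above. The "example" name and
 * proc_pid_example() are hypothetical; the ONE() macro and the show-callback
 * signature follow the existing entries in tgid_base_stuff[].
 */
static int proc_pid_example(struct seq_file *m, struct pid_namespace *ns,
			    struct pid *pid, struct task_struct *task)
{
	/* Emit a single value for the task; the seq_file core does the rest. */
	seq_printf(m, "%d\n", task->pid);
	return 0;
}

/*
 * Adding the entry below to tgid_base_stuff[] would expose the file as
 * /proc/<pid>/example; proc_pident_lookup() and proc_pident_readdir() pick
 * it up automatically because they only walk the table:
 *
 *	ONE("example", S_IRUGO, proc_pid_example),
 */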
/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
/* Copyright (c) 2002-2007 Volkswagen Group Electronic Research
 * Copyright (c) 2017 Pengutronix, Marc Kleine-Budde <kernel@pengutronix.de>
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of Volkswagen nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * Alternatively, provided that this notice is retained in full, this
 * software may be distributed under the terms of the GNU General
 * Public License ("GPL") version 2, in which case the provisions of the
 * GPL apply INSTEAD OF those given above.
 *
 * The provided data structures and external interfaces from this code
 * are not restricted to be used by modules with a GPL compatible license.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 */

#ifndef CAN_ML_H
#define CAN_ML_H

#include <linux/can.h>
#include <linux/list.h>
#include <linux/netdevice.h>

#define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS)
#define CAN_EFF_RCV_HASH_BITS 10
#define CAN_EFF_RCV_ARRAY_SZ (1 << CAN_EFF_RCV_HASH_BITS)

enum { RX_ERR, RX_ALL, RX_FIL, RX_INV, RX_MAX };

struct can_dev_rcv_lists {
	struct hlist_head rx[RX_MAX];
	struct hlist_head rx_sff[CAN_SFF_RCV_ARRAY_SZ];
	struct hlist_head rx_eff[CAN_EFF_RCV_ARRAY_SZ];
	int entries;
};

struct can_ml_priv {
	struct can_dev_rcv_lists dev_rcv_lists;
#ifdef CAN_J1939
	struct j1939_priv *j1939_priv;
#endif
};

static inline struct can_ml_priv *can_get_ml_priv(struct net_device *dev)
{
	return netdev_get_ml_priv(dev, ML_PRIV_CAN);
}

static inline void can_set_ml_priv(struct net_device *dev,
				   struct can_ml_priv *ml_priv)
{
	netdev_set_ml_priv(dev, ml_priv, ML_PRIV_CAN);
}

#endif /* CAN_ML_H */
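/*
 * Illustrative sketch, not part of can_ml.h: the typical way the CAN
 * midlayer reaches its per-device receive lists. example_get_rcv_lists()
 * is hypothetical; can_get_ml_priv() is the accessor defined above.
 */
static inline struct can_dev_rcv_lists *
example_get_rcv_lists(struct net_device *dev)
{
	struct can_ml_priv *ml_priv = can_get_ml_priv(dev);

	/* NULL if the device carries no CAN midlayer private data. */
	return ml_priv ? &ml_priv->dev_rcv_lists : NULL;
}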
// SPDX-License-Identifier: GPL-2.0
/* XDP user-space ring structure
 * Copyright(c) 2018 Intel Corporation.
 */

#include <linux/log2.h>
#include <linux/slab.h>
#include <linux/overflow.h>
#include <linux/vmalloc.h>
#include <net/xdp_sock_drv.h>

#include "xsk_queue.h"

static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue)
{
	struct xdp_umem_ring *umem_ring;
	struct xdp_rxtx_ring *rxtx_ring;

	if (umem_queue)
		return struct_size(umem_ring, desc, q->nentries);
	return struct_size(rxtx_ring, desc, q->nentries);
}

struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
{
	struct xsk_queue *q;
	size_t size;

	q = kzalloc(sizeof(*q), GFP_KERNEL);
	if (!q)
		return NULL;

	q->nentries = nentries;
	q->ring_mask = nentries - 1;

	size = xskq_get_ring_size(q, umem_queue);

	/* size which is overflowing or close to SIZE_MAX will become 0 in
	 * PAGE_ALIGN(), checking SIZE_MAX is enough due to the previous
	 * is_power_of_2(), the rest will be handled by vmalloc_user()
	 */
	if (unlikely(size == SIZE_MAX)) {
		kfree(q);
		return NULL;
	}

	size = PAGE_ALIGN(size);

	q->ring = vmalloc_user(size);
	if (!q->ring) {
		kfree(q);
		return NULL;
	}

	q->ring_vmalloc_size = size;
	return q;
}

void xskq_destroy(struct xsk_queue *q)
{
	if (!q)
		return;

	vfree(q->ring);
	kfree(q);
}
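/*
 * Illustrative sketch, not part of xsk_queue.c: the expected calling
 * pattern. xskq_create() assumes the caller has already validated that
 * nentries is a power of two (see the SIZE_MAX comment above), so the
 * hypothetical helper below checks that before creating the ring.
 */
static struct xsk_queue *example_setup_fill_ring(u32 nentries)
{
	if (!is_power_of_2(nentries))
		return NULL;

	/* true: a fill/completion (umem) ring rather than an RX/TX ring. */
	return xskq_create(nentries, true);
}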
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/kernel/capability.c
 *
 * Copyright (C) 1997  Andrew Main <zefram@fysh.org>
 *
 * Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org>
 * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/uaccess.h>

int file_caps_enabled = 1;

static int __init file_caps_disable(char *str)
{
	file_caps_enabled = 0;
	return 1;
}
__setup("no_file_caps", file_caps_disable);

#ifdef CONFIG_MULTIUSER
/*
 * More recent versions of libcap are available from:
 *
 *   http://www.kernel.org/pub/linux/libs/security/linux-privs/
 */

static void warn_legacy_capability_use(void)
{
	pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
		     current->comm);
}

/*
 * Version 2 capabilities worked fine, but the linux/capability.h file
 * that accompanied their introduction encouraged their use without
 * the necessary user-space source code changes. As such, we have
 * created a version 3 with equivalent functionality to version 2, but
 * with a header change to protect legacy source code from using
 * version 2 when it wanted to use version 1. If your system has code
 * that trips the following warning, it is using version 2 specific
 * capabilities and may be doing so insecurely.
* * The remedy is to either upgrade your version of libcap (to 2.10+, * if the application is linked against it), or recompile your * application with modern kernel headers and this warning will go * away. */ static void warn_deprecated_v2(void) { pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n", current->comm); } /* * Version check. Return the number of u32s in each capability flag * array, or a negative value on error. */ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) { __u32 version; if (get_user(version, &header->version)) return -EFAULT; switch (version) { case _LINUX_CAPABILITY_VERSION_1: warn_legacy_capability_use(); *tocopy = _LINUX_CAPABILITY_U32S_1; break; case _LINUX_CAPABILITY_VERSION_2: warn_deprecated_v2(); fallthrough; /* v3 is otherwise equivalent to v2 */ case _LINUX_CAPABILITY_VERSION_3: *tocopy = _LINUX_CAPABILITY_U32S_3; break; default: if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) return -EFAULT; return -EINVAL; } return 0; } /* * The only thing that can change the capabilities of the current * process is the current process. As such, we can't be in this code * at the same time as we are in the process of setting capabilities * in this process. The net result is that we can limit our use of * locks to when we are reading the caps of another process. */ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, kernel_cap_t *pIp, kernel_cap_t *pPp) { int ret; if (pid && (pid != task_pid_vnr(current))) { const struct task_struct *target; rcu_read_lock(); target = find_task_by_vpid(pid); if (!target) ret = -ESRCH; else ret = security_capget(target, pEp, pIp, pPp); rcu_read_unlock(); } else ret = security_capget(current, pEp, pIp, pPp); return ret; } /** * sys_capget - get the capabilities of a given process. * @header: pointer to struct that contains capability version and * target pid data * @dataptr: pointer to struct that contains the effective, permitted, * and inheritable capabilities that are returned * * Returns 0 on success and < 0 on error. */ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) { int ret = 0; pid_t pid; unsigned tocopy; kernel_cap_t pE, pI, pP; struct __user_cap_data_struct kdata[2]; ret = cap_validate_magic(header, &tocopy); if ((dataptr == NULL) || (ret != 0)) return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret; if (get_user(pid, &header->pid)) return -EFAULT; if (pid < 0) return -EINVAL; ret = cap_get_target_pid(pid, &pE, &pI, &pP); if (ret) return ret; /* * Annoying legacy format with 64-bit capabilities exposed * as two sets of 32-bit fields, so we need to split the * capability values up. */ kdata[0].effective = pE.val; kdata[1].effective = pE.val >> 32; kdata[0].permitted = pP.val; kdata[1].permitted = pP.val >> 32; kdata[0].inheritable = pI.val; kdata[1].inheritable = pI.val >> 32; /* * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, * we silently drop the upper capabilities here. This * has the effect of making older libcap * implementations implicitly drop upper capability * bits when they perform a: capget/modify/capset * sequence. * * This behavior is considered fail-safe * behavior. Upgrading the application to a newer * version of libcap will enable access to the newer * capabilities. * * An alternative would be to return an error here * (-ERANGE), but that causes legacy applications to * unexpectedly fail; the capget/modify/capset aborts * before modification is attempted and the application * fails. 
*/ if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0]))) return -EFAULT; return 0; } static kernel_cap_t mk_kernel_cap(u32 low, u32 high) { return (kernel_cap_t) { (low | ((u64)high << 32)) & CAP_VALID_MASK }; } /** * sys_capset - set capabilities for a process or (*) a group of processes * @header: pointer to struct that contains capability version and * target pid data * @data: pointer to struct that contains the effective, permitted, * and inheritable capabilities * * Set capabilities for the current process only. The ability to any other * process(es) has been deprecated and removed. * * The restrictions on setting capabilities are specified as: * * I: any raised capabilities must be a subset of the old permitted * P: any raised capabilities must be a subset of the old permitted * E: must be set to a subset of new permitted * * Returns 0 on success and < 0 on error. */ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) { struct __user_cap_data_struct kdata[2] = { { 0, }, }; unsigned tocopy, copybytes; kernel_cap_t inheritable, permitted, effective; struct cred *new; int ret; pid_t pid; ret = cap_validate_magic(header, &tocopy); if (ret != 0) return ret; if (get_user(pid, &header->pid)) return -EFAULT; /* may only affect current now */ if (pid != 0 && pid != task_pid_vnr(current)) return -EPERM; copybytes = tocopy * sizeof(struct __user_cap_data_struct); if (copybytes > sizeof(kdata)) return -EFAULT; if (copy_from_user(&kdata, data, copybytes)) return -EFAULT; effective = mk_kernel_cap(kdata[0].effective, kdata[1].effective); permitted = mk_kernel_cap(kdata[0].permitted, kdata[1].permitted); inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable); new = prepare_creds(); if (!new) return -ENOMEM; ret = security_capset(new, current_cred(), &effective, &inheritable, &permitted); if (ret < 0) goto error; audit_log_capset(new, current_cred()); return commit_creds(new); error: abort_creds(new); return ret; } /** * has_ns_capability - Does a task have a capability in a specific user ns * @t: The task in question * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the specified user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE); rcu_read_unlock(); return (ret == 0); } /** * has_ns_capability_noaudit - Does a task have a capability (unaudited) * in a specific user ns. * @t: The task in question * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the specified user namespace, false if not. * Do not write an audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); } /** * has_capability_noaudit - Does a task have a capability (unaudited) in the * initial user ns * @t: The task in question * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to init_user_ns, false if not. 
Don't write an * audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_capability_noaudit(struct task_struct *t, int cap) { return has_ns_capability_noaudit(t, &init_user_ns, cap); } EXPORT_SYMBOL(has_capability_noaudit); static bool ns_capable_common(struct user_namespace *ns, int cap, unsigned int opts) { int capable; if (unlikely(!cap_valid(cap))) { pr_crit("capable() called with invalid cap=%u\n", cap); BUG(); } capable = security_capable(current_cred(), ns, cap, opts); if (capable == 0) { current->flags |= PF_SUPERPRIV; return true; } return false; } /** * ns_capable - Determine if the current task has a superior capability in effect * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_NONE); } EXPORT_SYMBOL(ns_capable); /** * ns_capable_noaudit - Determine if the current task has a superior capability * (unaudited) in effect * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable_noaudit(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT); } EXPORT_SYMBOL(ns_capable_noaudit); /** * ns_capable_setid - Determine if the current task has a superior capability * in effect, while signalling that this check is being done from within a * setid or setgroups syscall. * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable_setid(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_INSETID); } EXPORT_SYMBOL(ns_capable_setid); /** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool capable(int cap) { return ns_capable(&init_user_ns, cap); } EXPORT_SYMBOL(capable); #endif /* CONFIG_MULTIUSER */ /** * file_ns_capable - Determine if the file's opener had a capability in effect * @file: The file we want to check * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if task that opened the file had a capability in effect * when the file was opened. * * This does not set PF_SUPERPRIV because the caller may not * actually be privileged. 
*/ bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) { if (WARN_ON_ONCE(!cap_valid(cap))) return false; if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0) return true; return false; } EXPORT_SYMBOL(file_ns_capable); /** * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode? * @ns: The user namespace in question * @idmap: idmap of the mount @inode was found from * @inode: The inode in question * * Return true if the inode uid and gid are within the namespace. */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, struct mnt_idmap *idmap, const struct inode *inode) { return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) && vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode)); } /** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @idmap: idmap of the mount @inode was found from * @inode: The inode in question * @cap: The capability in question * * Return true if the current task has the given capability targeted at * its own user namespace and that the given inode's uid and gid are * mapped into the current user namespace. */ bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap, const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, idmap, inode); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); /** * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace * @tsk: The task that may be ptraced * @ns: The user namespace to search for CAP_SYS_PTRACE in * * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE * in the specified user namespace. */ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) { int ret = 0; /* An absent tracer adds no restrictions */ const struct cred *cred; rcu_read_lock(); cred = rcu_dereference(tsk->ptracer_cred); if (cred) ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); }
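/*
 * Illustrative sketch, not part of kernel/capability.c: the common pattern
 * for gating a privileged operation on the checks defined above. The
 * example_set_param() helper and its arguments are hypothetical.
 */
static int example_set_param(struct user_namespace *ns, int val)
{
	/* Prefer the namespace-aware check; plain capable(CAP_SYS_ADMIN)
	 * would hard-code init_user_ns instead.
	 */
	if (!ns_capable(ns, CAP_SYS_ADMIN))
		return -EPERM;

	/* ... the privileged update of 'val' would happen here ... */
	return 0;
}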
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  NET  is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Definitions for the Ethernet handlers.
 *
 * Version:	@(#)eth.h	1.0.4	05/13/93
 *
 * Authors:	Ross Biro
 *		Fred N.
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Relocated to include/linux where it belongs by Alan Cox * <gw4pts@gw4pts.ampr.org> */ #ifndef _LINUX_ETHERDEVICE_H #define _LINUX_ETHERDEVICE_H #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/random.h> #include <linux/crc32.h> #include <linux/unaligned.h> #include <asm/bitsperlong.h> #ifdef __KERNEL__ struct device; struct fwnode_handle; int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); int platform_get_ethdev_address(struct device *dev, struct net_device *netdev); unsigned char *arch_get_platform_mac_address(void); int nvmem_get_mac_address(struct device *dev, void *addrbuf); int device_get_mac_address(struct device *dev, char *addr); int device_get_ethdev_address(struct device *dev, struct net_device *netdev); int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr); u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); extern const struct header_ops eth_header_ops; int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len); int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr); int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type); void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr); __be16 eth_header_parse_protocol(const struct sk_buff *skb); int eth_prepare_mac_addr_change(struct net_device *dev, void *p); void eth_commit_mac_addr_change(struct net_device *dev, void *p); int eth_mac_addr(struct net_device *dev, void *p); int eth_validate_addr(struct net_device *dev); struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, unsigned int rxqs); #define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1) #define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count) struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, unsigned int txqs, unsigned int rxqs); #define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1) struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb); int eth_gro_complete(struct sk_buff *skb, int nhoff); /* Reserved Ethernet Addresses per IEEE 802.1Q */ static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; #define eth_stp_addr eth_reserved_addr_base static const u8 eth_ipv4_mcast_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 }; static const u8 eth_ipv6_mcast_addr_base[ETH_ALEN] __aligned(2) = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 }; /** * is_link_local_ether_addr - Determine if given Ethernet address is link-local * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if address is link local reserved addr (01:80:c2:00:00:0X) per * IEEE 802.1Q 8.6.3 Frame filtering. * * Please note: addr must be aligned to u16. 
*/ static inline bool is_link_local_ether_addr(const u8 *addr) { __be16 *a = (__be16 *)addr; static const __be16 *b = (const __be16 *)eth_reserved_addr_base; static const __be16 m = cpu_to_be16(0xfff0); #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return (((*(const u32 *)addr) ^ (*(const u32 *)b)) | (__force int)((a[2] ^ b[2]) & m)) == 0; #else return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0; #endif } /** * is_zero_ether_addr - Determine if give Ethernet address is all zeros. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is all zeroes. * * Please note: addr must be aligned to u16. */ static inline bool is_zero_ether_addr(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return ((*(const u32 *)addr) | (*(const u16 *)(addr + 4))) == 0; #else return (*(const u16 *)(addr + 0) | *(const u16 *)(addr + 2) | *(const u16 *)(addr + 4)) == 0; #endif } /** * is_multicast_ether_addr - Determine if the Ethernet address is a multicast. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is a multicast address. * By definition the broadcast address is also a multicast address. */ static inline bool is_multicast_ether_addr(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) u32 a = *(const u32 *)addr; #else u16 a = *(const u16 *)addr; #endif #ifdef __BIG_ENDIAN return 0x01 & (a >> ((sizeof(a) * 8) - 8)); #else return 0x01 & a; #endif } static inline bool is_multicast_ether_addr_64bits(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #ifdef __BIG_ENDIAN return 0x01 & ((*(const u64 *)addr) >> 56); #else return 0x01 & (*(const u64 *)addr); #endif #else return is_multicast_ether_addr(addr); #endif } /** * is_local_ether_addr - Determine if the Ethernet address is locally-assigned one (IEEE 802). * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is a local address. */ static inline bool is_local_ether_addr(const u8 *addr) { return 0x02 & addr[0]; } /** * is_broadcast_ether_addr - Determine if the Ethernet address is broadcast * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is the broadcast address. * * Please note: addr must be aligned to u16. */ static inline bool is_broadcast_ether_addr(const u8 *addr) { return (*(const u16 *)(addr + 0) & *(const u16 *)(addr + 2) & *(const u16 *)(addr + 4)) == 0xffff; } /** * is_unicast_ether_addr - Determine if the Ethernet address is unicast * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is a unicast address. */ static inline bool is_unicast_ether_addr(const u8 *addr) { return !is_multicast_ether_addr(addr); } /** * is_valid_ether_addr - Determine if the given Ethernet address is valid * @addr: Pointer to a six-byte array containing the Ethernet address * * Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not * a multicast address, and is not FF:FF:FF:FF:FF:FF. * * Return: true if the address is valid. * * Please note: addr must be aligned to u16. */ static inline bool is_valid_ether_addr(const u8 *addr) { /* FF:FF:FF:FF:FF:FF is a multicast address so we don't need to * explicitly check for it here. 
*/ return !is_multicast_ether_addr(addr) && !is_zero_ether_addr(addr); } /** * eth_proto_is_802_3 - Determine if a given Ethertype/length is a protocol * @proto: Ethertype/length value to be tested * * Check that the value from the Ethertype/length field is a valid Ethertype. * * Return: true if the valid is an 802.3 supported Ethertype. */ static inline bool eth_proto_is_802_3(__be16 proto) { #ifndef __BIG_ENDIAN /* if CPU is little endian mask off bits representing LSB */ proto &= htons(0xFF00); #endif /* cast both to u16 and compare since LSB can be ignored */ return (__force u16)proto >= (__force u16)htons(ETH_P_802_3_MIN); } /** * eth_random_addr - Generate software assigned random Ethernet address * @addr: Pointer to a six-byte array containing the Ethernet address * * Generate a random Ethernet address (MAC) that is not multicast * and has the local assigned bit set. */ static inline void eth_random_addr(u8 *addr) { get_random_bytes(addr, ETH_ALEN); addr[0] &= 0xfe; /* clear multicast bit */ addr[0] |= 0x02; /* set local assignment bit (IEEE802) */ } /** * eth_broadcast_addr - Assign broadcast address * @addr: Pointer to a six-byte array containing the Ethernet address * * Assign the broadcast address to the given address array. */ static inline void eth_broadcast_addr(u8 *addr) { memset(addr, 0xff, ETH_ALEN); } /** * eth_zero_addr - Assign zero address * @addr: Pointer to a six-byte array containing the Ethernet address * * Assign the zero address to the given address array. */ static inline void eth_zero_addr(u8 *addr) { memset(addr, 0x00, ETH_ALEN); } /** * eth_hw_addr_random - Generate software assigned random Ethernet and * set device flag * @dev: pointer to net_device structure * * Generate a random Ethernet address (MAC) to be used by a net device * and set addr_assign_type so the state can be read by sysfs and be * used by userspace. */ static inline void eth_hw_addr_random(struct net_device *dev) { u8 addr[ETH_ALEN]; eth_random_addr(addr); __dev_addr_set(dev, addr, ETH_ALEN); dev->addr_assign_type = NET_ADDR_RANDOM; } /** * eth_hw_addr_crc - Calculate CRC from netdev_hw_addr * @ha: pointer to hardware address * * Calculate CRC from a hardware address as basis for filter hashes. */ static inline u32 eth_hw_addr_crc(struct netdev_hw_addr *ha) { return ether_crc(ETH_ALEN, ha->addr); } /** * ether_addr_copy - Copy an Ethernet address * @dst: Pointer to a six-byte array Ethernet address destination * @src: Pointer to a six-byte array Ethernet address source * * Please note: dst & src must both be aligned to u16. */ static inline void ether_addr_copy(u8 *dst, const u8 *src) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) *(u32 *)dst = *(const u32 *)src; *(u16 *)(dst + 4) = *(const u16 *)(src + 4); #else u16 *a = (u16 *)dst; const u16 *b = (const u16 *)src; a[0] = b[0]; a[1] = b[1]; a[2] = b[2]; #endif } /** * eth_hw_addr_set - Assign Ethernet address to a net_device * @dev: pointer to net_device structure * @addr: address to assign * * Assign given address to the net_device, addr_assign_type is not changed. */ static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) { __dev_addr_set(dev, addr, ETH_ALEN); } /** * eth_hw_addr_inherit - Copy dev_addr from another net_device * @dst: pointer to net_device to copy dev_addr to * @src: pointer to net_device to copy dev_addr from * * Copy the Ethernet address from one net_device to another along with * the address attributes (addr_assign_type). 
*/ static inline void eth_hw_addr_inherit(struct net_device *dst, struct net_device *src) { dst->addr_assign_type = src->addr_assign_type; eth_hw_addr_set(dst, src->dev_addr); } /** * ether_addr_equal - Compare two Ethernet addresses * @addr1: Pointer to a six-byte array containing the Ethernet address * @addr2: Pointer other six-byte array containing the Ethernet address * * Compare two Ethernet addresses, returns true if equal * * Please note: addr1 & addr2 must both be aligned to u16. */ static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) u32 fold = ((*(const u32 *)addr1) ^ (*(const u32 *)addr2)) | ((*(const u16 *)(addr1 + 4)) ^ (*(const u16 *)(addr2 + 4))); return fold == 0; #else const u16 *a = (const u16 *)addr1; const u16 *b = (const u16 *)addr2; return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0; #endif } /** * ether_addr_equal_64bits - Compare two Ethernet addresses * @addr1: Pointer to an array of 8 bytes * @addr2: Pointer to an other array of 8 bytes * * Compare two Ethernet addresses, returns true if equal, false otherwise. * * The function doesn't need any conditional branches and possibly uses * word memory accesses on CPU allowing cheap unaligned memory reads. * arrays = { byte1, byte2, byte3, byte4, byte5, byte6, pad1, pad2 } * * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits. */ static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 fold = (*(const u64 *)addr1) ^ (*(const u64 *)addr2); #ifdef __BIG_ENDIAN return (fold >> 16) == 0; #else return (fold << 16) == 0; #endif #else return ether_addr_equal(addr1, addr2); #endif } /** * ether_addr_equal_unaligned - Compare two not u16 aligned Ethernet addresses * @addr1: Pointer to a six-byte array containing the Ethernet address * @addr2: Pointer other six-byte array containing the Ethernet address * * Compare two Ethernet addresses, returns true if equal * * Please note: Use only when any Ethernet address may not be u16 aligned. */ static inline bool ether_addr_equal_unaligned(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return ether_addr_equal(addr1, addr2); #else return memcmp(addr1, addr2, ETH_ALEN) == 0; #endif } /** * ether_addr_equal_masked - Compare two Ethernet addresses with a mask * @addr1: Pointer to a six-byte array containing the 1st Ethernet address * @addr2: Pointer to a six-byte array containing the 2nd Ethernet address * @mask: Pointer to a six-byte array containing the Ethernet address bitmask * * Compare two Ethernet addresses with a mask, returns true if for every bit * set in the bitmask the equivalent bits in the ethernet addresses are equal. * Using a mask with all bits set is a slower ether_addr_equal. 
*/ static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2, const u8 *mask) { int i; for (i = 0; i < ETH_ALEN; i++) { if ((addr1[i] ^ addr2[i]) & mask[i]) return false; } return true; } static inline bool ether_addr_is_ipv4_mcast(const u8 *addr) { u8 mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0x80, 0x00, 0x00 }; return ether_addr_equal_masked(addr, eth_ipv4_mcast_addr_base, mask); } static inline bool ether_addr_is_ipv6_mcast(const u8 *addr) { u8 mask[ETH_ALEN] = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }; return ether_addr_equal_masked(addr, eth_ipv6_mcast_addr_base, mask); } static inline bool ether_addr_is_ip_mcast(const u8 *addr) { return ether_addr_is_ipv4_mcast(addr) || ether_addr_is_ipv6_mcast(addr); } /** * ether_addr_to_u64 - Convert an Ethernet address into a u64 value. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: a u64 value of the address */ static inline u64 ether_addr_to_u64(const u8 *addr) { u64 u = 0; int i; for (i = 0; i < ETH_ALEN; i++) u = u << 8 | addr[i]; return u; } /** * u64_to_ether_addr - Convert a u64 to an Ethernet address. * @u: u64 to convert to an Ethernet MAC address * @addr: Pointer to a six-byte array to contain the Ethernet address */ static inline void u64_to_ether_addr(u64 u, u8 *addr) { int i; for (i = ETH_ALEN - 1; i >= 0; i--) { addr[i] = u & 0xff; u = u >> 8; } } /** * eth_addr_dec - Decrement the given MAC address * * @addr: Pointer to a six-byte array containing Ethernet address to decrement */ static inline void eth_addr_dec(u8 *addr) { u64 u = ether_addr_to_u64(addr); u--; u64_to_ether_addr(u, addr); } /** * eth_addr_inc() - Increment the given MAC address. * @addr: Pointer to a six-byte array containing Ethernet address to increment. */ static inline void eth_addr_inc(u8 *addr) { u64 u = ether_addr_to_u64(addr); u++; u64_to_ether_addr(u, addr); } /** * eth_addr_add() - Add (or subtract) an offset to/from the given MAC address. * * @offset: Offset to add. * @addr: Pointer to a six-byte array containing Ethernet address to increment. */ static inline void eth_addr_add(u8 *addr, long offset) { u64 u = ether_addr_to_u64(addr); u += offset; u64_to_ether_addr(u, addr); } /** * is_etherdev_addr - Tell if given Ethernet address belongs to the device. * @dev: Pointer to a device structure * @addr: Pointer to a six-byte array containing the Ethernet address * * Compare passed address with all addresses of the device. Return true if the * address if one of the device addresses. * * Note that this function calls ether_addr_equal_64bits() so take care of * the right padding. */ static inline bool is_etherdev_addr(const struct net_device *dev, const u8 addr[6 + 2]) { struct netdev_hw_addr *ha; bool res = false; rcu_read_lock(); for_each_dev_addr(dev, ha) { res = ether_addr_equal_64bits(addr, ha->addr); if (res) break; } rcu_read_unlock(); return res; } #endif /* __KERNEL__ */ /** * compare_ether_header - Compare two Ethernet headers * @a: Pointer to Ethernet header * @b: Pointer to Ethernet header * * Compare two Ethernet headers, returns 0 if equal. * This assumes that the network header (i.e., IP header) is 4-byte * aligned OR the platform can handle unaligned access. This is the * case for all packets coming into netif_receive_skb or similar * entry points. */ static inline unsigned long compare_ether_header(const void *a, const void *b) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 unsigned long fold; /* * We want to compare 14 bytes: * [a0 ... a13] ^ [b0 ... 
b13] * Use two long XOR, ORed together, with an overlap of two bytes. * [a0 a1 a2 a3 a4 a5 a6 a7 ] ^ [b0 b1 b2 b3 b4 b5 b6 b7 ] | * [a6 a7 a8 a9 a10 a11 a12 a13] ^ [b6 b7 b8 b9 b10 b11 b12 b13] * This means the [a6 a7] ^ [b6 b7] part is done two times. */ fold = *(unsigned long *)a ^ *(unsigned long *)b; fold |= *(unsigned long *)(a + 6) ^ *(unsigned long *)(b + 6); return fold; #else u32 *a32 = (u32 *)((u8 *)a + 2); u32 *b32 = (u32 *)((u8 *)b + 2); return (*(u16 *)a ^ *(u16 *)b) | (a32[0] ^ b32[0]) | (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]); #endif } /** * eth_hw_addr_gen - Generate and assign Ethernet address to a port * @dev: pointer to port's net_device structure * @base_addr: base Ethernet address * @id: offset to add to the base address * * Generate a MAC address using a base address and an offset and assign it * to a net_device. Commonly used by switch drivers which need to compute * addresses for all their ports. addr_assign_type is not changed. */ static inline void eth_hw_addr_gen(struct net_device *dev, const u8 *base_addr, unsigned int id) { u64 u = ether_addr_to_u64(base_addr); u8 addr[ETH_ALEN]; u += id; u64_to_ether_addr(u, addr); eth_hw_addr_set(dev, addr); } /** * eth_skb_pkt_type - Assign packet type if destination address does not match * @skb: Assigned a packet type if address does not match @dev address * @dev: Network device used to compare packet address against * * If the destination MAC address of the packet does not match the network * device address, assign an appropriate packet type. */ static inline void eth_skb_pkt_type(struct sk_buff *skb, const struct net_device *dev) { const struct ethhdr *eth = eth_hdr(skb); if (unlikely(!ether_addr_equal_64bits(eth->h_dest, dev->dev_addr))) { if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_MULTICAST; } else { skb->pkt_type = PACKET_OTHERHOST; } } } static inline struct ethhdr *eth_skb_pull_mac(struct sk_buff *skb) { struct ethhdr *eth = (struct ethhdr *)skb->data; skb_pull_inline(skb, ETH_HLEN); return eth; } /** * eth_skb_pad - Pad buffer to minimum number of octets for Ethernet frame * @skb: Buffer to pad * * An Ethernet frame should have a minimum size of 60 bytes. This function * takes short frames and pads them with zeros up to the 60 byte limit. */ static inline int eth_skb_pad(struct sk_buff *skb) { return skb_put_padto(skb, ETH_ZLEN); } #endif /* _LINUX_ETHERDEVICE_H */
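Taken together, the helpers above come down to a couple of bit tests on the first octet (bit 0 is the multicast/group bit, bit 1 the locally-administered bit) plus a 48-bit integer round-trip for address arithmetic. A minimal standalone C sketch of the same logic, written here purely for illustration (the names below are invented and are not part of etherdevice.h):

#include <stdio.h>
#include <stdint.h>

#define ETH_ALEN 6

/* Mirrors ether_addr_to_u64(): fold the six octets into one integer. */
static uint64_t addr_to_u64(const uint8_t *addr)
{
	uint64_t u = 0;

	for (int i = 0; i < ETH_ALEN; i++)
		u = u << 8 | addr[i];
	return u;
}

/* Mirrors u64_to_ether_addr(): unpack the low 48 bits back into octets. */
static void u64_to_addr(uint64_t u, uint8_t *addr)
{
	for (int i = ETH_ALEN - 1; i >= 0; i--) {
		addr[i] = u & 0xff;
		u >>= 8;
	}
}

int main(void)
{
	uint8_t mac[ETH_ALEN] = { 0x02, 0x00, 0x5e, 0x00, 0x00, 0xff };

	/* Bit 0 of octet 0: multicast (group) bit; bit 1: locally administered. */
	printf("multicast: %d\n", mac[0] & 0x01);
	printf("local:     %d\n", (mac[0] & 0x02) != 0);

	/* eth_addr_add()-style arithmetic: the increment carries across octets. */
	u64_to_addr(addr_to_u64(mac) + 1, mac);
	printf("next:      %02x:%02x:%02x:%02x:%02x:%02x\n",
	       mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
	return 0;
}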
// SPDX-License-Identifier: GPL-2.0 #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET; return call_fib_notifier(nb, event_type, info); } int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { ASSERT_RTNL(); info->family = AF_INET; /* Paired with READ_ONCE() in fib4_seq_read() */ WRITE_ONCE(net->ipv4.fib_seq, net->ipv4.fib_seq + 1); return call_fib_notifiers(net, event_type, info); } static unsigned int fib4_seq_read(const struct net *net) { /* Paired with WRITE_ONCE() in call_fib4_notifiers() */ return READ_ONCE(net->ipv4.fib_seq) + fib4_rules_seq_read(net); } static int fib4_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { int err; err = fib4_rules_dump(net, nb, extack); if (err) return err; return fib_notify(net, nb, extack); } static const struct fib_notifier_ops fib4_notifier_ops_template = { .family = AF_INET, .fib_seq_read = fib4_seq_read, .fib_dump = fib4_dump, .owner = THIS_MODULE, }; int __net_init fib4_notifier_init(struct net *net) { struct fib_notifier_ops *ops; net->ipv4.fib_seq = 0; ops = fib_notifier_ops_register(&fib4_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv4.notifier_ops = ops; return 0; } void __net_exit fib4_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv4.notifier_ops); }
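The fib_seq counter bumped in call_fib4_notifiers() lets a listener that is dumping the current FIB state detect whether notifications raced with its dump: snapshot the sequence, replay the state, read the sequence again, and retry on mismatch. A schematic of that retry pattern in plain C, outside the kernel (the helpers below are invented stand-ins, not kernel API):

#include <stdio.h>

/* Toy stand-ins for net->ipv4.fib_seq and the dump callback. */
static unsigned int fib_seq;

static unsigned int seq_read(void) { return fib_seq; }
static void do_dump(void)          { /* replay current FIB entries to the listener */ }

/* Retry the dump until no notifier fired while it was in progress. */
static void dump_until_stable(void)
{
	unsigned int before, after;

	do {
		before = seq_read();
		do_dump();
		after = seq_read();
	} while (before != after);	/* a change means an event may have been missed */
}

int main(void)
{
	dump_until_stable();
	printf("dump consistent at seq %u\n", fib_seq);
	return 0;
}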
// SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * * This file is part of the SCTP kernel implementation * * These are the state tables for the SCTP state machine. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Hui Huang <hui.huang@nokia.com> * Daisy Chang <daisyc@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * Sridhar Samudrala <sri@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/skbuff.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> static const struct sctp_sm_table_entry primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES]; static const struct sctp_sm_table_entry other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES]; static const struct sctp_sm_table_entry timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES]; static const struct sctp_sm_table_entry *sctp_chunk_event_lookup( struct net *net, enum sctp_cid cid, enum sctp_state state); static const struct sctp_sm_table_entry bug = { .fn = sctp_sf_bug, .name = "sctp_sf_bug" }; #define DO_LOOKUP(_max, _type, _table) \ ({ \ const struct sctp_sm_table_entry *rtn; \ \ if ((event_subtype._type > (_max))) { \ pr_warn("table %p possible attack: event %d exceeds max %d\n", \ _table, event_subtype._type, _max); \ rtn = &bug; \ } else \ rtn = &_table[event_subtype._type][(int)state]; \ \ rtn; \ }) const struct sctp_sm_table_entry *sctp_sm_lookup_event( struct net *net, enum sctp_event_type event_type, enum sctp_state state, union sctp_subtype event_subtype) { switch (event_type) { case SCTP_EVENT_T_CHUNK: return sctp_chunk_event_lookup(net, event_subtype.chunk, state); case SCTP_EVENT_T_TIMEOUT: return DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout, timeout_event_table); case SCTP_EVENT_T_OTHER: return DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other, other_event_table); case SCTP_EVENT_T_PRIMITIVE: return DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive, primitive_event_table); default: /* Yikes! We got an illegal event type.
*/ return &bug; } } #define TYPE_SCTP_FUNC(func) {.fn = func, .name = #func} #define TYPE_SCTP_DATA { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_data_6_2), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_eat_data_6_2), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_eat_data_fast_4_4), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_DATA */ #define TYPE_SCTP_INIT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_1B_init), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_1_siminit), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_1_siminit), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_reshutack), \ } /* TYPE_SCTP_INIT */ #define TYPE_SCTP_INIT_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_3_initack), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_1C_ack), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_INIT_ACK */ #define TYPE_SCTP_SACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_SACK */ #define TYPE_SCTP_HEARTBEAT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ /* This should not happen, but we are nice. 
*/ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ } /* TYPE_SCTP_HEARTBEAT */ #define TYPE_SCTP_HEARTBEAT_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_violation), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_HEARTBEAT_ACK */ #define TYPE_SCTP_ABORT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_pdiscard), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_abort), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_abort), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_1_abort), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_pending_abort), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_sent_abort), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_1_abort), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_ack_sent_abort), \ } /* TYPE_SCTP_ABORT */ #define TYPE_SCTP_SHUTDOWN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown_ack), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shut_ctsn), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_SHUTDOWN */ #define TYPE_SCTP_SHUTDOWN_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_8_5_1_E_sa), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_8_5_1_E_sa), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_violation), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_violation), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_final), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_violation), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_final), \ } /* TYPE_SCTP_SHUTDOWN_ACK */ #define TYPE_SCTP_ERROR { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_err), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_operr_notify), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_operr_notify), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_operr_notify), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ERROR */ #define TYPE_SCTP_COOKIE_ECHO { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_1D_ce), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_COOKIE_ECHOED */ \ 
TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ } /* TYPE_SCTP_COOKIE_ECHO */ #define TYPE_SCTP_COOKIE_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_1E_ca), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_COOKIE_ACK */ #define TYPE_SCTP_ECN_ECNE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ECN_ECNE */ #define TYPE_SCTP_ECN_CWR { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ECN_CWR */ #define TYPE_SCTP_SHUTDOWN_COMPLETE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_4_C), \ } /* TYPE_SCTP_SHUTDOWN_COMPLETE */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. * * For base protocol (RFC 2960). 
*/ static const struct sctp_sm_table_entry chunk_event_table[SCTP_NUM_BASE_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_DATA, TYPE_SCTP_INIT, TYPE_SCTP_INIT_ACK, TYPE_SCTP_SACK, TYPE_SCTP_HEARTBEAT, TYPE_SCTP_HEARTBEAT_ACK, TYPE_SCTP_ABORT, TYPE_SCTP_SHUTDOWN, TYPE_SCTP_SHUTDOWN_ACK, TYPE_SCTP_ERROR, TYPE_SCTP_COOKIE_ECHO, TYPE_SCTP_COOKIE_ACK, TYPE_SCTP_ECN_ECNE, TYPE_SCTP_ECN_CWR, TYPE_SCTP_SHUTDOWN_COMPLETE, }; /* state_fn_t chunk_event_table[][] */ #define TYPE_SCTP_ASCONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ASCONF */ #define TYPE_SCTP_ASCONF_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ASCONF_ACK */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. */ static const struct sctp_sm_table_entry addip_chunk_event_table[SCTP_NUM_ADDIP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_ASCONF, TYPE_SCTP_ASCONF_ACK, }; /*state_fn_t addip_chunk_event_table[][] */ #define TYPE_SCTP_FWD_TSN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn_fast), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_FWD_TSN */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. 
*/ static const struct sctp_sm_table_entry prsctp_chunk_event_table[SCTP_NUM_PRSCTP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_FWD_TSN, }; /*state_fn_t prsctp_chunk_event_table[][] */ #define TYPE_SCTP_RECONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_reconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_reconf), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_RECONF */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. */ static const struct sctp_sm_table_entry reconf_chunk_event_table[SCTP_NUM_RECONF_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_RECONF, }; /*state_fn_t reconf_chunk_event_table[][] */ #define TYPE_SCTP_AUTH { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ } /* TYPE_SCTP_AUTH */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. 
*/ static const struct sctp_sm_table_entry auth_chunk_event_table[SCTP_NUM_AUTH_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_AUTH, }; /*state_fn_t auth_chunk_event_table[][] */ static const struct sctp_sm_table_entry pad_chunk_event_table[SCTP_STATE_NUM_STATES] = { /* SCTP_STATE_CLOSED */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_COOKIE_WAIT */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_COOKIE_ECHOED */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_ESTABLISHED */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_SHUTDOWN_PENDING */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_SHUTDOWN_SENT */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_SHUTDOWN_RECEIVED */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_SHUTDOWN_ACK_SENT */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), }; /* chunk pad */ static const struct sctp_sm_table_entry chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = { /* SCTP_STATE_CLOSED */ TYPE_SCTP_FUNC(sctp_sf_ootb), /* SCTP_STATE_COOKIE_WAIT */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_COOKIE_ECHOED */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_ESTABLISHED */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_SHUTDOWN_PENDING */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_SHUTDOWN_SENT */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_SHUTDOWN_RECEIVED */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_SHUTDOWN_ACK_SENT */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), }; /* chunk unknown */ #define TYPE_SCTP_PRIMITIVE_ASSOCIATE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asoc), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ } /* TYPE_SCTP_PRIMITIVE_ASSOCIATE */ #define TYPE_SCTP_PRIMITIVE_SHUTDOWN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_prm_shutdown), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_prm_shutdown),\ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_prm_shutdown), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ } /* TYPE_SCTP_PRIMITIVE_SHUTDOWN */ #define TYPE_SCTP_PRIMITIVE_ABORT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_prm_abort), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_prm_abort), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_1_prm_abort), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_pending_prm_abort), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_sent_prm_abort), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_1_prm_abort), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_ack_sent_prm_abort), \ } /* TYPE_SCTP_PRIMITIVE_ABORT 
*/ #define TYPE_SCTP_PRIMITIVE_SEND { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ } /* TYPE_SCTP_PRIMITIVE_SEND */ #define TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ } /* TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT */ #define TYPE_SCTP_PRIMITIVE_ASCONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ } /* TYPE_SCTP_PRIMITIVE_ASCONF */ #define TYPE_SCTP_PRIMITIVE_RECONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ } /* TYPE_SCTP_PRIMITIVE_RECONF */ /* The primary index for this table is the primitive type. * The secondary index for this table is the state. 
*/ static const struct sctp_sm_table_entry primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_PRIMITIVE_ASSOCIATE, TYPE_SCTP_PRIMITIVE_SHUTDOWN, TYPE_SCTP_PRIMITIVE_ABORT, TYPE_SCTP_PRIMITIVE_SEND, TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT, TYPE_SCTP_PRIMITIVE_ASCONF, TYPE_SCTP_PRIMITIVE_RECONF, }; #define TYPE_SCTP_OTHER_NO_PENDING_TSN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_no_pending_tsn), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_start_shutdown), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown_ack), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ } #define TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_icmp_abort), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ } static const struct sctp_sm_table_entry other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_OTHER_NO_PENDING_TSN, TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH, }; #define TYPE_SCTP_EVENT_TIMEOUT_NONE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_t1_cookie_timer_expire), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T1_INIT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_t1_init_timer_expire), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* 
SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_t2_timer_expire), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_t2_timer_expire), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T3_RTX { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T4_RTO { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_t4_timer_expire), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_SACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ 
TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_autoclose_timer_expire), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_RECONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_send_reconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_PROBE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_send_probe), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } static const struct sctp_sm_table_entry timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_EVENT_TIMEOUT_NONE, TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE, TYPE_SCTP_EVENT_TIMEOUT_T1_INIT, TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN, TYPE_SCTP_EVENT_TIMEOUT_T3_RTX, TYPE_SCTP_EVENT_TIMEOUT_T4_RTO, TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD, TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT, TYPE_SCTP_EVENT_TIMEOUT_RECONF, TYPE_SCTP_EVENT_TIMEOUT_PROBE, TYPE_SCTP_EVENT_TIMEOUT_SACK, TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE, }; static const struct sctp_sm_table_entry *sctp_chunk_event_lookup( struct net *net, enum sctp_cid cid, enum sctp_state state) { if (state > SCTP_STATE_MAX) return &bug; if (cid == SCTP_CID_I_DATA) cid = SCTP_CID_DATA; if (cid <= SCTP_CID_BASE_MAX) return &chunk_event_table[cid][state]; switch ((u16)cid) { case SCTP_CID_FWD_TSN: case SCTP_CID_I_FWD_TSN: return &prsctp_chunk_event_table[0][state]; case SCTP_CID_ASCONF: return &addip_chunk_event_table[0][state]; case SCTP_CID_ASCONF_ACK: return &addip_chunk_event_table[1][state]; case SCTP_CID_RECONF: return &reconf_chunk_event_table[0][state]; case SCTP_CID_AUTH: return &auth_chunk_event_table[0][state]; case SCTP_CID_PAD: return &pad_chunk_event_table[state]; } return &chunk_event_table_unknown[state]; }
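Everything in this file reduces to a two-level dispatch: the event type (chunk, primitive, timeout or other) selects a row, the association state selects a column, and the stored function pointer plus name is what the state machine invokes, with &bug as the out-of-range fallback. A stripped-down standalone C sketch of that table-lookup pattern (the states, events and handlers below are invented for illustration):

#include <stdio.h>

enum state { ST_CLOSED, ST_ESTABLISHED, ST_NUM_STATES };
enum event { EV_DATA, EV_SHUTDOWN, EV_NUM_EVENTS };

struct sm_entry {
	void (*fn)(void);
	const char *name;
};

static void do_discard(void)  { puts("discard chunk"); }
static void do_deliver(void)  { puts("deliver data"); }
static void do_shutdown(void) { puts("start shutdown"); }

#define ENTRY(f) { .fn = f, .name = #f }

/* Row = event type, column = state; anything unlisted stays zeroed. */
static const struct sm_entry table[EV_NUM_EVENTS][ST_NUM_STATES] = {
	[EV_DATA]     = { [ST_CLOSED] = ENTRY(do_discard), [ST_ESTABLISHED] = ENTRY(do_deliver)  },
	[EV_SHUTDOWN] = { [ST_CLOSED] = ENTRY(do_discard), [ST_ESTABLISHED] = ENTRY(do_shutdown) },
};

/* Mirrors the "&bug" fallback: out-of-range indices get a safe default. */
static const struct sm_entry *lookup(enum event ev, enum state st)
{
	static const struct sm_entry bug = ENTRY(do_discard);

	if (ev >= EV_NUM_EVENTS || st >= ST_NUM_STATES)
		return &bug;
	return &table[ev][st];
}

int main(void)
{
	const struct sm_entry *e = lookup(EV_DATA, ST_ESTABLISHED);

	printf("running %s\n", e->name);
	e->fn();
	return 0;
}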
/* SPDX-License-Identifier: GPL-2.0 */ /* * This file defines the new driver API for Wireless Extensions * * Version : 8 16.3.07 * * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com> * Copyright (c) 2001-2007 Jean Tourrilhes, All Rights Reserved. */ #ifndef _IW_HANDLER_H #define _IW_HANDLER_H /************************** DOCUMENTATION **************************/ /* * Initial driver API (1996 -> onward) : * ----------------------------------- * The initial API just sends the IOCTL request received from user space * to the driver (via the driver ioctl handler). The driver has to * handle all the rest... * * The initial API also defines a specific handler in struct net_device * to handle wireless statistics. * * The initial API served us well and has proven a reasonably good design. * However, there are a few shortcomings : * o No events, everything is a request to the driver. * o Large ioctl function in driver with gigantic switch statement * (i.e. spaghetti code). * o Driver has to deal with copy_to/from_user, and in many cases * does it improperly. Common mistakes are : * * buffer overflows (no checks or off by one checks) * * call copy_to/from_user with irq disabled * o The user space interface is tied to ioctl because of the use of * copy_to/from_user. * * New driver API (2002 -> onward) : * ------------------------------- * The new driver API is just a bunch of standard functions (handlers), * each handling a specific Wireless Extension. The driver just exports * the list of handlers it supports, and those will be called appropriately.
* * I tried to keep the main advantage of the previous API (simplicity, * efficiency and light weight), and also I provide a good dose of backward * compatibility (most structures are the same, driver can use both API * simultaneously, ...). * Hopefully, I've also addressed the shortcoming of the initial API. * * The advantage of the new API are : * o Handling of Extensions in driver broken in small contained functions * o Tighter checks of ioctl before calling the driver * o Flexible commit strategy (at least, the start of it) * o Backward compatibility (can be mixed with old API) * o Driver doesn't have to worry about memory and user-space issues * The last point is important for the following reasons : * o You are now able to call the new driver API from any API you * want (including from within other parts of the kernel). * o Common mistakes are avoided (buffer overflow, user space copy * with irq disabled and so on). * * The Drawback of the new API are : * o bloat (especially kernel) * o need to migrate existing drivers to new API * My initial testing shows that the new API adds around 3kB to the kernel * and save between 0 and 5kB from a typical driver. * Also, as all structures and data types are unchanged, the migration is * quite straightforward (but tedious). * * --- * * The new driver API is defined below in this file. User space should * not be aware of what's happening down there... * * A new kernel wrapper is in charge of validating the IOCTLs and calling * the appropriate driver handler. This is implemented in : * # net/core/wireless.c * * The driver export the list of handlers in : * # include/linux/netdevice.h (one place) * * The new driver API is available for WIRELESS_EXT >= 13. * Good luck with migration to the new API ;-) */ /* ---------------------- THE IMPLEMENTATION ---------------------- */ /* * Some of the choice I've made are pretty controversial. Defining an * API is very much weighting compromises. This goes into some of the * details and the thinking behind the implementation. * * Implementation goals : * -------------------- * The implementation goals were as follow : * o Obvious : you should not need a PhD to understand what's happening, * the benefit is easier maintenance. * o Flexible : it should accommodate a wide variety of driver * implementations and be as flexible as the old API. * o Lean : it should be efficient memory wise to minimise the impact * on kernel footprint. * o Transparent to user space : the large number of user space * applications that use Wireless Extensions should not need * any modifications. * * Array of functions versus Struct of functions * --------------------------------------------- * 1) Having an array of functions allow the kernel code to access the * handler in a single lookup, which is much more efficient (think hash * table here). * 2) The only drawback is that driver writer may put their handler in * the wrong slot. This is trivial to test (I set the frequency, the * bitrate changes). Once the handler is in the proper slot, it will be * there forever, because the array is only extended at the end. * 3) Backward/forward compatibility : adding new handler just require * extending the array, so you can put newer driver in older kernel * without having to patch the kernel code (and vice versa). * * All handler are of the same generic type * ---------------------------------------- * That's a feature !!! * 1) Having a generic handler allow to have generic code, which is more * efficient. 
If each of the handler was individually typed I would need * to add a big switch in the kernel (== more bloat). This solution is * more scalable, adding new Wireless Extensions doesn't add new code. * 2) You can use the same handler in different slots of the array. For * hardware, it may be more efficient or logical to handle multiple * Wireless Extensions with a single function, and the API allow you to * do that. (An example would be a single record on the card to control * both bitrate and frequency, the handler would read the old record, * modify it according to info->cmd and rewrite it). * * Functions prototype uses union iwreq_data * ----------------------------------------- * Some would have preferred functions defined this way : * static int mydriver_ioctl_setrate(struct net_device *dev, * long rate, int auto) * 1) The kernel code doesn't "validate" the content of iwreq_data, and * can't do it (different hardware may have different notion of what a * valid frequency is), so we don't pretend that we do it. * 2) The above form is not extendable. If I want to add a flag (for * example to distinguish setting max rate and basic rate), I would * break the prototype. Using iwreq_data is more flexible. * 3) Also, the above form is not generic (see above). * 4) I don't expect driver developer using the wrong field of the * union (Doh !), so static typechecking doesn't add much value. * 5) Lastly, you can skip the union by doing : * static int mydriver_ioctl_setrate(struct net_device *dev, * struct iw_request_info *info, * struct iw_param *rrq, * char *extra) * And then adding the handler in the array like this : * (iw_handler) mydriver_ioctl_setrate, // SIOCSIWRATE * * Using functions and not a registry * ---------------------------------- * Another implementation option would have been for every instance to * define a registry (a struct containing all the Wireless Extensions) * and only have a function to commit the registry to the hardware. * 1) This approach can be emulated by the current code, but not * vice versa. * 2) Some drivers don't keep any configuration in the driver, for them * adding such a registry would be a significant bloat. * 3) The code to translate from Wireless Extension to native format is * needed anyway, so it would not reduce significantely the amount of code. * 4) The current approach only selectively translate Wireless Extensions * to native format and only selectively set, whereas the registry approach * would require to translate all WE and set all parameters for any single * change. * 5) For many Wireless Extensions, the GET operation return the current * dynamic value, not the value that was set. * * This header is <net/iw_handler.h> * --------------------------------- * 1) This header is kernel space only and should not be exported to * user space. Headers in "include/linux/" are exported, headers in * "include/net/" are not. * * Mixed 32/64 bit issues * ---------------------- * The Wireless Extensions are designed to be 64 bit clean, by using only * datatypes with explicit storage size. * There are some issues related to kernel and user space using different * memory model, and in particular 64bit kernel with 32bit user space. * The problem is related to struct iw_point, that contains a pointer * that *may* need to be translated. * This is quite messy. The new API doesn't solve this problem (it can't), * but is a step in the right direction : * 1) Meta data about each ioctl is easily available, so we know what type * of translation is needed. 
* 2) The move of data between kernel and user space is only done in a single * place in the kernel, so adding specific hooks in there is possible. * 3) In the long term, it allows to move away from using ioctl as the * user space API. * * So many comments and so few code * -------------------------------- * That's a feature. Comments won't bloat the resulting kernel binary. */ /***************************** INCLUDES *****************************/ #include <linux/wireless.h> /* IOCTL user space API */ #include <linux/if_ether.h> /***************************** VERSION *****************************/ /* * This constant is used to know which version of the driver API is * available. Hopefully, this will be pretty stable and no changes * will be needed... * I just plan to increment with each new version. */ #define IW_HANDLER_VERSION 8 /* * Changes : * * V2 to V3 * -------- * - Move event definition in <linux/wireless.h> * - Add Wireless Event support : * o wireless_send_event() prototype * o iwe_stream_add_event/point() inline functions * V3 to V4 * -------- * - Reshuffle IW_HEADER_TYPE_XXX to map IW_PRIV_TYPE_XXX changes * * V4 to V5 * -------- * - Add new spy support : struct iw_spy_data & prototypes * * V5 to V6 * -------- * - Change the way we get to spy_data method for added safety * - Remove spy #ifdef, they are always on -> cleaner code * - Add IW_DESCR_FLAG_NOMAX flag for very large requests * - Start migrating get_wireless_stats to struct iw_handler_def * * V6 to V7 * -------- * - Add struct ieee80211_device pointer in struct iw_public_data * - Remove (struct iw_point *)->pointer from events and streams * - Remove spy_offset from struct iw_handler_def * - Add "check" version of event macros for ieee802.11 stack * * V7 to V8 * ---------- * - Prevent leaking of kernel space in stream on 64 bits. */ /**************************** CONSTANTS ****************************/ /* Enhanced spy support available */ #define IW_WIRELESS_SPY #define IW_WIRELESS_THRSPY /* Special error message for the driver to indicate that we * should do a commit after return from the iw_handler */ #define EIWCOMMIT EINPROGRESS /* Flags available in struct iw_request_info */ #define IW_REQUEST_FLAG_COMPAT 0x0001 /* Compat ioctl call */ /* Type of headers we know about (basically union iwreq_data) */ #define IW_HEADER_TYPE_NULL 0 /* Not available */ #define IW_HEADER_TYPE_CHAR 2 /* char [IFNAMSIZ] */ #define IW_HEADER_TYPE_UINT 4 /* __u32 */ #define IW_HEADER_TYPE_FREQ 5 /* struct iw_freq */ #define IW_HEADER_TYPE_ADDR 6 /* struct sockaddr */ #define IW_HEADER_TYPE_POINT 8 /* struct iw_point */ #define IW_HEADER_TYPE_PARAM 9 /* struct iw_param */ #define IW_HEADER_TYPE_QUAL 10 /* struct iw_quality */ /* Handling flags */ /* Most are not implemented. I just use them as a reminder of some * cool features we might need one day ;-) */ #define IW_DESCR_FLAG_NONE 0x0000 /* Obvious */ /* Wrapper level flags */ #define IW_DESCR_FLAG_DUMP 0x0001 /* Not part of the dump command */ #define IW_DESCR_FLAG_EVENT 0x0002 /* Generate an event on SET */ #define IW_DESCR_FLAG_RESTRICT 0x0004 /* GET : request is ROOT only */ /* SET : Omit payload from generated iwevent */ #define IW_DESCR_FLAG_NOMAX 0x0008 /* GET : no limit on request size */ /****************************** TYPES ******************************/ /* ----------------------- WIRELESS HANDLER ----------------------- */ /* * A wireless handler is just a standard function, that looks like the * ioctl handler. * We also define there how a handler list look like... 
As the Wireless * Extension space is quite dense, we use a simple array, which is faster * (that's the perfect hash table ;-). */ /* * Meta data about the request passed to the iw_handler. * Most handlers can safely ignore what's in there. * The 'cmd' field might come handy if you want to use the same handler * for multiple command... * This struct is also my long term insurance. I can add new fields here * without breaking the prototype of iw_handler... */ struct iw_request_info { __u16 cmd; /* Wireless Extension command */ __u16 flags; /* More to come ;-) */ }; struct net_device; /* * This is how a function handling a Wireless Extension should look * like (both get and set, standard and private). */ typedef int (*iw_handler)(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra); /* * This define all the handler that the driver export. * As you need only one per driver type, please use a static const * shared by all driver instances... Same for the members... * This will be linked from net_device in <linux/netdevice.h> */ struct iw_handler_def { /* Array of handlers for standard ioctls * We will call dev->wireless_handlers->standard[ioctl - SIOCIWFIRST] */ const iw_handler * standard; /* Number of handlers defined (more precisely, index of the * last defined handler + 1) */ __u16 num_standard; #ifdef CONFIG_WEXT_PRIV __u16 num_private; /* Number of private arg description */ __u16 num_private_args; /* Array of handlers for private ioctls * Will call dev->wireless_handlers->private[ioctl - SIOCIWFIRSTPRIV] */ const iw_handler * private; /* Arguments of private handler. This one is just a list, so you * can put it in any order you want and should not leave holes... * We will automatically export that to user space... */ const struct iw_priv_args * private_args; #endif /* New location of get_wireless_stats, to de-bloat struct net_device. * The old pointer in struct net_device will be gradually phased * out, and drivers are encouraged to use this one... */ struct iw_statistics* (*get_wireless_stats)(struct net_device *dev); }; /* ---------------------- IOCTL DESCRIPTION ---------------------- */ /* * One of the main goal of the new interface is to deal entirely with * user space/kernel space memory move. * For that, we need to know : * o if iwreq is a pointer or contain the full data * o what is the size of the data to copy * * For private IOCTLs, we use the same rules as used by iwpriv and * defined in struct iw_priv_args. * * For standard IOCTLs, things are quite different and we need to * use the structures below. Actually, this struct is also more * efficient, but that's another story... */ /* * Describe how a standard IOCTL looks like. */ struct iw_ioctl_description { __u8 header_type; /* NULL, iw_point or other */ __u8 flags; /* Special handling of the request */ __u16 token_size; /* Granularity of payload */ __u16 min_tokens; /* Min acceptable token number */ __u16 max_tokens; /* Max acceptable token number */ }; /* Need to think of short header translation table. Later. */ /* --------------------- ENHANCED SPY SUPPORT --------------------- */ /* * In the old days, the driver was handling spy support all by itself. * Now, the driver can delegate this task to Wireless Extensions. * It needs to include this struct in its private part and use the * standard spy iw_handler. */ /* * Instance specific spy data, i.e. addresses spied and quality for them. 
*/ struct iw_spy_data { /* --- Standard spy support --- */ int spy_number; u_char spy_address[IW_MAX_SPY][ETH_ALEN]; struct iw_quality spy_stat[IW_MAX_SPY]; /* --- Enhanced spy support (event) */ struct iw_quality spy_thr_low; /* Low threshold */ struct iw_quality spy_thr_high; /* High threshold */ u_char spy_thr_under[IW_MAX_SPY]; }; /**************************** PROTOTYPES ****************************/ /* * Functions part of the Wireless Extensions (defined in net/wireless/wext-core.c). * Those may be called by driver modules. */ /* Send a single event to user space */ void wireless_send_event(struct net_device *dev, unsigned int cmd, union iwreq_data *wrqu, const char *extra); #ifdef CONFIG_WEXT_CORE /* flush all previous wext events - if work is done from netdev notifiers */ void wireless_nlevent_flush(void); #else static inline void wireless_nlevent_flush(void) {} #endif /* We may need a function to send a stream of events to user space. * More on that later... */ /************************* INLINE FUNCTIONS *************************/ /* * Function that are so simple that it's more efficient inlining them */ static inline int iwe_stream_lcp_len(struct iw_request_info *info) { #ifdef CONFIG_COMPAT if (info->flags & IW_REQUEST_FLAG_COMPAT) return IW_EV_COMPAT_LCP_LEN; #endif return IW_EV_LCP_LEN; } static inline int iwe_stream_point_len(struct iw_request_info *info) { #ifdef CONFIG_COMPAT if (info->flags & IW_REQUEST_FLAG_COMPAT) return IW_EV_COMPAT_POINT_LEN; #endif return IW_EV_POINT_LEN; } static inline int iwe_stream_event_len_adjust(struct iw_request_info *info, int event_len) { #ifdef CONFIG_COMPAT if (info->flags & IW_REQUEST_FLAG_COMPAT) { event_len -= IW_EV_LCP_LEN; event_len += IW_EV_COMPAT_LCP_LEN; } #endif return event_len; } /*------------------------------------------------------------------*/ /* * Wrapper to add an Wireless Event to a stream of events. */ char *iwe_stream_add_event(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, int event_len); static inline char * iwe_stream_add_event_check(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, int event_len) { char *res = iwe_stream_add_event(info, stream, ends, iwe, event_len); if (res == stream) return ERR_PTR(-E2BIG); return res; } /*------------------------------------------------------------------*/ /* * Wrapper to add an short Wireless Event containing a pointer to a * stream of events. */ char *iwe_stream_add_point(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, char *extra); static inline char * iwe_stream_add_point_check(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, char *extra) { char *res = iwe_stream_add_point(info, stream, ends, iwe, extra); if (res == stream) return ERR_PTR(-E2BIG); return res; } /*------------------------------------------------------------------*/ /* * Wrapper to add a value to a Wireless Event in a stream of events. * Be careful, this one is tricky to use properly : * At the first run, you need to have (value = event + IW_EV_LCP_LEN). */ char *iwe_stream_add_value(struct iw_request_info *info, char *event, char *value, char *ends, struct iw_event *iwe, int event_len); #endif /* _IW_HANDLER_H */
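/*
 * Editor's illustrative sketch (not from this header): how a driver might
 * export one standard handler through struct iw_handler_def. The driver name
 * and handler body are hypothetical; the array is indexed by
 * "ioctl - SIOCIWFIRST", as documented above for the 'standard' array.
 */
static int mydrv_get_name(struct net_device *dev,
			  struct iw_request_info *info,
			  union iwreq_data *wrqu, char *extra)
{
	/* SIOCGIWNAME only fills wrqu->name; no user-space copy is needed here. */
	strscpy(wrqu->name, "IEEE 802.11", sizeof(wrqu->name));
	return 0;
}

static const iw_handler mydrv_handlers[] = {
	[SIOCGIWNAME - SIOCIWFIRST] = mydrv_get_name,
};

static const struct iw_handler_def mydrv_handler_def = {
	.standard	= mydrv_handlers,
	.num_standard	= ARRAY_SIZE(mydrv_handlers),
};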
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Support for INET6 connection oriented protocols. * * Authors: See the TCPv6 sources */ #include <linux/module.h> #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/jhash.h> #include <linux/slab.h> #include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_ecn.h> #include <net/inet_hashtables.h> #include <net/ip6_route.h> #include <net/sock.h> #include <net/inet6_connection_sock.h> #include <net/sock_reuseport.h> struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 *fl6, const struct request_sock *req, u8 proto) { struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *final_p, final; struct dst_entry *dst; memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = proto; fl6->daddr = ireq->ir_v6_rmt_addr; rcu_read_lock(); final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); rcu_read_unlock(); fl6->saddr = ireq->ir_v6_loc_addr; fl6->flowi6_oif = ireq->ir_iif; fl6->flowi6_mark = ireq->ir_mark; fl6->fl6_dport = ireq->ir_rmt_port; fl6->fl6_sport = htons(ireq->ir_num); fl6->flowi6_uid = sk_uid(sk); security_req_classify_flow(req, flowi6_to_flowi_common(fl6)); dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (IS_ERR(dst)) return NULL; return dst; } static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) { return __sk_dst_check(sk, cookie); } static struct dst_entry *inet6_csk_route_socket(struct sock *sk, struct flowi6 *fl6) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *final_p, final; struct dst_entry *dst; memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = sk->sk_protocol; fl6->daddr = sk->sk_v6_daddr; fl6->saddr = np->saddr; fl6->flowlabel = np->flow_label; IP6_ECN_flow_xmit(sk, fl6->flowlabel); fl6->flowi6_oif = sk->sk_bound_dev_if; fl6->flowi6_mark = sk->sk_mark; fl6->fl6_sport = inet->inet_sport; fl6->fl6_dport = inet->inet_dport; fl6->flowi6_uid = sk_uid(sk); security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); rcu_read_lock(); final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); rcu_read_unlock(); dst = __inet6_csk_dst_check(sk, np->dst_cookie); if (!dst) { dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!IS_ERR(dst)) ip6_dst_store(sk, dst, false, false); } return dst; } int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused) { struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 fl6; struct dst_entry *dst; int res; dst = inet6_csk_route_socket(sk, &fl6); if (IS_ERR(dst)) { WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst)); sk->sk_route_caps = 0; kfree_skb(skb); return PTR_ERR(dst); } rcu_read_lock(); skb_dst_set_noref(skb, dst); /* Restore final destination back after routing done */ fl6.daddr = sk->sk_v6_daddr; res =
ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt), np->tclass, READ_ONCE(sk->sk_priority)); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(inet6_csk_xmit); struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) { struct flowi6 fl6; struct dst_entry *dst = inet6_csk_route_socket(sk, &fl6); if (IS_ERR(dst)) return NULL; dst->ops->update_pmtu(dst, sk, NULL, mtu, true); dst = inet6_csk_route_socket(sk, &fl6); return IS_ERR(dst) ? NULL : dst; }
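/*
 * Editor's illustrative sketch (assumed usage, not part of this file): a
 * connection-oriented IPv6 protocol typically wires inet6_csk_xmit() into its
 * inet_connection_sock_af_ops, so every transmitted skb goes through the
 * cached-route check in inet6_csk_route_socket(). The ops name is hypothetical
 * and only the transmit hook is shown.
 */
static const struct inet_connection_sock_af_ops example_ipv6_af_ops = {
	.queue_xmit = inet6_csk_xmit,
	/* other callbacks omitted in this sketch */
};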
// SPDX-License-Identifier: GPL-2.0-or-later /* * Fast Userspace Mutexes (which I call "Futexes!"). * (C) Rusty Russell, IBM 2002 * * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar * (C) Copyright 2003 Red Hat Inc, All Rights Reserved * * Removed page pinning, fix privately mapped COW pages and other cleanups * (C) Copyright 2003, 2004 Jamie Lokier * * Robust futex support started by Ingo Molnar * (C) Copyright 2006 Red Hat Inc, All Rights Reserved * Thanks to Thomas Gleixner for suggestions, analysis and fixes. * * PI-futex support started by Ingo Molnar and Thomas Gleixner * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> * * PRIVATE futexes by Eric Dumazet * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> * * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> * Copyright (C) IBM Corporation, 2009 * Thanks to Thomas Gleixner for conceptual design and careful reviews. * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. * * "The futexes are also cursed." * "But they come in a choice of three flavours!"
*/ #include <linux/compat.h> #include <linux/jhash.h> #include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/plist.h> #include <linux/gfp.h> #include <linux/vmalloc.h> #include <linux/memblock.h> #include <linux/fault-inject.h> #include <linux/slab.h> #include <linux/prctl.h> #include <linux/mempolicy.h> #include <linux/mmap_lock.h> #include "futex.h" #include "../locking/rtmutex_common.h" /* * The base of the bucket array and its size are always used together * (after initialization only in futex_hash()), so ensure that they * reside in the same cacheline. */ static struct { unsigned long hashmask; unsigned int hashshift; struct futex_hash_bucket *queues[MAX_NUMNODES]; } __futex_data __read_mostly __aligned(2*sizeof(long)); #define futex_hashmask (__futex_data.hashmask) #define futex_hashshift (__futex_data.hashshift) #define futex_queues (__futex_data.queues) struct futex_private_hash { int state; unsigned int hash_mask; struct rcu_head rcu; void *mm; bool custom; struct futex_hash_bucket queues[]; }; /* * Fault injections for futexes. */ #ifdef CONFIG_FAIL_FUTEX static struct { struct fault_attr attr; bool ignore_private; } fail_futex = { .attr = FAULT_ATTR_INITIALIZER, .ignore_private = false, }; static int __init setup_fail_futex(char *str) { return setup_fault_attr(&fail_futex.attr, str); } __setup("fail_futex=", setup_fail_futex); bool should_fail_futex(bool fshared) { if (fail_futex.ignore_private && !fshared) return false; return should_fail(&fail_futex.attr, 1); } #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init fail_futex_debugfs(void) { umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; dir = fault_create_debugfs_attr("fail_futex", NULL, &fail_futex.attr); if (IS_ERR(dir)) return PTR_ERR(dir); debugfs_create_bool("ignore-private", mode, dir, &fail_futex.ignore_private); return 0; } late_initcall(fail_futex_debugfs); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ #endif /* CONFIG_FAIL_FUTEX */ static struct futex_hash_bucket * __futex_hash(union futex_key *key, struct futex_private_hash *fph); #ifdef CONFIG_FUTEX_PRIVATE_HASH static bool futex_ref_get(struct futex_private_hash *fph); static bool futex_ref_put(struct futex_private_hash *fph); static bool futex_ref_is_dead(struct futex_private_hash *fph); enum { FR_PERCPU = 0, FR_ATOMIC }; static inline bool futex_key_is_private(union futex_key *key) { /* * Relies on get_futex_key() to set either bit for shared * futexes -- see comment with union futex_key. */ return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED)); } static bool futex_private_hash_get(struct futex_private_hash *fph) { return futex_ref_get(fph); } void futex_private_hash_put(struct futex_private_hash *fph) { if (futex_ref_put(fph)) wake_up_var(fph->mm); } /** * futex_hash_get - Get an additional reference for the local hash. * @hb: ptr to the private local hash. * * Obtain an additional reference for the already obtained hash bucket. The * caller must already own an reference. 
*/ void futex_hash_get(struct futex_hash_bucket *hb) { struct futex_private_hash *fph = hb->priv; if (!fph) return; WARN_ON_ONCE(!futex_private_hash_get(fph)); } void futex_hash_put(struct futex_hash_bucket *hb) { struct futex_private_hash *fph = hb->priv; if (!fph) return; futex_private_hash_put(fph); } static struct futex_hash_bucket * __futex_hash_private(union futex_key *key, struct futex_private_hash *fph) { u32 hash; if (!futex_key_is_private(key)) return NULL; if (!fph) fph = rcu_dereference(key->private.mm->futex_phash); if (!fph || !fph->hash_mask) return NULL; hash = jhash2((void *)&key->private.address, sizeof(key->private.address) / 4, key->both.offset); return &fph->queues[hash & fph->hash_mask]; } static void futex_rehash_private(struct futex_private_hash *old, struct futex_private_hash *new) { struct futex_hash_bucket *hb_old, *hb_new; unsigned int slots = old->hash_mask + 1; unsigned int i; for (i = 0; i < slots; i++) { struct futex_q *this, *tmp; hb_old = &old->queues[i]; spin_lock(&hb_old->lock); plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) { plist_del(&this->list, &hb_old->chain); futex_hb_waiters_dec(hb_old); WARN_ON_ONCE(this->lock_ptr != &hb_old->lock); hb_new = __futex_hash(&this->key, new); futex_hb_waiters_inc(hb_new); /* * The new pointer isn't published yet but an already * moved user can be unqueued due to timeout or signal. */ spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING); plist_add(&this->list, &hb_new->chain); this->lock_ptr = &hb_new->lock; spin_unlock(&hb_new->lock); } spin_unlock(&hb_old->lock); } } static bool __futex_pivot_hash(struct mm_struct *mm, struct futex_private_hash *new) { struct futex_private_hash *fph; WARN_ON_ONCE(mm->futex_phash_new); fph = rcu_dereference_protected(mm->futex_phash, lockdep_is_held(&mm->futex_hash_lock)); if (fph) { if (!futex_ref_is_dead(fph)) { mm->futex_phash_new = new; return false; } futex_rehash_private(fph, new); } new->state = FR_PERCPU; scoped_guard(rcu) { mm->futex_batches = get_state_synchronize_rcu(); rcu_assign_pointer(mm->futex_phash, new); } kvfree_rcu(fph, rcu); return true; } static void futex_pivot_hash(struct mm_struct *mm) { scoped_guard(mutex, &mm->futex_hash_lock) { struct futex_private_hash *fph; fph = mm->futex_phash_new; if (fph) { mm->futex_phash_new = NULL; __futex_pivot_hash(mm, fph); } } } struct futex_private_hash *futex_private_hash(void) { struct mm_struct *mm = current->mm; /* * Ideally we don't loop. If there is a replacement in progress * then a new private hash is already prepared and a reference can't be * obtained once the last user dropped it's. * In that case we block on mm_struct::futex_hash_lock and either have * to perform the replacement or wait while someone else is doing the * job. Eitherway, on the second iteration we acquire a reference on the * new private hash or loop again because a new replacement has been * requested. 
*/ again: scoped_guard(rcu) { struct futex_private_hash *fph; fph = rcu_dereference(mm->futex_phash); if (!fph) return NULL; if (futex_private_hash_get(fph)) return fph; } futex_pivot_hash(mm); goto again; } struct futex_hash_bucket *futex_hash(union futex_key *key) { struct futex_private_hash *fph; struct futex_hash_bucket *hb; again: scoped_guard(rcu) { hb = __futex_hash(key, NULL); fph = hb->priv; if (!fph || futex_private_hash_get(fph)) return hb; } futex_pivot_hash(key->private.mm); goto again; } #else /* !CONFIG_FUTEX_PRIVATE_HASH */ static struct futex_hash_bucket * __futex_hash_private(union futex_key *key, struct futex_private_hash *fph) { return NULL; } struct futex_hash_bucket *futex_hash(union futex_key *key) { return __futex_hash(key, NULL); } #endif /* CONFIG_FUTEX_PRIVATE_HASH */ #ifdef CONFIG_FUTEX_MPOL static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma = vma_lookup(mm, addr); struct mempolicy *mpol; int node = FUTEX_NO_NODE; if (!vma) return FUTEX_NO_NODE; mpol = vma_policy(vma); if (!mpol) return FUTEX_NO_NODE; switch (mpol->mode) { case MPOL_PREFERRED: node = first_node(mpol->nodes); break; case MPOL_PREFERRED_MANY: case MPOL_BIND: if (mpol->home_node != NUMA_NO_NODE) node = mpol->home_node; break; default: break; } return node; } static int futex_key_to_node_opt(struct mm_struct *mm, unsigned long addr) { int seq, node; guard(rcu)(); if (!mmap_lock_speculate_try_begin(mm, &seq)) return -EBUSY; node = __futex_key_to_node(mm, addr); if (mmap_lock_speculate_retry(mm, seq)) return -EAGAIN; return node; } static int futex_mpol(struct mm_struct *mm, unsigned long addr) { int node; node = futex_key_to_node_opt(mm, addr); if (node >= FUTEX_NO_NODE) return node; guard(mmap_read_lock)(mm); return __futex_key_to_node(mm, addr); } #else /* !CONFIG_FUTEX_MPOL */ static int futex_mpol(struct mm_struct *mm, unsigned long addr) { return FUTEX_NO_NODE; } #endif /* CONFIG_FUTEX_MPOL */ /** * __futex_hash - Return the hash bucket * @key: Pointer to the futex key for which the hash is calculated * @fph: Pointer to private hash if known * * We hash on the keys returned from get_futex_key (see below) and return the * corresponding hash bucket. * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the * private hash) is returned if existing. Otherwise a hash bucket from the * global hash is returned. */ static struct futex_hash_bucket * __futex_hash(union futex_key *key, struct futex_private_hash *fph) { int node = key->both.node; u32 hash; if (node == FUTEX_NO_NODE) { struct futex_hash_bucket *hb; hb = __futex_hash_private(key, fph); if (hb) return hb; } hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / sizeof(u32), key->both.offset); if (node == FUTEX_NO_NODE) { /* * In case of !FLAGS_NUMA, use some unused hash bits to pick a * node -- this ensures regular futexes are interleaved across * the nodes and avoids having to allocate multiple * hash-tables. * * NOTE: this isn't perfectly uniform, but it is fast and * handles sparse node masks. */ node = (hash >> futex_hashshift) % nr_node_ids; if (!node_possible(node)) { node = find_next_bit_wrap(node_possible_map.bits, nr_node_ids, node); } } return &futex_queues[node][hash & futex_hashmask]; } /** * futex_setup_timer - set up the sleeping hrtimer. 
* @time: ptr to the given timeout value * @timeout: the hrtimer_sleeper structure to be set up * @flags: futex flags * @range_ns: optional range in ns * * Return: Initialized hrtimer_sleeper structure or NULL if no timeout * value given */ struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns) { if (!time) return NULL; hrtimer_setup_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC, HRTIMER_MODE_ABS); /* * If range_ns is 0, calling hrtimer_set_expires_range_ns() is * effectively the same as calling hrtimer_set_expires(). */ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); return timeout; } /* * Generate a machine wide unique identifier for this inode. * * This relies on u64 not wrapping in the life-time of the machine; which with * 1ns resolution means almost 585 years. * * This further relies on the fact that a well formed program will not unmap * the file while it has a (shared) futex waiting on it. This mapping will have * a file reference which pins the mount and inode. * * If for some reason an inode gets evicted and read back in again, it will get * a new sequence number and will _NOT_ match, even though it is the exact same * file. * * It is important that futex_match() will never have a false-positive, esp. * for PI futexes that can mess up the state. The above argues that false-negatives * are only possible for malformed programs. */ static u64 get_inode_sequence_number(struct inode *inode) { static atomic64_t i_seq; u64 old; /* Does the inode already have a sequence number? */ old = atomic64_read(&inode->i_sequence); if (likely(old)) return old; for (;;) { u64 new = atomic64_inc_return(&i_seq); if (WARN_ON_ONCE(!new)) continue; old = 0; if (!atomic64_try_cmpxchg_relaxed(&inode->i_sequence, &old, new)) return old; return new; } } /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex * @flags: FLAGS_* * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) * * Return: a negative error code or 0 * * The key words are stored in @key on success. * * For shared mappings (when @fshared), the key is: * * ( inode->i_sequence, page offset within mapping, offset_within_page ) * * [ also see get_inode_sequence_number() ] * * For private mappings (or when !@fshared), the key is: * * ( current->mm, address, 0 ) * * This allows (cross process, where applicable) identification of the futex * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; struct page *page; struct folio *folio; struct address_space *mapping; int node, err, size, ro = 0; bool node_updated = false; bool fshared; fshared = flags & FLAGS_SHARED; size = futex_size(flags); if (flags & FLAGS_NUMA) size *= 2; /* * The futex address must be "naturally" aligned. 
*/ key->both.offset = address % PAGE_SIZE; if (unlikely((address % size) != 0)) return -EINVAL; address -= key->both.offset; if (unlikely(!access_ok(uaddr, size))) return -EFAULT; if (unlikely(should_fail_futex(fshared))) return -EFAULT; node = FUTEX_NO_NODE; if (flags & FLAGS_NUMA) { u32 __user *naddr = (void *)uaddr + size / 2; if (get_user_inline(node, naddr)) return -EFAULT; if ((node != FUTEX_NO_NODE) && ((unsigned int)node >= MAX_NUMNODES || !node_possible(node))) return -EINVAL; } if (node == FUTEX_NO_NODE && (flags & FLAGS_MPOL)) { node = futex_mpol(mm, address); node_updated = true; } if (flags & FLAGS_NUMA) { u32 __user *naddr = (void *)uaddr + size / 2; if (node == FUTEX_NO_NODE) { node = numa_node_id(); node_updated = true; } if (node_updated && put_user_inline(node, naddr)) return -EFAULT; } key->both.node = node; /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs * virtual address, we dont even have to find the underlying vma. * Note : We do have to check 'uaddr' is a valid user address, * but access_ok() should be faster than find_vma() */ if (!fshared) { /* * On no-MMU, shared futexes are treated as private, therefore * we must not include the current process in the key. Since * there is only one address space, the address is a unique key * on its own. */ if (IS_ENABLED(CONFIG_MMU)) key->private.mm = mm; else key->private.mm = NULL; key->private.address = address; return 0; } again: /* Ignore any VERIFY_READ mapping (futex common case) */ if (unlikely(should_fail_futex(true))) return -EFAULT; err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. */ if (err == -EFAULT && rw == FUTEX_READ) { err = get_user_pages_fast(address, 1, 0, &page); ro = 1; } if (err < 0) return err; else err = 0; /* * The treatment of mapping from this point on is critical. The folio * lock protects many things but in this context the folio lock * stabilizes mapping, prevents inode freeing in the shared * file-backed region case and guards against movement to swap cache. * * Strictly speaking the folio lock is not needed in all cases being * considered here and folio lock forces unnecessarily serialization. * From this point on, mapping will be re-verified if necessary and * folio lock will be acquired only if it is unavoidable * * Mapping checks require the folio so it is looked up now. For * anonymous pages, it does not matter if the folio is split * in the future as the key is based on the address. For * filesystem-backed pages, the precise page is required as the * index of the page determines the key. */ folio = page_folio(page); mapping = READ_ONCE(folio->mapping); /* * If folio->mapping is NULL, then it cannot be an anonymous * page; but it might be the ZERO_PAGE or in the gate area or * in a special mapping (all cases which we are happy to fail); * or it may have been a good file page when get_user_pages_fast * found it, but truncated or holepunched or subjected to * invalidate_complete_page2 before we got the folio lock (also * cases which we are happy to fail). And we hold a reference, * so refcount care in invalidate_inode_page's remove_mapping * prevents drop_caches from setting mapping to NULL beneath us. * * The case we do have to guard against is when memory pressure made * shmem_writepage move it from filecache to swapcache beneath us: * an unlikely race, but we do need to retry for folio->mapping. 
*/ if (unlikely(!mapping)) { int shmem_swizzled; /* * Folio lock is required to identify which special case above * applies. If this is really a shmem page then the folio lock * will prevent unexpected transitions. */ folio_lock(folio); shmem_swizzled = folio_test_swapcache(folio) || folio->mapping; folio_unlock(folio); folio_put(folio); if (shmem_swizzled) goto again; return -EFAULT; } /* * Private mappings are handled in a simple way. * * If the futex key is stored in anonymous memory, then the associated * object is the mm which is implicitly pinned by the calling process. * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ if (folio_test_anon(folio)) { /* * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ if (unlikely(should_fail_futex(true)) || ro) { err = -EFAULT; goto out; } key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; } else { struct inode *inode; /* * The associated futex object in this case is the inode and * the folio->mapping must be traversed. Ordinarily this should * be stabilised under folio lock but it's not strictly * necessary in this case as we just want to pin the inode, not * update i_pages or anything like that. * * The RCU read lock is taken as the inode is finally freed * under RCU. If the mapping still matches expectations then the * mapping->host can be safely accessed as being a valid inode. */ rcu_read_lock(); if (READ_ONCE(folio->mapping) != mapping) { rcu_read_unlock(); folio_put(folio); goto again; } inode = READ_ONCE(mapping->host); if (!inode) { rcu_read_unlock(); folio_put(folio); goto again; } key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = page_pgoff(folio, page); rcu_read_unlock(); } out: folio_put(folio); return err; } /** * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address * * Slow path to fixup the fault we just took in the atomic write * access to @uaddr. * * We have no generic implementation of a non-destructive write to the * user address. We know that we faulted in the atomic pagefault * disabled section so we can as well avoid the #PF overhead by * calling get_user_pages() right away. */ int fault_in_user_writeable(u32 __user *uaddr) { struct mm_struct *mm = current->mm; int ret; mmap_read_lock(mm); ret = fixup_user_fault(mm, (unsigned long)uaddr, FAULT_FLAG_WRITE, NULL); mmap_read_unlock(mm); return ret < 0 ? ret : 0; } /** * futex_top_waiter() - Return the highest priority waiter on a futex * @hb: the hash bucket the futex_q's reside in * @key: the futex key (to distinguish it from other futex futex_q's) * * Must be called with the hb lock held. */ struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key) { struct futex_q *this; plist_for_each_entry(this, &hb->chain, list) { if (futex_match(&this->key, key)) return this; } return NULL; } /** * wait_for_owner_exiting - Block until the owner has exited * @ret: owner's current futex lock status * @exiting: Pointer to the exiting task * * Caller must hold a refcount on @exiting. 
*/ void wait_for_owner_exiting(int ret, struct task_struct *exiting) { if (ret != -EBUSY) { WARN_ON_ONCE(exiting); return; } if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) return; mutex_lock(&exiting->futex_exit_mutex); /* * No point in doing state checking here. If the waiter got here * while the task was in exec()->exec_futex_release() then it can * have any FUTEX_STATE_* value when the waiter has acquired the * mutex. OK, if running, EXITING or DEAD if it reached exit() * already. Highly unlikely and not a problem. Just one more round * through the futex maze. */ mutex_unlock(&exiting->futex_exit_mutex); put_task_struct(exiting); } /** * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket * @q: The futex_q to unqueue * * The q->lock_ptr must not be NULL and must be held by the caller. */ void __futex_unqueue(struct futex_q *q) { struct futex_hash_bucket *hb; if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) return; lockdep_assert_held(q->lock_ptr); hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); plist_del(&q->list, &hb->chain); futex_hb_waiters_dec(hb); } /* The key must be already stored in q->key. */ void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb) __acquires(&hb->lock) { /* * Increment the counter before taking the lock so that * a potential waker won't miss a to-be-slept task that is * waiting for the spinlock. This is safe as all futex_q_lock() * users end up calling futex_queue(). Similarly, for housekeeping, * decrement the counter at futex_q_unlock() when some error has * occurred and we don't end up adding the task to the list. */ futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */ q->lock_ptr = &hb->lock; spin_lock(&hb->lock); } void futex_q_unlock(struct futex_hash_bucket *hb) __releases(&hb->lock) { futex_hb_waiters_dec(hb); spin_unlock(&hb->lock); } void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, struct task_struct *task) { int prio; /* * The priority used to register this element is * - either the real thread-priority for the real-time threads * (i.e. threads with a priority lower than MAX_RT_PRIO) * - or MAX_RT_PRIO for non-RT threads. * Thus, all RT-threads are woken first in priority order, and * the others are woken last, in FIFO order. */ prio = min(current->normal_prio, MAX_RT_PRIO); plist_node_init(&q->list, prio); plist_add(&q->list, &hb->chain); q->task = task; } /** * futex_unqueue() - Remove the futex_q from its futex_hash_bucket * @q: The futex_q to unqueue * * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must * be paired with exactly one earlier call to futex_queue(). * * Return: * - 1 - if the futex_q was still queued (and we removed unqueued it); * - 0 - if the futex_q was already removed by the waking thread */ int futex_unqueue(struct futex_q *q) { spinlock_t *lock_ptr; int ret = 0; /* RCU so lock_ptr is not going away during locking. */ guard(rcu)(); /* In the common case we don't take the spinlock, which is nice. */ retry: /* * q->lock_ptr can change between this read and the following spin_lock. * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and * optimizing lock_ptr out of the logic below. */ lock_ptr = READ_ONCE(q->lock_ptr); if (lock_ptr != NULL) { spin_lock(lock_ptr); /* * q->lock_ptr can change between reading it and * spin_lock(), causing us to take the wrong lock. This * corrects the race condition. 
* * Reasoning goes like this: if we have the wrong lock, * q->lock_ptr must have changed (maybe several times) * between reading it and the spin_lock(). It can * change again after the spin_lock() but only if it was * already changed before the spin_lock(). It cannot, * however, change back to the original value. Therefore * we can detect whether we acquired the correct lock. */ if (unlikely(lock_ptr != q->lock_ptr)) { spin_unlock(lock_ptr); goto retry; } __futex_unqueue(q); BUG_ON(q->pi_state); spin_unlock(lock_ptr); ret = 1; } return ret; } void futex_q_lockptr_lock(struct futex_q *q) { spinlock_t *lock_ptr; /* * See futex_unqueue() why lock_ptr can change. */ guard(rcu)(); retry: lock_ptr = READ_ONCE(q->lock_ptr); spin_lock(lock_ptr); if (unlikely(lock_ptr != q->lock_ptr)) { spin_unlock(lock_ptr); goto retry; } } /* * PI futexes can not be requeued and must remove themselves from the hash * bucket. The hash bucket lock (i.e. lock_ptr) is held. */ void futex_unqueue_pi(struct futex_q *q) { /* * If the lock was not acquired (due to timeout or signal) then the * rt_waiter is removed before futex_q is. If this is observed by * an unlocker after dropping the rtmutex wait lock and before * acquiring the hash bucket lock, then the unlocker dequeues the * futex_q from the hash bucket list to guarantee consistent state * vs. userspace. Therefore the dequeue here must be conditional. */ if (!plist_node_empty(&q->list)) __futex_unqueue(q); BUG_ON(!q->pi_state); put_pi_state(q->pi_state); q->pi_state = NULL; } /* Constants for the pending_op argument of handle_futex_death */ #define HANDLE_DEATH_PENDING true #define HANDLE_DEATH_LIST false /* * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, bool pi, bool pending_op) { u32 uval, nval, mval; pid_t owner; int err; /* Futex address must be 32bit aligned */ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) return -1; retry: if (get_user(uval, uaddr)) return -1; /* * Special case for regular (non PI) futexes. The unlock path in * user space has two race scenarios: * * 1. The unlock path releases the user space futex value and * before it can execute the futex() syscall to wake up * waiters it is killed. * * 2. A woken up waiter is killed before it can acquire the * futex in user space. * * In the second case, the wake up notification could be generated * by the unlock path in user space after setting the futex value * to zero or by the kernel after setting the OWNER_DIED bit below. * * In both cases the TID validation below prevents a wakeup of * potential waiters which can cause these waiters to block * forever. * * In both cases the following conditions are met: * * 1) task->robust_list->list_op_pending != NULL * @pending_op == true * 2) The owner part of user space futex value == 0 * 3) Regular futex: @pi == false * * If these conditions are met, it is safe to attempt waking up a * potential waiter without touching the user space futex value and * trying to set the OWNER_DIED bit. If the futex value is zero, * the rest of the user space mutex state is consistent, so a woken * waiter will just take over the uncontended futex. Setting the * OWNER_DIED bit would create inconsistent state and malfunction * of the user space owner died handling. Otherwise, the OWNER_DIED * bit is already set, and the woken waiter is expected to deal with * this. 
*/ owner = uval & FUTEX_TID_MASK; if (pending_op && !pi && !owner) { futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, FUTEX_BITSET_MATCH_ANY); return 0; } if (owner != task_pid_vnr(curr)) return 0; /* * Ok, this dying thread is truly holding a futex * of interest. Set the OWNER_DIED bit atomically * via cmpxchg, and if the value had FUTEX_WAITERS * set, wake up a waiter (if any). (We have to do a * futex_wake() even if OWNER_DIED is already set - * to handle the rare but possible case of recursive * thread-death.) The rest of the cleanup is done in * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; /* * We are not holding a lock here, but we want to have * the pagefault_disable/enable() protection because * we want to handle the fault gracefully. If the * access fails we try to fault in the futex with R/W * verification via get_user_pages. get_user() above * does not guarantee R/W access. If that fails we * give up and leave the futex locked. */ if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) { switch (err) { case -EFAULT: if (fault_in_user_writeable(uaddr)) return -1; goto retry; case -EAGAIN: cond_resched(); goto retry; default: WARN_ON_ONCE(1); return err; } } if (nval != uval) goto retry; /* * Wake robust non-PI futexes here. The wakeup of * PI futexes happens in exit_pi_state(): */ if (!pi && (uval & FUTEX_WAITERS)) { futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, FUTEX_BITSET_MATCH_ANY); } return 0; } /* * Fetch a robust-list pointer. Bit 0 signals PI futexes: */ static inline int fetch_robust_entry(struct robust_list __user **entry, struct robust_list __user * __user *head, unsigned int *pi) { unsigned long uentry; if (get_user(uentry, (unsigned long __user *)head)) return -EFAULT; *entry = (void __user *)(uentry & ~1UL); *pi = uentry & 1; return 0; } /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * * We silently return on any sign of list-walking problem. 
*/ static void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned int next_pi; unsigned long futex_offset; int rc; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ if (fetch_robust_entry(&entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: */ if (get_user(futex_offset, &head->futex_offset)) return; /* * Fetch any possibly pending lock-add first, and handle it * if it exists: */ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; next_entry = NULL; /* avoid warning with gcc */ while (entry != &head->list) { /* * Fetch the next entry in the list before calling * handle_futex_death: */ rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); /* * A pending lock might already be on the list, so * don't process it twice: */ if (entry != pending) { if (handle_futex_death((void __user *)entry + futex_offset, curr, pi, HANDLE_DEATH_LIST)) return; } if (rc) return; entry = next_entry; pi = next_pi; /* * Avoid excessively long or circular lists: */ if (!--limit) break; cond_resched(); } if (pending) { handle_futex_death((void __user *)pending + futex_offset, curr, pip, HANDLE_DEATH_PENDING); } } #ifdef CONFIG_COMPAT static void __user *futex_uaddr(struct robust_list __user *entry, compat_long_t futex_offset) { compat_uptr_t base = ptr_to_compat(entry); void __user *uaddr = compat_ptr(base + futex_offset); return uaddr; } /* * Fetch a robust-list pointer. Bit 0 signals PI futexes: */ static inline int compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, compat_uptr_t __user *head, unsigned int *pi) { if (get_user(*uentry, head)) return -EFAULT; *entry = compat_ptr((*uentry) & ~1); *pi = (unsigned int)(*uentry) & 1; return 0; } /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * * We silently return on any sign of list-walking problem. 
*/ static void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned int next_pi; compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; int rc; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: */ if (get_user(futex_offset, &head->futex_offset)) return; /* * Fetch any possibly pending lock-add first, and handle it * if it exists: */ if (compat_fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pip)) return; next_entry = NULL; /* avoid warning with gcc */ while (entry != (struct robust_list __user *) &head->list) { /* * Fetch the next entry in the list before calling * handle_futex_death: */ rc = compat_fetch_robust_entry(&next_uentry, &next_entry, (compat_uptr_t __user *)&entry->next, &next_pi); /* * A pending lock might already be on the list, so * dont process it twice: */ if (entry != pending) { void __user *uaddr = futex_uaddr(entry, futex_offset); if (handle_futex_death(uaddr, curr, pi, HANDLE_DEATH_LIST)) return; } if (rc) return; uentry = next_uentry; entry = next_entry; pi = next_pi; /* * Avoid excessively long or circular lists: */ if (!--limit) break; cond_resched(); } if (pending) { void __user *uaddr = futex_uaddr(pending, futex_offset); handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); } } #endif #ifdef CONFIG_FUTEX_PI /* * This task is holding PI mutexes at exit time => bad. * Kernel cleans up PI-state, but userspace is likely hosed. * (Robust-futex cleanup is separate and might save the day for userspace.) */ static void exit_pi_state_list(struct task_struct *curr) { struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; union futex_key key = FUTEX_KEY_INIT; /* * The mutex mm_struct::futex_hash_lock might be acquired. */ might_sleep(); /* * Ensure the hash remains stable (no resize) during the while loop * below. The hb pointer is acquired under the pi_lock so we can't block * on the mutex. */ WARN_ON(curr != current); guard(private_hash)(); /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful * versus waiters unqueueing themselves: */ raw_spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; if (1) { CLASS(hb, hb)(&key); /* * We can race against put_pi_state() removing itself from the * list (a waiter going away). put_pi_state() will first * decrement the reference count and then modify the list, so * its possible to see the list entry but fail this reference * acquire. * * In that case; drop the locks to let put_pi_state() make * progress and retry the loop. 
*/ if (!refcount_inc_not_zero(&pi_state->refcount)) { raw_spin_unlock_irq(&curr->pi_lock); cpu_relax(); raw_spin_lock_irq(&curr->pi_lock); continue; } raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); raw_spin_lock(&curr->pi_lock); /* * We dropped the pi-lock, so re-check whether this * task still owns the PI-state: */ if (head->next != next) { /* retain curr->pi_lock for the loop invariant */ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); put_pi_state(pi_state); continue; } WARN_ON(pi_state->owner != curr); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); pi_state->owner = NULL; raw_spin_unlock(&curr->pi_lock); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); } rt_mutex_futex_unlock(&pi_state->pi_mutex); put_pi_state(pi_state); raw_spin_lock_irq(&curr->pi_lock); } raw_spin_unlock_irq(&curr->pi_lock); } #else static inline void exit_pi_state_list(struct task_struct *curr) { } #endif static void futex_cleanup(struct task_struct *tsk) { if (unlikely(tsk->robust_list)) { exit_robust_list(tsk); tsk->robust_list = NULL; } #ifdef CONFIG_COMPAT if (unlikely(tsk->compat_robust_list)) { compat_exit_robust_list(tsk); tsk->compat_robust_list = NULL; } #endif if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); } /** * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD * @tsk: task to set the state on * * Set the futex exit state of the task lockless. The futex waiter code * observes that state when a task is exiting and loops until the task has * actually finished the futex cleanup. The worst case for this is that the * waiter runs through the wait loop until the state becomes visible. * * This is called from the recursive fault handling path in make_task_dead(). * * This is best effort. Either the futex exit code has run already or * not. If the OWNER_DIED bit has been set on the futex then the waiter can * take it over. If not, the problem is pushed back to user space. If the * futex exit code did not run yet, then an already queued waiter might * block forever, but there is nothing which can be done about that. */ void futex_exit_recursive(struct task_struct *tsk) { /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ if (tsk->futex_state == FUTEX_STATE_EXITING) mutex_unlock(&tsk->futex_exit_mutex); tsk->futex_state = FUTEX_STATE_DEAD; } static void futex_cleanup_begin(struct task_struct *tsk) { /* * Prevent various race issues against a concurrent incoming waiter * including live locks by forcing the waiter to block on * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in * attach_to_pi_owner(). */ mutex_lock(&tsk->futex_exit_mutex); /* * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. * * This ensures that all subsequent checks of tsk->futex_state in * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with * tsk->pi_lock held. * * It guarantees also that a pi_state which was queued right before * the state change under tsk->pi_lock by a concurrent waiter must * be observed in exit_pi_state_list(). */ raw_spin_lock_irq(&tsk->pi_lock); tsk->futex_state = FUTEX_STATE_EXITING; raw_spin_unlock_irq(&tsk->pi_lock); } static void futex_cleanup_end(struct task_struct *tsk, int state) { /* * Lockless store. The only side effect is that an observer might * take another loop until it becomes visible. */ tsk->futex_state = state; /* * Drop the exit protection. 
This unblocks waiters which observed * FUTEX_STATE_EXITING to reevaluate the state. */ mutex_unlock(&tsk->futex_exit_mutex); } void futex_exec_release(struct task_struct *tsk) { /* * The state handling is done for consistency, but in the case of * exec() there is no way to prevent further damage as the PID stays * the same. But for the unlikely and arguably buggy case that a * futex is held on exec(), this provides at least as much state * consistency protection which is possible. */ futex_cleanup_begin(tsk); futex_cleanup(tsk); /* * Reset the state to FUTEX_STATE_OK. The task is alive and about * exec a new binary. */ futex_cleanup_end(tsk, FUTEX_STATE_OK); } void futex_exit_release(struct task_struct *tsk) { futex_cleanup_begin(tsk); futex_cleanup(tsk); futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, struct futex_private_hash *fph) { #ifdef CONFIG_FUTEX_PRIVATE_HASH fhb->priv = fph; #endif atomic_set(&fhb->waiters, 0); plist_head_init(&fhb->chain); spin_lock_init(&fhb->lock); } #define FH_CUSTOM 0x01 #ifdef CONFIG_FUTEX_PRIVATE_HASH /* * futex-ref * * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that * code because it just doesn't fit right. * * Dual counter, per-cpu / atomic approach like percpu-refcount, except it * re-initializes the state automatically, such that the fph swizzle is also a * transition back to per-cpu. */ static void futex_ref_rcu(struct rcu_head *head); static void __futex_ref_atomic_begin(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; /* * The counter we're about to switch to must have fully switched; * otherwise it would be impossible for it to have reported success * from futex_ref_is_dead(). */ WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0); /* * Set the atomic to the bias value such that futex_ref_{get,put}() * will never observe 0. Will be fixed up in __futex_ref_atomic_end() * when folding in the percpu count. */ atomic_long_set(&mm->futex_atomic, LONG_MAX); smp_store_release(&fph->state, FR_ATOMIC); call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); } static void __futex_ref_atomic_end(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; unsigned int count = 0; long ret; int cpu; /* * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC * and per this RCU callback, everybody must now observe this state and * use the atomic variable. */ WARN_ON_ONCE(fph->state != FR_ATOMIC); /* * Therefore the per-cpu counter is now stable, sum and reset. */ for_each_possible_cpu(cpu) { unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu); count += *ptr; *ptr = 0; } /* * Re-init for the next cycle. */ this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ /* * Add actual count, subtract bias and initial refcount. * * The moment this atomic operation happens, futex_ref_is_dead() can * become true. */ ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic); if (!ret) wake_up_var(mm); WARN_ON_ONCE(ret < 0); mmput_async(mm); } static void futex_ref_rcu(struct rcu_head *head) { struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu); struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash); if (fph->state == FR_PERCPU) { /* * Per this extra grace-period, everybody must now observe * fph as the current fph and no previously observed fph's * are in-flight. * * Notably, nobody will now rely on the atomic * futex_ref_is_dead() state anymore so we can begin the * migration of the per-cpu counter into the atomic. 
*/ __futex_ref_atomic_begin(fph); return; } __futex_ref_atomic_end(fph); } /* * Drop the initial refcount and transition to atomics. */ static void futex_ref_drop(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; /* * Can only transition the current fph; */ WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph); /* * We enqueue at least one RCU callback. Ensure mm stays if the task * exits before the transition is completed. */ mmget(mm); /* * In order to avoid the following scenario: * * futex_hash() __futex_pivot_hash() * guard(rcu); guard(mm->futex_hash_lock); * fph = mm->futex_phash; * rcu_assign_pointer(&mm->futex_phash, new); * futex_hash_allocate() * futex_ref_drop() * fph->state = FR_ATOMIC; * atomic_set(, BIAS); * * futex_private_hash_get(fph); // OOPS * * Where an old fph (which is FR_ATOMIC) and should fail on * inc_not_zero, will succeed because a new transition is started and * the atomic is bias'ed away from 0. * * There must be at least one full grace-period between publishing a * new fph and trying to replace it. */ if (poll_state_synchronize_rcu(mm->futex_batches)) { /* * There was a grace-period, we can begin now. */ __futex_ref_atomic_begin(fph); return; } call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); } static bool futex_ref_get(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; guard(preempt)(); if (READ_ONCE(fph->state) == FR_PERCPU) { __this_cpu_inc(*mm->futex_ref); return true; } return atomic_long_inc_not_zero(&mm->futex_atomic); } static bool futex_ref_put(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; guard(preempt)(); if (READ_ONCE(fph->state) == FR_PERCPU) { __this_cpu_dec(*mm->futex_ref); return false; } return atomic_long_dec_and_test(&mm->futex_atomic); } static bool futex_ref_is_dead(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; guard(rcu)(); if (smp_load_acquire(&fph->state) == FR_PERCPU) return false; return atomic_long_read(&mm->futex_atomic) == 0; } int futex_mm_init(struct mm_struct *mm) { mutex_init(&mm->futex_hash_lock); RCU_INIT_POINTER(mm->futex_phash, NULL); mm->futex_phash_new = NULL; /* futex-ref */ mm->futex_ref = NULL; atomic_long_set(&mm->futex_atomic, 0); mm->futex_batches = get_state_synchronize_rcu(); return 0; } void futex_hash_free(struct mm_struct *mm) { struct futex_private_hash *fph; free_percpu(mm->futex_ref); kvfree(mm->futex_phash_new); fph = rcu_dereference_raw(mm->futex_phash); if (fph) kvfree(fph); } static bool futex_pivot_pending(struct mm_struct *mm) { struct futex_private_hash *fph; guard(rcu)(); if (!mm->futex_phash_new) return true; fph = rcu_dereference(mm->futex_phash); return futex_ref_is_dead(fph); } static bool futex_hash_less(struct futex_private_hash *a, struct futex_private_hash *b) { /* user provided always wins */ if (!a->custom && b->custom) return true; if (a->custom && !b->custom) return false; /* zero-sized hash wins */ if (!b->hash_mask) return true; if (!a->hash_mask) return false; /* keep the biggest */ if (a->hash_mask < b->hash_mask) return true; if (a->hash_mask > b->hash_mask) return false; return false; /* equal */ } static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) { struct mm_struct *mm = current->mm; struct futex_private_hash *fph; bool custom = flags & FH_CUSTOM; int i; if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots))) return -EINVAL; /* * Once we've disabled the global hash there is no way back. 
*/ scoped_guard(rcu) { fph = rcu_dereference(mm->futex_phash); if (fph && !fph->hash_mask) { if (custom) return -EBUSY; return 0; } } if (!mm->futex_ref) { /* * This will always be allocated by the first thread and * therefore requires no locking. */ mm->futex_ref = alloc_percpu(unsigned int); if (!mm->futex_ref) return -ENOMEM; this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ } fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!fph) return -ENOMEM; fph->hash_mask = hash_slots ? hash_slots - 1 : 0; fph->custom = custom; fph->mm = mm; for (i = 0; i < hash_slots; i++) futex_hash_bucket_init(&fph->queues[i], fph); if (custom) { /* * Only let prctl() wait / retry; don't unduly delay clone(). */ again: wait_var_event(mm, futex_pivot_pending(mm)); } scoped_guard(mutex, &mm->futex_hash_lock) { struct futex_private_hash *free __free(kvfree) = NULL; struct futex_private_hash *cur, *new; cur = rcu_dereference_protected(mm->futex_phash, lockdep_is_held(&mm->futex_hash_lock)); new = mm->futex_phash_new; mm->futex_phash_new = NULL; if (fph) { if (cur && !cur->hash_mask) { /* * If two threads simultaneously request the global * hash then the first one performs the switch, * the second one returns here. */ free = fph; mm->futex_phash_new = new; return -EBUSY; } if (cur && !new) { /* * If we have an existing hash, but do not yet have * allocated a replacement hash, drop the initial * reference on the existing hash. */ futex_ref_drop(cur); } if (new) { /* * Two updates raced; throw out the lesser one. */ if (futex_hash_less(new, fph)) { free = new; new = fph; } else { free = fph; } } else { new = fph; } fph = NULL; } if (new) { /* * Will set mm->futex_phash_new on failure; * futex_private_hash_get() will try again. */ if (!__futex_pivot_hash(mm, new) && custom) goto again; } } return 0; } int futex_hash_allocate_default(void) { unsigned int threads, buckets, current_buckets = 0; struct futex_private_hash *fph; if (!current->mm) return 0; scoped_guard(rcu) { threads = min_t(unsigned int, get_nr_threads(current), num_online_cpus()); fph = rcu_dereference(current->mm->futex_phash); if (fph) { if (fph->custom) return 0; current_buckets = fph->hash_mask + 1; } } /* * The default allocation will remain within * 16 <= threads * 4 <= global hash size */ buckets = roundup_pow_of_two(4 * threads); buckets = clamp(buckets, 16, futex_hashmask + 1); if (current_buckets >= buckets) return 0; return futex_hash_allocate(buckets, 0); } static int futex_hash_get_slots(void) { struct futex_private_hash *fph; guard(rcu)(); fph = rcu_dereference(current->mm->futex_phash); if (fph && fph->hash_mask) return fph->hash_mask + 1; return 0; } #else static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) { return -EINVAL; } static int futex_hash_get_slots(void) { return 0; } #endif int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) { unsigned int flags = FH_CUSTOM; int ret; switch (arg2) { case PR_FUTEX_HASH_SET_SLOTS: if (arg4) return -EINVAL; ret = futex_hash_allocate(arg3, flags); break; case PR_FUTEX_HASH_GET_SLOTS: ret = futex_hash_get_slots(); break; default: ret = -EINVAL; break; } return ret; } static int __init futex_init(void) { unsigned long hashsize, i; unsigned int order, n; unsigned long size; #ifdef CONFIG_BASE_SMALL hashsize = 16; #else hashsize = 256 * num_possible_cpus(); hashsize /= num_possible_nodes(); hashsize = max(4, hashsize); hashsize = roundup_pow_of_two(hashsize); #endif futex_hashshift = ilog2(hashsize); size = 
sizeof(struct futex_hash_bucket) * hashsize; order = get_order(size); for_each_node(n) { struct futex_hash_bucket *table; if (order > MAX_PAGE_ORDER) table = vmalloc_huge_node(size, GFP_KERNEL, n); else table = alloc_pages_exact_nid(n, size, GFP_KERNEL); BUG_ON(!table); for (i = 0; i < hashsize; i++) futex_hash_bucket_init(&table[i], NULL); futex_queues[n] = table; } futex_hashmask = hashsize - 1; pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n", hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024, order > MAX_PAGE_ORDER ? "vmalloc" : "linear"); return 0; } core_initcall(futex_init);
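/*
 * Illustrative sketch, not part of the file above: how userspace would ask
 * for a process-private futex hash of a given size via the prctl() interface
 * that futex_hash_prctl() implements. It assumes the PR_FUTEX_HASH,
 * PR_FUTEX_HASH_SET_SLOTS and PR_FUTEX_HASH_GET_SLOTS constants from
 * <linux/prctl.h>; on kernels built without CONFIG_FUTEX_PRIVATE_HASH the
 * request fails with -EINVAL, matching the stub above.
 */
#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

static int resize_private_futex_hash(unsigned long slots)
{
	/* arg4 must be 0; slots must be 0 or a power of two (>= 2). */
	if (prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, slots, 0) < 0) {
		perror("PR_FUTEX_HASH_SET_SLOTS");
		return -1;
	}

	/* Report the size actually installed (0 means the global hash). */
	return prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS, 0, 0);
}

int main(void)
{
	printf("private futex hash slots: %d\n", resize_private_futex_hash(64));
	return 0;
}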
47 48 36 46 62 63 55 41 41 47 47 47 47 29 38 38 38 38 8 8 8 8 8 17 17 17 17 2 180 180 14 15 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/dst_cache.c - dst entry cache * * Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com> */ #include <linux/kernel.h> #include <linux/percpu.h> #include <net/dst_cache.h> #include <net/route.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ip6_fib.h> #endif #include <uapi/linux/in.h> struct dst_cache_pcpu { unsigned long refresh_ts; struct dst_entry *dst; local_lock_t bh_lock; u32 cookie; union { struct in_addr in_saddr; struct in6_addr in6_saddr; }; }; static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache, struct dst_entry *dst, u32 cookie) { DEBUG_NET_WARN_ON_ONCE(!in_softirq()); dst_release(dst_cache->dst); if (dst) dst_hold(dst); dst_cache->cookie = cookie; dst_cache->dst = dst; } static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, struct dst_cache_pcpu *idst) { struct dst_entry *dst; DEBUG_NET_WARN_ON_ONCE(!in_softirq()); dst = idst->dst; if (!dst) goto fail; /* the cache already hold a dst reference; it can't go away */ dst_hold(dst); if (unlikely(!time_after(idst->refresh_ts, READ_ONCE(dst_cache->reset_ts)) || (READ_ONCE(dst->obsolete) && !dst->ops->check(dst, idst->cookie)))) { dst_cache_per_cpu_dst_set(idst, NULL, 0); dst_release(dst); goto fail; } return dst; fail: idst->refresh_ts = jiffies; return NULL; } struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) { struct dst_entry *dst; if (!dst_cache->cache) return NULL; local_lock_nested_bh(&dst_cache->cache->bh_lock); dst = dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); local_unlock_nested_bh(&dst_cache->cache->bh_lock); return dst; } EXPORT_SYMBOL_GPL(dst_cache_get); struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) { struct dst_cache_pcpu *idst; struct dst_entry *dst; if (!dst_cache->cache) return NULL; local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst = dst_cache_per_cpu_get(dst_cache, idst); if (!dst) { local_unlock_nested_bh(&dst_cache->cache->bh_lock); return NULL; } *saddr = idst->in_saddr.s_addr; local_unlock_nested_bh(&dst_cache->cache->bh_lock); return dst_rtable(dst); } EXPORT_SYMBOL_GPL(dst_cache_get_ip4); void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, __be32 saddr) { struct dst_cache_pcpu *idst; if (!dst_cache->cache) return; local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst_cache_per_cpu_dst_set(idst, dst, 0); idst->in_saddr.s_addr = saddr; local_unlock_nested_bh(&dst_cache->cache->bh_lock); } EXPORT_SYMBOL_GPL(dst_cache_set_ip4); #if IS_ENABLED(CONFIG_IPV6) void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, const struct 
in6_addr *saddr) { struct dst_cache_pcpu *idst; if (!dst_cache->cache) return; local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst_cache_per_cpu_dst_set(idst, dst, rt6_get_cookie(dst_rt6_info(dst))); idst->in6_saddr = *saddr; local_unlock_nested_bh(&dst_cache->cache->bh_lock); } EXPORT_SYMBOL_GPL(dst_cache_set_ip6); struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, struct in6_addr *saddr) { struct dst_cache_pcpu *idst; struct dst_entry *dst; if (!dst_cache->cache) return NULL; local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst = dst_cache_per_cpu_get(dst_cache, idst); if (!dst) { local_unlock_nested_bh(&dst_cache->cache->bh_lock); return NULL; } *saddr = idst->in6_saddr; local_unlock_nested_bh(&dst_cache->cache->bh_lock); return dst; } EXPORT_SYMBOL_GPL(dst_cache_get_ip6); #endif int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) { unsigned int i; dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, gfp | __GFP_ZERO); if (!dst_cache->cache) return -ENOMEM; for_each_possible_cpu(i) local_lock_init(&per_cpu_ptr(dst_cache->cache, i)->bh_lock); dst_cache_reset(dst_cache); return 0; } EXPORT_SYMBOL_GPL(dst_cache_init); void dst_cache_destroy(struct dst_cache *dst_cache) { int i; if (!dst_cache->cache) return; for_each_possible_cpu(i) dst_release(per_cpu_ptr(dst_cache->cache, i)->dst); free_percpu(dst_cache->cache); } EXPORT_SYMBOL_GPL(dst_cache_destroy); void dst_cache_reset_now(struct dst_cache *dst_cache) { int i; if (!dst_cache->cache) return; dst_cache_reset(dst_cache); for_each_possible_cpu(i) { struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i); struct dst_entry *dst = idst->dst; idst->cookie = 0; idst->dst = NULL; dst_release(dst); } } EXPORT_SYMBOL_GPL(dst_cache_reset_now);
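/*
 * Illustrative sketch, not part of dst_cache.c above: the fill/reuse pattern
 * the cache is built for, as seen from a hypothetical IPv4 tunnel transmit
 * path. struct my_tunnel and my_tunnel_get_route() are made up for the
 * example; the dst_cache_*() calls and ip_route_output_key() are the real
 * APIs, and the cache is assumed to have been set up with dst_cache_init().
 */
#include <linux/err.h>
#include <net/dst_cache.h>
#include <net/route.h>

struct my_tunnel {
	struct dst_cache dst_cache;	/* initialised via dst_cache_init() */
};

static struct rtable *my_tunnel_get_route(struct net *net,
					   struct my_tunnel *t,
					   struct flowi4 *fl4)
{
	struct rtable *rt;
	__be32 saddr;

	/* Fast path: reuse the per-CPU cached route and source address. */
	rt = dst_cache_get_ip4(&t->dst_cache, &saddr);
	if (rt) {
		fl4->saddr = saddr;
		return rt;
	}

	/* Slow path: full FIB lookup, then refill the cache for next time. */
	rt = ip_route_output_key(net, fl4);
	if (IS_ERR(rt))
		return NULL;

	dst_cache_set_ip4(&t->dst_cache, &rt->dst, fl4->saddr);
	return rt;
}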
50 44 8 36 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright (C) 2022, 2024 Intel Corporation */ #ifndef IEEE80211_RATE_H #define IEEE80211_RATE_H #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "sta_info.h" #include "driver-ops.h" struct rate_control_ref { const struct rate_control_ops *ops; void *priv; }; void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_tx_rate_control *txrc); void rate_control_tx_status(struct ieee80211_local *local, struct ieee80211_tx_status *st); void rate_control_rate_init(struct link_sta_info *link_sta); void rate_control_rate_init_all_links(struct sta_info *sta); void rate_control_rate_update(struct ieee80211_local *local, struct ieee80211_supported_band *sband, struct link_sta_info *link_sta, u32 changed); static inline void *rate_control_alloc_sta(struct rate_control_ref *ref, struct sta_info *sta, gfp_t gfp) { spin_lock_init(&sta->rate_ctrl_lock); return ref->ops->alloc_sta(ref->priv, &sta->sta, gfp); } static inline void rate_control_free_sta(struct sta_info *sta) { struct rate_control_ref *ref = sta->rate_ctrl; struct ieee80211_sta *ista = &sta->sta; void *priv_sta = sta->rate_ctrl_priv; ref->ops->free_sta(ref->priv, ista, priv_sta); } static inline void rate_control_add_sta_debugfs(struct sta_info *sta) { #ifdef CONFIG_MAC80211_DEBUGFS struct rate_control_ref *ref = sta->rate_ctrl; if (ref && sta->debugfs_dir && ref->ops->add_sta_debugfs) ref->ops->add_sta_debugfs(ref->priv, sta->rate_ctrl_priv, sta->debugfs_dir); #endif } extern const struct debugfs_short_fops rcname_ops; static inline void rate_control_add_debugfs(struct ieee80211_local *local) { #ifdef CONFIG_MAC80211_DEBUGFS struct dentry *debugfsdir; if (!local->rate_ctrl) return; if (!local->rate_ctrl->ops->add_debugfs) return; debugfsdir = debugfs_create_dir("rc", local->hw.wiphy->debugfsdir); local->debugfs.rcdir = debugfsdir; debugfs_create_file("name", 0400, debugfsdir, local->rate_ctrl, &rcname_ops); local->rate_ctrl->ops->add_debugfs(&local->hw, local->rate_ctrl->priv, debugfsdir); #endif } void ieee80211_check_rate_mask(struct ieee80211_link_data *link); /* Get a reference to the rate control algorithm. If `name' is NULL, get the * first available algorithm. */ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local, const char *name); void rate_control_deinitialize(struct ieee80211_local *local); /* Rate control algorithms */ #ifdef CONFIG_MAC80211_RC_MINSTREL int rc80211_minstrel_init(void); void rc80211_minstrel_exit(void); #else static inline int rc80211_minstrel_init(void) { return 0; } static inline void rc80211_minstrel_exit(void) { } #endif #endif /* IEEE80211_RATE_H */
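/*
 * Illustrative sketch, not part of rate.h above: the rough shape of a
 * rate-control algorithm as it plugs into the hooks declared here. The
 * rate_control_ops layout, ieee80211_rate_control_register() and
 * rate_control_send_low() are assumed from <net/mac80211.h>; the
 * "example_rc" names and the trivial policy are made up, and a real
 * algorithm would also implement tx_status()/rate_init() and keep
 * per-station statistics.
 */
#include <linux/slab.h>
#include <net/mac80211.h>

struct example_rc_sta {
	u32 tx_attempts;	/* a real algorithm would track much more */
};

static void *example_rc_alloc(struct ieee80211_hw *hw)
{
	return kzalloc(sizeof(int), GFP_KERNEL);	/* per-hw private data */
}

static void example_rc_free(void *priv)
{
	kfree(priv);
}

static void *example_rc_alloc_sta(void *priv, struct ieee80211_sta *sta,
				  gfp_t gfp)
{
	return kzalloc(sizeof(struct example_rc_sta), gfp);
}

static void example_rc_free_sta(void *priv, struct ieee80211_sta *sta,
				void *priv_sta)
{
	kfree(priv_sta);
}

static void example_rc_get_rate(void *priv, struct ieee80211_sta *sta,
				void *priv_sta,
				struct ieee80211_tx_rate_control *txrc)
{
	/* Always fall back to the lowest usable rate. */
	rate_control_send_low(sta, priv_sta, txrc);
}

static const struct rate_control_ops example_rc_ops = {
	.name		= "example_rc",
	.alloc		= example_rc_alloc,
	.free		= example_rc_free,
	.alloc_sta	= example_rc_alloc_sta,
	.free_sta	= example_rc_free_sta,
	.get_rate	= example_rc_get_rate,
};

/* Registered from module init with ieee80211_rate_control_register(&example_rc_ops)
 * and removed again with ieee80211_rate_control_unregister(&example_rc_ops). */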
1681 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/pagevec.h * * In many places it is efficient to batch an operation up against multiple * folios. A folio_batch is a container which is used for that. */ #ifndef _LINUX_PAGEVEC_H #define _LINUX_PAGEVEC_H #include <linux/types.h> /* 31 pointers + header align the folio_batch structure to a power of two */ #define PAGEVEC_SIZE 31 struct folio; /** * struct folio_batch - A collection of folios. * * The folio_batch is used to amortise the cost of retrieving and * operating on a set of folios. The order of folios in the batch may be * significant (eg delete_from_page_cache_batch()). Some users of the * folio_batch store "exceptional" entries in it which can be removed * by calling folio_batch_remove_exceptionals(). */ struct folio_batch { unsigned char nr; unsigned char i; bool percpu_pvec_drained; struct folio *folios[PAGEVEC_SIZE]; }; /** * folio_batch_init() - Initialise a batch of folios * @fbatch: The folio batch. * * A freshly initialised folio_batch contains zero folios. */ static inline void folio_batch_init(struct folio_batch *fbatch) { fbatch->nr = 0; fbatch->i = 0; fbatch->percpu_pvec_drained = false; } static inline void folio_batch_reinit(struct folio_batch *fbatch) { fbatch->nr = 0; fbatch->i = 0; } static inline unsigned int folio_batch_count(const struct folio_batch *fbatch) { return fbatch->nr; } static inline unsigned int folio_batch_space(const struct folio_batch *fbatch) { return PAGEVEC_SIZE - fbatch->nr; } /** * folio_batch_add() - Add a folio to a batch. * @fbatch: The folio batch. * @folio: The folio to add. * * The folio is added to the end of the batch. * The batch must have previously been initialised using folio_batch_init(). * * Return: The number of slots still available. */ static inline unsigned folio_batch_add(struct folio_batch *fbatch, struct folio *folio) { fbatch->folios[fbatch->nr++] = folio; return folio_batch_space(fbatch); } /** * folio_batch_next - Return the next folio to process. * @fbatch: The folio batch being processed. * * Use this function to implement a queue of folios. * * Return: The next folio in the queue, or NULL if the queue is empty. */ static inline struct folio *folio_batch_next(struct folio_batch *fbatch) { if (fbatch->i == fbatch->nr) return NULL; return fbatch->folios[fbatch->i++]; } void __folio_batch_release(struct folio_batch *pvec); static inline void folio_batch_release(struct folio_batch *fbatch) { if (folio_batch_count(fbatch)) __folio_batch_release(fbatch); } void folio_batch_remove_exceptionals(struct folio_batch *fbatch); #endif /* _LINUX_PAGEVEC_H */
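/*
 * Illustrative sketch, not part of pagevec.h above: the canonical fill/drain
 * pattern for a folio_batch. release_folios_example() is a made-up caller;
 * only the folio_batch_*() helpers declared above are used.
 */
#include <linux/pagevec.h>

static void release_folios_example(struct folio **folios, unsigned int nr)
{
	struct folio_batch fbatch;
	unsigned int i;

	folio_batch_init(&fbatch);

	for (i = 0; i < nr; i++) {
		/*
		 * folio_batch_add() returns the slots still free; when the
		 * batch is full, drop the accumulated references in one go.
		 */
		if (!folio_batch_add(&fbatch, folios[i]))
			folio_batch_release(&fbatch);
	}

	/* Release whatever remains in the final partial batch. */
	folio_batch_release(&fbatch);
}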
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 /* SPDX-License-Identifier: GPL-2.0 */ /* * BlueZ - Bluetooth protocol stack for Linux * * Copyright (C) 2021 Intel Corporation */ #include <linux/unaligned.h> void eir_create(struct hci_dev *hdev, u8 *data); u8 eir_create_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr, u8 size); u8 eir_create_scan_rsp(struct hci_dev *hdev, u8 instance, u8 *ptr); u8 eir_create_per_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr); u8 eir_append_local_name(struct hci_dev *hdev, u8 *eir, u8 ad_len); u8 eir_append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len); u8 eir_append_service_data(u8 *eir, u16 eir_len, u16 uuid, u8 *data, u8 data_len); static inline u16 eir_precalc_len(u8 data_len) { return sizeof(u8) * 2 + data_len; } static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, u8 data_len) { eir[eir_len++] = sizeof(type) + data_len; eir[eir_len++] = type; memcpy(&eir[eir_len], data, data_len); eir_len += data_len; return eir_len; } static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data) { eir[eir_len++] = sizeof(type) + sizeof(data); eir[eir_len++] = type; put_unaligned_le16(data, &eir[eir_len]); eir_len += sizeof(data); return eir_len; } static inline u16 eir_skb_put_data(struct sk_buff *skb, u8 type, u8 *data, u8 data_len) { u8 *eir; u16 eir_len; eir_len = eir_precalc_len(data_len); eir = skb_put(skb, eir_len); WARN_ON(sizeof(type) + data_len > U8_MAX); eir[0] = sizeof(type) + data_len; eir[1] = type; memcpy(&eir[2], data, data_len); return eir_len; } static inline void *eir_get_data(u8 *eir, size_t eir_len, u8 type, size_t *data_len) { size_t parsed = 0; if (eir_len < 2) return NULL; while (parsed < eir_len - 1) { u8 field_len = eir[0]; if (field_len == 0) break; parsed += field_len + 1; if (parsed > eir_len) break; if (eir[1] != type) { eir += field_len + 1; continue; } /* Zero length data */ if (field_len == 1) return NULL; if (data_len) *data_len = field_len - 1; return &eir[2]; } return NULL; } void *eir_get_service_data(u8 *eir, size_t eir_len, u16 uuid, size_t *len);
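/*
 * Illustrative sketch, not part of the header above: composing a small EIR
 * blob with the append helpers declared above and reading a field back with
 * eir_get_data(). build_example_eir() and parse_example_eir() are made up;
 * EIR_NAME_COMPLETE and EIR_APPEARANCE are the usual assigned type numbers
 * from <net/bluetooth/hci.h>, and the buffer is assumed to be large enough
 * since the helpers do no bounds checking.
 */
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci.h>

static u16 build_example_eir(u8 *eir)
{
	const char *name = "example";
	u16 eir_len = 0;

	/* Each helper emits <len><type><payload> and returns the new length. */
	eir_len = eir_append_data(eir, eir_len, EIR_NAME_COMPLETE,
				  (u8 *)name, strlen(name));
	eir_len = eir_append_le16(eir, eir_len, EIR_APPEARANCE, 0x0040);

	return eir_len;
}

static void parse_example_eir(u8 *eir, u16 eir_len)
{
	size_t name_len;
	u8 *name = eir_get_data(eir, eir_len, EIR_NAME_COMPLETE, &name_len);

	if (name)
		pr_info("device name: %.*s\n", (int)name_len, name);
}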
303 17 27 27 27 14 14 14 14 27 27 27 11 17 27 14 14 14 14 14 14 14 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_TC_PED_H #define __NET_TC_PED_H #include <net/act_api.h> #include <linux/tc_act/tc_pedit.h> #include <linux/types.h> struct tcf_pedit_key_ex { enum pedit_header_type htype; enum pedit_cmd cmd; }; struct tcf_pedit_parms { struct tc_pedit_key *tcfp_keys; struct tcf_pedit_key_ex *tcfp_keys_ex; int action; u32 tcfp_off_max_hint; unsigned char tcfp_nkeys; unsigned char tcfp_flags; struct rcu_head rcu; }; struct tcf_pedit { struct tc_action common; struct tcf_pedit_parms __rcu *parms; }; #define to_pedit(a) ((struct tcf_pedit *)a) #define to_pedit_parms(a) (rcu_dereference(to_pedit(a)->parms)) static inline bool is_tcf_pedit(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT if (a->ops && a->ops->id == TCA_ID_PEDIT) return true; #endif return false; } static inline int tcf_pedit_nkeys(const struct tc_action *a) { struct tcf_pedit_parms *parms; int nkeys; rcu_read_lock(); parms = to_pedit_parms(a); nkeys = parms->tcfp_nkeys; rcu_read_unlock(); return nkeys; } static inline u32 tcf_pedit_htype(const struct tc_action *a, int index) { u32 htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; struct tcf_pedit_parms *parms; rcu_read_lock(); parms = to_pedit_parms(a); if (parms->tcfp_keys_ex) htype = parms->tcfp_keys_ex[index].htype; rcu_read_unlock(); return htype; } static inline u32 tcf_pedit_cmd(const struct tc_action *a, int index) { struct tcf_pedit_parms *parms; u32 cmd = __PEDIT_CMD_MAX; rcu_read_lock(); parms = to_pedit_parms(a); if (parms->tcfp_keys_ex) cmd = parms->tcfp_keys_ex[index].cmd; rcu_read_unlock(); return cmd; } static inline u32 tcf_pedit_mask(const struct tc_action *a, int index) { struct tcf_pedit_parms *parms; u32 mask; rcu_read_lock(); parms = to_pedit_parms(a); mask = parms->tcfp_keys[index].mask; rcu_read_unlock(); return mask; } static inline u32 tcf_pedit_val(const struct tc_action *a, int index) { struct tcf_pedit_parms *parms; u32 val; rcu_read_lock(); parms = to_pedit_parms(a); val = parms->tcfp_keys[index].val; rcu_read_unlock(); return val; } static inline u32 tcf_pedit_offset(const struct tc_action *a, int index) { struct tcf_pedit_parms *parms; u32 off; rcu_read_lock(); parms = to_pedit_parms(a); off = parms->tcfp_keys[index].off; rcu_read_unlock(); return off; } #endif /* __NET_TC_PED_H */
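/*
 * Illustrative sketch, not part of the header above: how a driver offloading
 * a pedit action would walk its keys with the accessors declared above.
 * example_offload_pedit() is made up; the tcf_pedit_*() helpers and
 * TCA_PEDIT_KEY_EX_CMD_SET come from the pedit headers.
 */
#include <linux/errno.h>
#include <net/tc_act/tc_pedit.h>

static int example_offload_pedit(const struct tc_action *act)
{
	int i;

	if (!is_tcf_pedit(act))
		return -EOPNOTSUPP;

	for (i = 0; i < tcf_pedit_nkeys(act); i++) {
		u32 htype = tcf_pedit_htype(act, i);	/* which header to touch */
		u32 cmd = tcf_pedit_cmd(act, i);	/* SET or ADD */
		u32 mask = tcf_pedit_mask(act, i);	/* bits to preserve */
		u32 val = tcf_pedit_val(act, i);	/* bits to write */
		u32 off = tcf_pedit_offset(act, i);	/* byte offset in header */

		if (cmd != TCA_PEDIT_KEY_EX_CMD_SET)
			return -EOPNOTSUPP;

		pr_debug("pedit: htype %u off %u val %08x mask %08x\n",
			 htype, off, val, mask);
		/* ...program the hardware rewrite entry here... */
	}

	return 0;
}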
2217 10 2214 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 // SPDX-License-Identifier: GPL-2.0-only /* * The "user cache". * * (C) Copyright 1991-2000 Linus Torvalds * * We have a per-user structure to keep track of how many * processes, files etc the user has claimed, in order to be * able to have per-user limits for system resources. */ #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/bitops.h> #include <linux/key.h> #include <linux/sched/user.h> #include <linux/interrupt.h> #include <linux/export.h> #include <linux/user_namespace.h> #include <linux/binfmts.h> #include <linux/proc_ns.h> #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc init_binfmt_misc = { .entries = LIST_HEAD_INIT(init_binfmt_misc.entries), .enabled = true, .entries_lock = __RW_LOCK_UNLOCKED(init_binfmt_misc.entries_lock), }; EXPORT_SYMBOL_GPL(init_binfmt_misc); #endif /* * userns count is 1 for root user, 1 for init_uts_ns, * and 1 for... ? */ struct user_namespace init_user_ns = { .ns = NS_COMMON_INIT(init_user_ns), .uid_map = { { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, .nr_extents = 1, }, }, .gid_map = { { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, .nr_extents = 1, }, }, .projid_map = { { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, .nr_extents = 1, }, }, .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .flags = USERNS_INIT_FLAGS, #ifdef CONFIG_KEYS .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), .keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem), #endif #if IS_ENABLED(CONFIG_BINFMT_MISC) .binfmt_misc = &init_binfmt_misc, #endif }; EXPORT_SYMBOL_GPL(init_user_ns); /* * UID task count cache, to get fast user lookup in "alloc_uid" * when changing user ID's (ie setuid() and friends). */ #define UIDHASH_BITS (IS_ENABLED(CONFIG_BASE_SMALL) ? 3 : 7) #define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) #define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) static struct kmem_cache *uid_cachep; static struct hlist_head uidhash_table[UIDHASH_SZ]; /* * The uidhash_lock is mostly taken from process context, but it is * occasionally also taken from softirq/tasklet context, when * task-structs get RCU-freed. Hence all locking must be softirq-safe. 
* But free_uid() is also called with local interrupts disabled, and running * local_bh_enable() with local interrupts disabled is an error - we'll run * softirq callbacks, and they can unconditionally enable interrupts, and * the caller of free_uid() didn't expect that.. */ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { .__count = REFCOUNT_INIT(1), .uid = GLOBAL_ROOT_UID, .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), }; /* * These routines must be called with the uidhash spinlock held! */ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) { hlist_add_head(&up->uidhash_node, hashent); } static void uid_hash_remove(struct user_struct *up) { hlist_del_init(&up->uidhash_node); } static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) { struct user_struct *user; hlist_for_each_entry(user, hashent, uidhash_node) { if (uid_eq(user->uid, uid)) { refcount_inc(&user->__count); return user; } } return NULL; } static int user_epoll_alloc(struct user_struct *up) { #ifdef CONFIG_EPOLL return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL); #else return 0; #endif } static void user_epoll_free(struct user_struct *up) { #ifdef CONFIG_EPOLL percpu_counter_destroy(&up->epoll_watches); #endif } /* IRQs are disabled and uidhash_lock is held upon function entry. * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. */ static void free_user(struct user_struct *up, unsigned long flags) __releases(&uidhash_lock) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); user_epoll_free(up); kmem_cache_free(uid_cachep, up); } /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). * * If the user_struct could not be found, return NULL. */ struct user_struct *find_user(kuid_t uid) { struct user_struct *ret; unsigned long flags; spin_lock_irqsave(&uidhash_lock, flags); ret = uid_hash_find(uid, uidhashentry(uid)); spin_unlock_irqrestore(&uidhash_lock, flags); return ret; } void free_uid(struct user_struct *up) { unsigned long flags; if (!up) return; if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags)) free_user(up, flags); } EXPORT_SYMBOL_GPL(free_uid); struct user_struct *alloc_uid(kuid_t uid) { struct hlist_head *hashent = uidhashentry(uid); struct user_struct *up, *new; spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); spin_unlock_irq(&uidhash_lock); if (!up) { new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); if (!new) return NULL; new->uid = uid; refcount_set(&new->__count, 1); if (user_epoll_alloc(new)) { kmem_cache_free(uid_cachep, new); return NULL; } ratelimit_state_init(&new->ratelimit, HZ, 100); ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); /* * Before adding this, check whether we raced * on adding the same user already.. 
*/ spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { user_epoll_free(new); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); up = new; } spin_unlock_irq(&uidhash_lock); } return up; } static int __init uid_cache_init(void) { int n; uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(uidhash_table + n); if (user_epoll_alloc(&root_user)) panic("root_user epoll percpu counter alloc failed"); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); spin_unlock_irq(&uidhash_lock); return 0; } subsys_initcall(uid_cache_init);
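/*
 * Illustrative sketch, not part of user.c above: the reference-counting
 * contract of the lookup API. user_is_known() is a made-up caller;
 * find_user() and free_uid() are the functions defined above.
 */
#include <linux/sched/user.h>
#include <linux/uidgid.h>

static bool user_is_known(kuid_t uid)
{
	/* find_user() returns the user_struct with an extra reference... */
	struct user_struct *up = find_user(uid);

	if (!up)
		return false;

	/* ...which the caller must drop again with free_uid(). */
	free_uid(up);
	return true;
}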
// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match connection tracking information. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
 */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/netfilter/nf_conntrack.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_state.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
MODULE_DESCRIPTION("ip[6]_tables connection tracking state match module");
MODULE_ALIAS("ipt_state");
MODULE_ALIAS("ip6t_state");

static bool state_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
	const struct xt_state_info *sinfo = par->matchinfo;
	enum ip_conntrack_info ctinfo;
	unsigned int statebit;
	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

	if (ct)
		statebit = XT_STATE_BIT(ctinfo);
	else if (ctinfo == IP_CT_UNTRACKED)
		statebit = XT_STATE_UNTRACKED;
	else
		statebit = XT_STATE_INVALID;

	return (sinfo->statemask & statebit);
}

static int state_mt_check(const struct xt_mtchk_param *par)
{
	int ret;

	ret = nf_ct_netns_get(par->net, par->family);
	if (ret < 0)
		pr_info_ratelimited("cannot load conntrack support for proto=%u\n",
				    par->family);
	return ret;
}

static void state_mt_destroy(const struct xt_mtdtor_param *par)
{
	nf_ct_netns_put(par->net, par->family);
}

static struct xt_match state_mt_reg __read_mostly = {
	.name		= "state",
	.family		= NFPROTO_UNSPEC,
	.checkentry	= state_mt_check,
	.match		= state_mt,
	.destroy	= state_mt_destroy,
	.matchsize	= sizeof(struct xt_state_info),
	.me		= THIS_MODULE,
};

static int __init state_mt_init(void)
{
	return xt_register_match(&state_mt_reg);
}

static void __exit state_mt_exit(void)
{
	xt_unregister_match(&state_mt_reg);
}

module_init(state_mt_init);
module_exit(state_mt_exit);
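/*
 * Illustrative sketch, not part of xt_state.c above: how the statemask tested
 * by state_mt() is composed on the ruleset side. fill_example_match() is made
 * up; XT_STATE_BIT(), XT_STATE_INVALID and XT_STATE_UNTRACKED come from
 * <linux/netfilter/xt_state.h>, and the IP_CT_* states from the conntrack
 * UAPI header.
 */
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/xt_state.h>

static void fill_example_match(struct xt_state_info *sinfo)
{
	/* Equivalent of "-m state --state ESTABLISHED,RELATED". */
	sinfo->statemask = XT_STATE_BIT(IP_CT_ESTABLISHED) |
			   XT_STATE_BIT(IP_CT_RELATED);

	/*
	 * Adding XT_STATE_BIT(IP_CT_NEW), XT_STATE_INVALID or
	 * XT_STATE_UNTRACKED would also accept new, invalid or untracked
	 * connections, respectively.
	 */
}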
// SPDX-License-Identifier: GPL-2.0-or-later
/* Filesystem parameter parser.
 *
 * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/namei.h>
#include "internal.h"

const struct constant_table bool_names[] = {
	{ "0",		false },
	{ "1",		true },
	{ "false",	false },
	{ "no",		false },
	{ "true",	true },
	{ "yes",	true },
	{ },
};
EXPORT_SYMBOL(bool_names);

static const struct constant_table *
__lookup_constant(const struct constant_table *tbl, const char *name)
{
	for ( ; tbl->name; tbl++)
		if (strcmp(name, tbl->name) == 0)
			return tbl;
	return NULL;
}

/**
 * lookup_constant - Look up a constant by name in an ordered table
 * @tbl: The table of constants to search.
 * @name: The name to look up.
 * @not_found: The value to return if the name is not found.
 */
int lookup_constant(const struct constant_table *tbl, const char *name, int not_found)
{
	const struct constant_table *p = __lookup_constant(tbl, name);

	return p ?
p->value : not_found; } EXPORT_SYMBOL(lookup_constant); static inline bool is_flag(const struct fs_parameter_spec *p) { return p->type == NULL; } static const struct fs_parameter_spec *fs_lookup_key( const struct fs_parameter_spec *desc, struct fs_parameter *param, bool *negated) { const struct fs_parameter_spec *p, *other = NULL; const char *name = param->key; bool want_flag = param->type == fs_value_is_flag; *negated = false; for (p = desc; p->name; p++) { if (strcmp(p->name, name) != 0) continue; if (likely(is_flag(p) == want_flag)) return p; other = p; } if (want_flag) { if (name[0] == 'n' && name[1] == 'o' && name[2]) { for (p = desc; p->name; p++) { if (strcmp(p->name, name + 2) != 0) continue; if (!(p->flags & fs_param_neg_with_no)) continue; *negated = true; return p; } } } return other; } /* * __fs_parse - Parse a filesystem configuration parameter * @log: The filesystem context to log errors through. * @desc: The parameter description to use. * @param: The parameter. * @result: Where to place the result of the parse * * Parse a filesystem configuration parameter and attempt a conversion for a * simple parameter for which this is requested. If successful, the determined * parameter ID is placed into @result->key, the desired type is indicated in * @result->t and any converted value is placed into an appropriate member of * the union in @result. * * The function returns the parameter number if the parameter was matched, * -ENOPARAM if it wasn't matched and @desc->ignore_unknown indicated that * unknown parameters are okay and -EINVAL if there was a conversion issue or * the parameter wasn't recognised and unknowns aren't okay. */ int __fs_parse(struct p_log *log, const struct fs_parameter_spec *desc, struct fs_parameter *param, struct fs_parse_result *result) { const struct fs_parameter_spec *p; result->uint_64 = 0; p = fs_lookup_key(desc, param, &result->negated); if (!p) return -ENOPARAM; if (p->flags & fs_param_deprecated) warn_plog(log, "Deprecated parameter '%s'", param->key); /* Try to turn the type we were given into the type desired by the * parameter and give an error if we can't. */ if (is_flag(p)) { if (param->type != fs_value_is_flag) return inval_plog(log, "Unexpected value for '%s'", param->key); result->boolean = !result->negated; } else { int ret = p->type(log, p, param, result); if (ret) return ret; } return p->opt; } EXPORT_SYMBOL(__fs_parse); /** * fs_lookup_param - Look up a path referred to by a parameter * @fc: The filesystem context to log errors through. * @param: The parameter. 
* @want_bdev: T if want a blockdev * @flags: Pathwalk flags passed to filename_lookup() * @_path: The result of the lookup */ int fs_lookup_param(struct fs_context *fc, struct fs_parameter *param, bool want_bdev, unsigned int flags, struct path *_path) { struct filename *f; bool put_f; int ret; switch (param->type) { case fs_value_is_string: f = getname_kernel(param->string); if (IS_ERR(f)) return PTR_ERR(f); param->dirfd = AT_FDCWD; put_f = true; break; case fs_value_is_filename: f = param->name; put_f = false; break; default: return invalf(fc, "%s: not usable as path", param->key); } ret = filename_lookup(param->dirfd, f, flags, _path, NULL); if (ret < 0) { errorf(fc, "%s: Lookup failure for '%s'", param->key, f->name); goto out; } if (want_bdev && !S_ISBLK(d_backing_inode(_path->dentry)->i_mode)) { path_put(_path); _path->dentry = NULL; _path->mnt = NULL; errorf(fc, "%s: Non-blockdev passed as '%s'", param->key, f->name); ret = -ENOTBLK; } out: if (put_f) putname(f); return ret; } EXPORT_SYMBOL(fs_lookup_param); static int fs_param_bad_value(struct p_log *log, struct fs_parameter *param) { return inval_plog(log, "Bad value for '%s'", param->key); } int fs_param_is_bool(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { int b; if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; b = lookup_constant(bool_names, param->string, -1); if (b == -1) return fs_param_bad_value(log, param); result->boolean = b; return 0; } EXPORT_SYMBOL(fs_param_is_bool); int fs_param_is_u32(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { int base = (unsigned long)p->data; if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; if (kstrtouint(param->string, base, &result->uint_32) < 0) return fs_param_bad_value(log, param); return 0; } EXPORT_SYMBOL(fs_param_is_u32); int fs_param_is_s32(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; if (kstrtoint(param->string, 0, &result->int_32) < 0) return fs_param_bad_value(log, param); return 0; } EXPORT_SYMBOL(fs_param_is_s32); int fs_param_is_u64(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; if (kstrtoull(param->string, 0, &result->uint_64) < 0) return fs_param_bad_value(log, param); return 0; } EXPORT_SYMBOL(fs_param_is_u64); int fs_param_is_enum(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { const struct constant_table *c; if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; c = __lookup_constant(p->data, param->string); if (!c) return fs_param_bad_value(log, param); result->uint_32 = c->value; return 0; } EXPORT_SYMBOL(fs_param_is_enum); int fs_param_is_string(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { 
if (param->type != fs_value_is_string || (!*param->string && !(p->flags & fs_param_can_be_empty))) return fs_param_bad_value(log, param); return 0; } EXPORT_SYMBOL(fs_param_is_string); int fs_param_is_blob(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { if (param->type != fs_value_is_blob) return fs_param_bad_value(log, param); return 0; } EXPORT_SYMBOL(fs_param_is_blob); int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { switch (param->type) { case fs_value_is_string: if ((!*param->string && !(p->flags & fs_param_can_be_empty)) || kstrtouint(param->string, 0, &result->uint_32) < 0) break; if (result->uint_32 <= INT_MAX) return 0; break; case fs_value_is_file: result->uint_32 = param->dirfd; if (result->uint_32 <= INT_MAX) return 0; break; default: break; } return fs_param_bad_value(log, param); } EXPORT_SYMBOL(fs_param_is_fd); int fs_param_is_file_or_string(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { switch (param->type) { case fs_value_is_string: return fs_param_is_string(log, p, param, result); case fs_value_is_file: result->uint_32 = param->dirfd; if (result->uint_32 <= INT_MAX) return 0; break; default: break; } return fs_param_bad_value(log, param); } EXPORT_SYMBOL(fs_param_is_file_or_string); int fs_param_is_uid(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { kuid_t uid; if (fs_param_is_u32(log, p, param, result) != 0) return fs_param_bad_value(log, param); uid = make_kuid(current_user_ns(), result->uint_32); if (!uid_valid(uid)) return inval_plog(log, "Invalid uid '%s'", param->string); result->uid = uid; return 0; } EXPORT_SYMBOL(fs_param_is_uid); int fs_param_is_gid(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { kgid_t gid; if (fs_param_is_u32(log, p, param, result) != 0) return fs_param_bad_value(log, param); gid = make_kgid(current_user_ns(), result->uint_32); if (!gid_valid(gid)) return inval_plog(log, "Invalid gid '%s'", param->string); result->gid = gid; return 0; } EXPORT_SYMBOL(fs_param_is_gid); int fs_param_is_blockdev(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { return 0; } EXPORT_SYMBOL(fs_param_is_blockdev); int fs_param_is_path(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { return 0; } EXPORT_SYMBOL(fs_param_is_path); #ifdef CONFIG_VALIDATE_FS_PARSER /** * fs_validate_description - Validate a parameter specification array * @name: Owner name of the parameter specification array * @desc: The parameter specification array to validate. */ bool fs_validate_description(const char *name, const struct fs_parameter_spec *desc) { const struct fs_parameter_spec *param, *p2; bool good = true; for (param = desc; param->name; param++) { /* Check for duplicate parameter names */ for (p2 = desc; p2 < param; p2++) { if (strcmp(param->name, p2->name) == 0) { if (is_flag(param) != is_flag(p2)) continue; pr_err("VALIDATE %s: PARAM[%s]: Duplicate\n", name, param->name); good = false; } } } return good; } #endif /* CONFIG_VALIDATE_FS_PARSER */
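/*
 * Illustrative sketch, not part of fs_parser.c above: how a filesystem
 * normally consumes this parser. The "examplefs" names and options are made
 * up; fsparam_flag()/fsparam_u32()/fsparam_enum() and fs_parse() are the
 * helpers declared in <linux/fs_parser.h>.
 */
#include <linux/fs_context.h>
#include <linux/fs_parser.h>

enum { Opt_ro, Opt_blocksize, Opt_errors };

static const struct constant_table examplefs_errors[] = {
	{ "continue",	0 },
	{ "panic",	1 },
	{}
};

static const struct fs_parameter_spec examplefs_parameters[] = {
	fsparam_flag	("ro",		Opt_ro),
	fsparam_u32	("blocksize",	Opt_blocksize),
	fsparam_enum	("errors",	Opt_errors, examplefs_errors),
	{}
};

static int examplefs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct fs_parse_result result;
	int opt;

	/* fs_parse() maps the key, converts the value and returns the opt id. */
	opt = fs_parse(fc, examplefs_parameters, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_ro:
		/* a flag carries no value */
		break;
	case Opt_blocksize:
		/* result.uint_32 holds the converted number */
		break;
	case Opt_errors:
		/* result.uint_32 holds the matched constant_table value */
		break;
	}
	return 0;
}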
/* * netfilter module to limit the number of parallel tcp * connections per IP address. * (c) 2000 Gerd Knorr <kraxel@bytesex.org> * Nov 2002: Martin Bene <martin.bene@icomedias.com>: * only ignore TIME_WAIT or gone connections * (C) CC Computer Consultants GmbH, 2007 * * based on ... * * Kernel module to match connection tracking information. * GPL (C) 1999 Rusty Russell (rusty@rustcorp.com.au). */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_connlimit.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_count.h> static bool connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = xt_net(par); const struct xt_connlimit_info *info = par->matchinfo; const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; unsigned int connections; u32 key[5]; ct = nf_ct_get(skb, &ctinfo); if (ct) zone = nf_ct_zone(ct); if (xt_family(par) == NFPROTO_IPV6) { const struct ipv6hdr *iph = ipv6_hdr(skb); union nf_inet_addr addr; unsigned int i; memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ? &iph->daddr : &iph->saddr, sizeof(addr.ip6)); for (i = 0; i < ARRAY_SIZE(addr.ip6); ++i) addr.ip6[i] &= info->mask.ip6[i]; memcpy(key, &addr, sizeof(addr.ip6)); key[4] = zone->id; } else { const struct iphdr *iph = ip_hdr(skb); key[0] = (info->flags & XT_CONNLIMIT_DADDR) ?
(__force __u32)iph->daddr : (__force __u32)iph->saddr; key[0] &= (__force __u32)info->mask.ip; key[1] = zone->id; } connections = nf_conncount_count_skb(net, skb, xt_family(par), info->data, key); if (connections == 0) /* kmalloc failed or tuple couldn't be found, drop it entirely */ goto hotdrop; return (connections > info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT); hotdrop: par->hotdrop = true; return false; } static int connlimit_mt_check(const struct xt_mtchk_param *par) { struct xt_connlimit_info *info = par->matchinfo; unsigned int keylen; int ret; keylen = sizeof(u32); if (par->family == NFPROTO_IPV6) keylen += sizeof(struct in6_addr); else keylen += sizeof(struct in_addr); ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); return ret; } /* init private data */ info->data = nf_conncount_init(par->net, keylen); if (IS_ERR(info->data)) nf_ct_netns_put(par->net, par->family); return PTR_ERR_OR_ZERO(info->data); } static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_connlimit_info *info = par->matchinfo; nf_conncount_destroy(par->net, info->data); nf_ct_netns_put(par->net, par->family); } static struct xt_match connlimit_mt_reg[] __read_mostly = { { .name = "connlimit", .revision = 1, .family = NFPROTO_IPV4, .checkentry = connlimit_mt_check, .match = connlimit_mt, .matchsize = sizeof(struct xt_connlimit_info), .usersize = offsetof(struct xt_connlimit_info, data), .destroy = connlimit_mt_destroy, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "connlimit", .revision = 1, .family = NFPROTO_IPV6, .checkentry = connlimit_mt_check, .match = connlimit_mt, .matchsize = sizeof(struct xt_connlimit_info), .usersize = offsetof(struct xt_connlimit_info, data), .destroy = connlimit_mt_destroy, .me = THIS_MODULE, }, #endif }; static int __init connlimit_mt_init(void) { return xt_register_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg)); } static void __exit connlimit_mt_exit(void) { xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg)); } module_init(connlimit_mt_init); module_exit(connlimit_mt_exit); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: Number of connections matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_connlimit"); MODULE_ALIAS("ip6t_connlimit");
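For reference, this match is driven from userspace through iptables; a typical rule might look like "iptables -A INPUT -p tcp --syn --dport 22 -m connlimit --connlimit-above 3 -j REJECT", which rejects new TCP connections once a source address already holds more than three. Options such as --connlimit-mask and --connlimit-daddr select the address bits and direction that end up in the key[] built by connlimit_mt() above (illustrative invocation; consult iptables(8) for the exact option set of a given version).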
// SPDX-License-Identifier: GPL-2.0-or-later /* * CCM: Counter with CBC-MAC * * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> */ #include <crypto/internal/aead.h> #include <crypto/internal/cipher.h> #include <crypto/internal/hash.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <crypto/utils.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/string.h> struct ccm_instance_ctx { struct crypto_skcipher_spawn ctr; struct crypto_ahash_spawn mac; }; struct crypto_ccm_ctx { struct crypto_ahash *mac; struct crypto_skcipher *ctr; }; struct crypto_rfc4309_ctx { struct crypto_aead *child; u8 nonce[3]; }; struct crypto_rfc4309_req_ctx { struct scatterlist src[3]; struct scatterlist dst[3]; struct aead_request subreq; }; struct crypto_ccm_req_priv_ctx { u8 odata[16]; u8 idata[16]; u8 auth_tag[16]; u32 flags; struct scatterlist src[3]; struct scatterlist dst[3]; union { struct ahash_request ahreq; struct skcipher_request skreq; }; }; struct cbcmac_tfm_ctx { struct crypto_cipher *child; }; static inline struct crypto_ccm_req_priv_ctx *crypto_ccm_reqctx( struct aead_request *req) { unsigned long align = crypto_aead_alignmask(crypto_aead_reqtfm(req)); return (void *)PTR_ALIGN((u8 *)aead_request_ctx(req), align + 1); } static int set_msg_len(u8 *block, unsigned int msglen, int csize) { __be32 data; memset(block, 0, csize); block += csize; if (csize >= 4) csize = 4; else if (msglen > (1 << (8 * csize))) return -EOVERFLOW; data = cpu_to_be32(msglen); memcpy(block - csize, (u8 *)&data + 4 - csize, csize); return 0; } static int crypto_ccm_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead); struct crypto_skcipher *ctr = ctx->ctr; struct crypto_ahash *mac = ctx->mac; int err; crypto_skcipher_clear_flags(ctr, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(ctr, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(ctr, key, keylen); if (err) return err; crypto_ahash_clear_flags(mac, CRYPTO_TFM_REQ_MASK); crypto_ahash_set_flags(mac, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); return crypto_ahash_setkey(mac, key, keylen); } static int crypto_ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) { switch (authsize) { case 4: case 6: case 8: case 10: case 12: case 14: case 16: break; default: return -EINVAL; } return 0; } static int format_input(u8 *info, struct aead_request *req, unsigned int cryptlen) { struct crypto_aead *aead = crypto_aead_reqtfm(req); unsigned int lp = req->iv[0]; unsigned int l = lp + 1; unsigned int m; m = crypto_aead_authsize(aead); memcpy(info, req->iv, 16); /* format control info per RFC 3610 and * NIST Special Publication 800-38C */ *info |= (8 * ((m - 2) / 2)); if (req->assoclen) *info |= 64; return set_msg_len(info + 16 - l, cryptlen, l); } static int format_adata(u8 *adata, unsigned int a) { int len = 0; /* add control info for associated data * RFC 3610 and NIST Special Publication 800-38C */ if (a < 65280) { *(__be16 *)adata = cpu_to_be16(a); len = 2; } else { *(__be16 *)adata = cpu_to_be16(0xfffe); *(__be32 *)&adata[2] = cpu_to_be32(a); len = 6; } return len; } static int crypto_ccm_auth(struct aead_request *req, struct scatterlist *plain, unsigned int cryptlen) { struct crypto_ccm_req_priv_ctx *pctx =
crypto_ccm_reqctx(req); struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead); struct ahash_request *ahreq = &pctx->ahreq; unsigned int assoclen = req->assoclen; struct scatterlist sg[3]; u8 *odata = pctx->odata; u8 *idata = pctx->idata; int ilen, err; /* format control data for input */ err = format_input(odata, req, cryptlen); if (err) goto out; sg_init_table(sg, 3); sg_set_buf(&sg[0], odata, 16); /* format associated data and compute into mac */ if (assoclen) { ilen = format_adata(idata, assoclen); sg_set_buf(&sg[1], idata, ilen); sg_chain(sg, 3, req->src); } else { ilen = 0; sg_chain(sg, 2, req->src); } ahash_request_set_tfm(ahreq, ctx->mac); ahash_request_set_callback(ahreq, pctx->flags, NULL, NULL); ahash_request_set_crypt(ahreq, sg, NULL, assoclen + ilen + 16); err = crypto_ahash_init(ahreq); if (err) goto out; err = crypto_ahash_update(ahreq); if (err) goto out; /* we need to pad the MAC input to a round multiple of the block size */ ilen = 16 - (assoclen + ilen) % 16; if (ilen < 16) { memset(idata, 0, ilen); sg_init_table(sg, 2); sg_set_buf(&sg[0], idata, ilen); if (plain) sg_chain(sg, 2, plain); plain = sg; cryptlen += ilen; } ahash_request_set_crypt(ahreq, plain, odata, cryptlen); err = crypto_ahash_finup(ahreq); out: return err; } static void crypto_ccm_encrypt_done(void *data, int err) { struct aead_request *req = data; struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); u8 *odata = pctx->odata; if (!err) scatterwalk_map_and_copy(odata, req->dst, req->assoclen + req->cryptlen, crypto_aead_authsize(aead), 1); aead_request_complete(req, err); } static inline int crypto_ccm_check_iv(const u8 *iv) { /* 2 <= L <= 8, so 1 <= L' <= 7. */ if (1 > iv[0] || iv[0] > 7) return -EINVAL; return 0; } static int crypto_ccm_init_crypt(struct aead_request *req, u8 *tag) { struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct scatterlist *sg; u8 *iv = req->iv; int err; err = crypto_ccm_check_iv(iv); if (err) return err; pctx->flags = aead_request_flags(req); /* Note: rfc 3610 and NIST 800-38C require counter of * zero to encrypt auth tag. 
*/ memset(iv + 15 - iv[0], 0, iv[0] + 1); sg_init_table(pctx->src, 3); sg_set_buf(pctx->src, tag, 16); sg = scatterwalk_ffwd(pctx->src + 1, req->src, req->assoclen); if (sg != pctx->src + 1) sg_chain(pctx->src, 2, sg); if (req->src != req->dst) { sg_init_table(pctx->dst, 3); sg_set_buf(pctx->dst, tag, 16); sg = scatterwalk_ffwd(pctx->dst + 1, req->dst, req->assoclen); if (sg != pctx->dst + 1) sg_chain(pctx->dst, 2, sg); } return 0; } static int crypto_ccm_encrypt(struct aead_request *req) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead); struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct skcipher_request *skreq = &pctx->skreq; struct scatterlist *dst; unsigned int cryptlen = req->cryptlen; u8 *odata = pctx->odata; u8 *iv = req->iv; int err; err = crypto_ccm_init_crypt(req, odata); if (err) return err; err = crypto_ccm_auth(req, sg_next(pctx->src), cryptlen); if (err) return err; dst = pctx->src; if (req->src != req->dst) dst = pctx->dst; skcipher_request_set_tfm(skreq, ctx->ctr); skcipher_request_set_callback(skreq, pctx->flags, crypto_ccm_encrypt_done, req); skcipher_request_set_crypt(skreq, pctx->src, dst, cryptlen + 16, iv); err = crypto_skcipher_encrypt(skreq); if (err) return err; /* copy authtag to end of dst */ scatterwalk_map_and_copy(odata, sg_next(dst), cryptlen, crypto_aead_authsize(aead), 1); return err; } static void crypto_ccm_decrypt_done(void *data, int err) { struct aead_request *req = data; struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct crypto_aead *aead = crypto_aead_reqtfm(req); unsigned int authsize = crypto_aead_authsize(aead); unsigned int cryptlen = req->cryptlen - authsize; struct scatterlist *dst; pctx->flags = 0; dst = sg_next(req->src == req->dst ? 
pctx->src : pctx->dst); if (!err) { err = crypto_ccm_auth(req, dst, cryptlen); if (!err && crypto_memneq(pctx->auth_tag, pctx->odata, authsize)) err = -EBADMSG; } aead_request_complete(req, err); } static int crypto_ccm_decrypt(struct aead_request *req) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead); struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct skcipher_request *skreq = &pctx->skreq; struct scatterlist *dst; unsigned int authsize = crypto_aead_authsize(aead); unsigned int cryptlen = req->cryptlen; u8 *authtag = pctx->auth_tag; u8 *odata = pctx->odata; u8 *iv = pctx->idata; int err; cryptlen -= authsize; err = crypto_ccm_init_crypt(req, authtag); if (err) return err; scatterwalk_map_and_copy(authtag, sg_next(pctx->src), cryptlen, authsize, 0); dst = pctx->src; if (req->src != req->dst) dst = pctx->dst; memcpy(iv, req->iv, 16); skcipher_request_set_tfm(skreq, ctx->ctr); skcipher_request_set_callback(skreq, pctx->flags, crypto_ccm_decrypt_done, req); skcipher_request_set_crypt(skreq, pctx->src, dst, cryptlen + 16, iv); err = crypto_skcipher_decrypt(skreq); if (err) return err; err = crypto_ccm_auth(req, sg_next(dst), cryptlen); if (err) return err; /* verify */ if (crypto_memneq(authtag, odata, authsize)) return -EBADMSG; return err; } static int crypto_ccm_init_tfm(struct crypto_aead *tfm) { struct aead_instance *inst = aead_alg_instance(tfm); struct ccm_instance_ctx *ictx = aead_instance_ctx(inst); struct crypto_ccm_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_ahash *mac; struct crypto_skcipher *ctr; unsigned long align; int err; mac = crypto_spawn_ahash(&ictx->mac); if (IS_ERR(mac)) return PTR_ERR(mac); ctr = crypto_spawn_skcipher(&ictx->ctr); err = PTR_ERR(ctr); if (IS_ERR(ctr)) goto err_free_mac; ctx->mac = mac; ctx->ctr = ctr; align = crypto_aead_alignmask(tfm); align &= ~(crypto_tfm_ctx_alignment() - 1); crypto_aead_set_reqsize( tfm, align + sizeof(struct crypto_ccm_req_priv_ctx) + max(crypto_ahash_reqsize(mac), crypto_skcipher_reqsize(ctr))); return 0; err_free_mac: crypto_free_ahash(mac); return err; } static void crypto_ccm_exit_tfm(struct crypto_aead *tfm) { struct crypto_ccm_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_ahash(ctx->mac); crypto_free_skcipher(ctx->ctr); } static void crypto_ccm_free(struct aead_instance *inst) { struct ccm_instance_ctx *ctx = aead_instance_ctx(inst); crypto_drop_ahash(&ctx->mac); crypto_drop_skcipher(&ctx->ctr); kfree(inst); } static int crypto_ccm_create_common(struct crypto_template *tmpl, struct rtattr **tb, const char *ctr_name, const char *mac_name) { struct skcipher_alg_common *ctr; u32 mask; struct aead_instance *inst; struct ccm_instance_ctx *ictx; struct hash_alg_common *mac; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*ictx), GFP_KERNEL); if (!inst) return -ENOMEM; ictx = aead_instance_ctx(inst); err = crypto_grab_ahash(&ictx->mac, aead_crypto_instance(inst), mac_name, 0, mask | CRYPTO_ALG_ASYNC); if (err) goto err_free_inst; mac = crypto_spawn_ahash_alg(&ictx->mac); err = -EINVAL; if (strncmp(mac->base.cra_name, "cbcmac(", 7) != 0 || mac->digestsize != 16) goto err_free_inst; err = crypto_grab_skcipher(&ictx->ctr, aead_crypto_instance(inst), ctr_name, 0, mask); if (err) goto err_free_inst; ctr = crypto_spawn_skcipher_alg_common(&ictx->ctr); /* The skcipher algorithm must be CTR mode, using 16-byte blocks. 
*/ err = -EINVAL; if (strncmp(ctr->base.cra_name, "ctr(", 4) != 0 || ctr->ivsize != 16 || ctr->base.cra_blocksize != 1) goto err_free_inst; /* ctr and cbcmac must use the same underlying block cipher. */ if (strcmp(ctr->base.cra_name + 4, mac->base.cra_name + 7) != 0) goto err_free_inst; err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "ccm(%s", ctr->base.cra_name + 4) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "ccm_base(%s,%s)", ctr->base.cra_driver_name, mac->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = (mac->base.cra_priority + ctr->base.cra_priority) / 2; inst->alg.base.cra_blocksize = 1; inst->alg.base.cra_alignmask = ctr->base.cra_alignmask; inst->alg.ivsize = 16; inst->alg.chunksize = ctr->chunksize; inst->alg.maxauthsize = 16; inst->alg.base.cra_ctxsize = sizeof(struct crypto_ccm_ctx); inst->alg.init = crypto_ccm_init_tfm; inst->alg.exit = crypto_ccm_exit_tfm; inst->alg.setkey = crypto_ccm_setkey; inst->alg.setauthsize = crypto_ccm_setauthsize; inst->alg.encrypt = crypto_ccm_encrypt; inst->alg.decrypt = crypto_ccm_decrypt; inst->free = crypto_ccm_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: crypto_ccm_free(inst); } return err; } static int crypto_ccm_create(struct crypto_template *tmpl, struct rtattr **tb) { const char *cipher_name; char ctr_name[CRYPTO_MAX_ALG_NAME]; char mac_name[CRYPTO_MAX_ALG_NAME]; cipher_name = crypto_attr_alg_name(tb[1]); if (IS_ERR(cipher_name)) return PTR_ERR(cipher_name); if (snprintf(ctr_name, CRYPTO_MAX_ALG_NAME, "ctr(%s)", cipher_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; if (snprintf(mac_name, CRYPTO_MAX_ALG_NAME, "cbcmac(%s)", cipher_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; return crypto_ccm_create_common(tmpl, tb, ctr_name, mac_name); } static int crypto_ccm_base_create(struct crypto_template *tmpl, struct rtattr **tb) { const char *ctr_name; const char *mac_name; ctr_name = crypto_attr_alg_name(tb[1]); if (IS_ERR(ctr_name)) return PTR_ERR(ctr_name); mac_name = crypto_attr_alg_name(tb[2]); if (IS_ERR(mac_name)) return PTR_ERR(mac_name); return crypto_ccm_create_common(tmpl, tb, ctr_name, mac_name); } static int crypto_rfc4309_setkey(struct crypto_aead *parent, const u8 *key, unsigned int keylen) { struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(parent); struct crypto_aead *child = ctx->child; if (keylen < 3) return -EINVAL; keylen -= 3; memcpy(ctx->nonce, key + keylen, 3); crypto_aead_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_aead_set_flags(child, crypto_aead_get_flags(parent) & CRYPTO_TFM_REQ_MASK); return crypto_aead_setkey(child, key, keylen); } static int crypto_rfc4309_setauthsize(struct crypto_aead *parent, unsigned int authsize) { struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(parent); switch (authsize) { case 8: case 12: case 16: break; default: return -EINVAL; } return crypto_aead_setauthsize(ctx->child, authsize); } static struct aead_request *crypto_rfc4309_crypt(struct aead_request *req) { struct crypto_rfc4309_req_ctx *rctx = aead_request_ctx(req); struct aead_request *subreq = &rctx->subreq; struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(aead); struct crypto_aead *child = ctx->child; struct scatterlist *sg; u8 *iv = PTR_ALIGN((u8 *)(subreq + 1) + crypto_aead_reqsize(child), crypto_aead_alignmask(child) + 1); /* L' */ iv[0] = 3; memcpy(iv + 1, ctx->nonce, 3); memcpy(iv + 4, 
req->iv, 8); scatterwalk_map_and_copy(iv + 16, req->src, 0, req->assoclen - 8, 0); sg_init_table(rctx->src, 3); sg_set_buf(rctx->src, iv + 16, req->assoclen - 8); sg = scatterwalk_ffwd(rctx->src + 1, req->src, req->assoclen); if (sg != rctx->src + 1) sg_chain(rctx->src, 2, sg); if (req->src != req->dst) { sg_init_table(rctx->dst, 3); sg_set_buf(rctx->dst, iv + 16, req->assoclen - 8); sg = scatterwalk_ffwd(rctx->dst + 1, req->dst, req->assoclen); if (sg != rctx->dst + 1) sg_chain(rctx->dst, 2, sg); } aead_request_set_tfm(subreq, child); aead_request_set_callback(subreq, req->base.flags, req->base.complete, req->base.data); aead_request_set_crypt(subreq, rctx->src, req->src == req->dst ? rctx->src : rctx->dst, req->cryptlen, iv); aead_request_set_ad(subreq, req->assoclen - 8); return subreq; } static int crypto_rfc4309_encrypt(struct aead_request *req) { if (req->assoclen != 16 && req->assoclen != 20) return -EINVAL; req = crypto_rfc4309_crypt(req); return crypto_aead_encrypt(req); } static int crypto_rfc4309_decrypt(struct aead_request *req) { if (req->assoclen != 16 && req->assoclen != 20) return -EINVAL; req = crypto_rfc4309_crypt(req); return crypto_aead_decrypt(req); } static int crypto_rfc4309_init_tfm(struct crypto_aead *tfm) { struct aead_instance *inst = aead_alg_instance(tfm); struct crypto_aead_spawn *spawn = aead_instance_ctx(inst); struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_aead *aead; unsigned long align; aead = crypto_spawn_aead(spawn); if (IS_ERR(aead)) return PTR_ERR(aead); ctx->child = aead; align = crypto_aead_alignmask(aead); align &= ~(crypto_tfm_ctx_alignment() - 1); crypto_aead_set_reqsize( tfm, sizeof(struct crypto_rfc4309_req_ctx) + ALIGN(crypto_aead_reqsize(aead), crypto_tfm_ctx_alignment()) + align + 32); return 0; } static void crypto_rfc4309_exit_tfm(struct crypto_aead *tfm) { struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_aead(ctx->child); } static void crypto_rfc4309_free(struct aead_instance *inst) { crypto_drop_aead(aead_instance_ctx(inst)); kfree(inst); } static int crypto_rfc4309_create(struct crypto_template *tmpl, struct rtattr **tb) { u32 mask; struct aead_instance *inst; struct crypto_aead_spawn *spawn; struct aead_alg *alg; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = aead_instance_ctx(inst); err = crypto_grab_aead(spawn, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_aead_alg(spawn); err = -EINVAL; /* We only support 16-byte blocks. */ if (crypto_aead_alg_ivsize(alg) != 16) goto err_free_inst; /* Not a stream cipher? 
*/ if (alg->base.cra_blocksize != 1) goto err_free_inst; err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "rfc4309(%s)", alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME || snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "rfc4309(%s)", alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = alg->base.cra_priority; inst->alg.base.cra_blocksize = 1; inst->alg.base.cra_alignmask = alg->base.cra_alignmask; inst->alg.ivsize = 8; inst->alg.chunksize = crypto_aead_alg_chunksize(alg); inst->alg.maxauthsize = 16; inst->alg.base.cra_ctxsize = sizeof(struct crypto_rfc4309_ctx); inst->alg.init = crypto_rfc4309_init_tfm; inst->alg.exit = crypto_rfc4309_exit_tfm; inst->alg.setkey = crypto_rfc4309_setkey; inst->alg.setauthsize = crypto_rfc4309_setauthsize; inst->alg.encrypt = crypto_rfc4309_encrypt; inst->alg.decrypt = crypto_rfc4309_decrypt; inst->free = crypto_rfc4309_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: crypto_rfc4309_free(inst); } return err; } static int crypto_cbcmac_digest_setkey(struct crypto_shash *parent, const u8 *inkey, unsigned int keylen) { struct cbcmac_tfm_ctx *ctx = crypto_shash_ctx(parent); return crypto_cipher_setkey(ctx->child, inkey, keylen); } static int crypto_cbcmac_digest_init(struct shash_desc *pdesc) { int bs = crypto_shash_digestsize(pdesc->tfm); u8 *dg = shash_desc_ctx(pdesc); memset(dg, 0, bs); return 0; } static int crypto_cbcmac_digest_update(struct shash_desc *pdesc, const u8 *p, unsigned int len) { struct crypto_shash *parent = pdesc->tfm; struct cbcmac_tfm_ctx *tctx = crypto_shash_ctx(parent); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_digestsize(parent); u8 *dg = shash_desc_ctx(pdesc); do { crypto_xor(dg, p, bs); crypto_cipher_encrypt_one(tfm, dg, dg); p += bs; len -= bs; } while (len >= bs); return len; } static int crypto_cbcmac_digest_finup(struct shash_desc *pdesc, const u8 *src, unsigned int len, u8 *out) { struct crypto_shash *parent = pdesc->tfm; struct cbcmac_tfm_ctx *tctx = crypto_shash_ctx(parent); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_digestsize(parent); u8 *dg = shash_desc_ctx(pdesc); if (len) { crypto_xor(dg, src, len); crypto_cipher_encrypt_one(tfm, out, dg); return 0; } memcpy(out, dg, bs); return 0; } static int cbcmac_init_tfm(struct crypto_tfm *tfm) { struct crypto_cipher *cipher; struct crypto_instance *inst = (void *)tfm->__crt_alg; struct crypto_cipher_spawn *spawn = crypto_instance_ctx(inst); struct cbcmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm); cipher = crypto_spawn_cipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; return 0; }; static void cbcmac_exit_tfm(struct crypto_tfm *tfm) { struct cbcmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm); crypto_free_cipher(ctx->child); } static int cbcmac_create(struct crypto_template *tmpl, struct rtattr **tb) { struct shash_instance *inst; struct crypto_cipher_spawn *spawn; struct crypto_alg *alg; u32 mask; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = shash_instance_ctx(inst); err = crypto_grab_cipher(spawn, shash_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_cipher_alg(spawn); err = crypto_inst_setname(shash_crypto_instance(inst), tmpl->name, alg); if (err) goto err_free_inst; inst->alg.base.cra_priority = alg->cra_priority; 
inst->alg.base.cra_blocksize = alg->cra_blocksize; inst->alg.digestsize = alg->cra_blocksize; inst->alg.descsize = alg->cra_blocksize; inst->alg.base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY; inst->alg.base.cra_ctxsize = sizeof(struct cbcmac_tfm_ctx); inst->alg.base.cra_init = cbcmac_init_tfm; inst->alg.base.cra_exit = cbcmac_exit_tfm; inst->alg.init = crypto_cbcmac_digest_init; inst->alg.update = crypto_cbcmac_digest_update; inst->alg.finup = crypto_cbcmac_digest_finup; inst->alg.setkey = crypto_cbcmac_digest_setkey; inst->free = shash_free_singlespawn_instance; err = shash_register_instance(tmpl, inst); if (err) { err_free_inst: shash_free_singlespawn_instance(inst); } return err; } static struct crypto_template crypto_ccm_tmpls[] = { { .name = "cbcmac", .create = cbcmac_create, .module = THIS_MODULE, }, { .name = "ccm_base", .create = crypto_ccm_base_create, .module = THIS_MODULE, }, { .name = "ccm", .create = crypto_ccm_create, .module = THIS_MODULE, }, { .name = "rfc4309", .create = crypto_rfc4309_create, .module = THIS_MODULE, }, }; static int __init crypto_ccm_module_init(void) { return crypto_register_templates(crypto_ccm_tmpls, ARRAY_SIZE(crypto_ccm_tmpls)); } static void __exit crypto_ccm_module_exit(void) { crypto_unregister_templates(crypto_ccm_tmpls, ARRAY_SIZE(crypto_ccm_tmpls)); } module_init(crypto_ccm_module_init); module_exit(crypto_ccm_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Counter with CBC MAC"); MODULE_ALIAS_CRYPTO("ccm_base"); MODULE_ALIAS_CRYPTO("rfc4309"); MODULE_ALIAS_CRYPTO("ccm"); MODULE_ALIAS_CRYPTO("cbcmac"); MODULE_IMPORT_NS("CRYPTO_INTERNAL");
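As a quick sanity check of format_input() above: RFC 3610 defines the B_0 flags octet as 64*Adata + 8*M' + L', with M' = (M - 2)/2 and L' = L - 1. For a 16-byte tag (M = 16), a 4-byte length field (L = 4, so iv[0] = L' = 3) and non-empty associated data, the code therefore ORs 64 | 8*7 | 3 = 0x7b into info[0]. A small user-space sketch of the same computation (illustrative only, not driver code):

/* Recompute the CCM B_0 flags byte per RFC 3610 / NIST SP 800-38C. */
#include <stdio.h>

static unsigned char ccm_b0_flags(unsigned int tag_len, unsigned int l_prime, int has_aad)
{
	unsigned char flags = (unsigned char)l_prime;	/* L' = L - 1, low 3 bits       */

	flags |= 8 * ((tag_len - 2) / 2);		/* M' = (M - 2) / 2, bits 3..5  */
	if (has_aad)
		flags |= 64;				/* Adata bit                    */
	return flags;
}

int main(void)
{
	/* Matches what format_input() builds for authsize 16, iv[0] == 3, assoclen > 0. */
	printf("0x%02x\n", ccm_b0_flags(16, 3, 1));	/* prints 0x7b */
	return 0;
}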
// SPDX-License-Identifier: GPL-2.0-or-later /* * * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IP/TCP/UDP checksumming routines * * Authors: Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Tom May, <ftom@netcom.com> * Andreas Schwab, <schwab@issan.informatik.uni-dortmund.de> * Lots of code moved from tcp.c and ip.c; see those files * for more names. * * 03/02/96 Jes Sorensen, Andreas Schwab, Roman Hodek: * Fixed some nasty bugs, causing some horrible crashes. * A: At some points, the sum (%0) was used as * length-counter instead of the length counter * (%1). Thanks to Roman Hodek for pointing this out. * B: GCC seems to mess up if one uses too many * data-registers to hold input values and one tries to * specify d0 and d1 as scratch registers. Letting gcc * choose these registers itself solves the problem. */ /* Revised by Kenneth Albanowski for m68knommu. Basic problem: unaligned access kills, so most of the assembly has to go. */ #include <linux/export.h> #include <net/checksum.h> #include <asm/byteorder.h> #ifndef do_csum static unsigned int do_csum(const unsigned char *buff, int len) { int odd; unsigned int result = 0; if (len <= 0) goto out; odd = 1 & (unsigned long) buff; if (odd) { #ifdef __LITTLE_ENDIAN result += (*buff << 8); #else result = *buff; #endif len--; buff++; } if (len >= 2) { if (2 & (unsigned long) buff) { result += *(unsigned short *) buff; len -= 2; buff += 2; } if (len >= 4) { const unsigned char *end = buff + ((unsigned)len & ~3); unsigned int carry = 0; do { unsigned int w = *(unsigned int *) buff; buff += 4; result += carry; result += w; carry = (w > result); } while (buff < end); result += carry; result = (result & 0xffff) + (result >> 16); } if (len & 2) { result += *(unsigned short *) buff; buff += 2; } } if (len & 1) #ifdef __LITTLE_ENDIAN result += *buff; #else result += (*buff << 8); #endif result = csum_from32to16(result); if (odd) result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); out: return result; } #endif #ifndef ip_fast_csum /* * This is a version of ip_compute_csum() optimized for IP headers, * which always checksum on 4 octet boundaries.
*/ __sum16 ip_fast_csum(const void *iph, unsigned int ihl) { return (__force __sum16)~do_csum(iph, ihl*4); } EXPORT_SYMBOL(ip_fast_csum); #endif /* * computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) * * returns a 32-bit number suitable for feeding into itself * or csum_tcpudp_magic * * this function must be called with even lengths, except * for the last fragment, which may be odd * * it's best to have buff aligned on a 32-bit boundary */ __wsum csum_partial(const void *buff, int len, __wsum wsum) { unsigned int sum = (__force unsigned int)wsum; unsigned int result = do_csum(buff, len); /* add in old sum, and carry.. */ result += sum; if (sum > result) result += 1; return (__force __wsum)result; } EXPORT_SYMBOL(csum_partial); /* * this routine is used for miscellaneous IP-like checksums, mainly * in icmp.c */ __sum16 ip_compute_csum(const void *buff, int len) { return (__force __sum16)~do_csum(buff, len); } EXPORT_SYMBOL(ip_compute_csum); #ifndef csum_tcpudp_nofold static inline u32 from64to32(u64 x) { /* add up 32-bit and 32-bit for 32+c bit */ x = (x & 0xffffffff) + (x >> 32); /* add up carry.. */ x = (x & 0xffffffff) + (x >> 32); return (u32)x; } __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum) { unsigned long long s = (__force u32)sum; s += (__force u32)saddr; s += (__force u32)daddr; #ifdef __BIG_ENDIAN s += proto + len; #else s += (proto + len) << 8; #endif return (__force __wsum)from64to32(s); } EXPORT_SYMBOL(csum_tcpudp_nofold); #endif
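The folding done by do_csum() and csum_partial() is the standard RFC 1071 ones'-complement sum. The sketch below is a user-space illustration of the same algorithm (it always treats the buffer in network byte order, unlike the endian-aware kernel helpers), useful for experimenting with the carry-fold behaviour in isolation:

/* Minimal RFC 1071 Internet checksum over a buffer; illustrative only. */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

static uint16_t internet_checksum(const void *data, size_t len)
{
	const uint8_t *p = data;
	uint32_t sum = 0;

	while (len > 1) {			/* sum 16-bit words */
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)				/* odd trailing byte, zero-padded */
		sum += (uint32_t)p[0] << 8;

	while (sum >> 16)			/* fold carries, as csum_from32to16() does */
		sum = (sum & 0xffff) + (sum >> 16);

	return (uint16_t)~sum;			/* ones'-complement of the folded sum */
}

int main(void)
{
	const uint8_t hdr[] = { 0x45, 0x00, 0x00, 0x3c };
	printf("0x%04x\n", internet_checksum(hdr, sizeof(hdr)));
	return 0;
}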
// SPDX-License-Identifier: GPL-2.0-or-later /* * PTP virtual clock driver * * Copyright 2021 NXP */ #include <linux/slab.h> #include <linux/hashtable.h> #include "ptp_private.h" #define PTP_VCLOCK_CC_SHIFT 31 #define PTP_VCLOCK_CC_MULT (1 << PTP_VCLOCK_CC_SHIFT) #define PTP_VCLOCK_FADJ_SHIFT 9 #define PTP_VCLOCK_FADJ_DENOMINATOR 15625ULL #define PTP_VCLOCK_REFRESH_INTERVAL (HZ * 2) /* protects vclock_hash addition/deletion */ static DEFINE_SPINLOCK(vclock_hash_lock); static DEFINE_READ_MOSTLY_HASHTABLE(vclock_hash, 8); static void ptp_vclock_hash_add(struct ptp_vclock *vclock) { spin_lock(&vclock_hash_lock); hlist_add_head_rcu(&vclock->vclock_hash_node, &vclock_hash[vclock->clock->index % HASH_SIZE(vclock_hash)]); spin_unlock(&vclock_hash_lock); } static void ptp_vclock_hash_del(struct ptp_vclock *vclock) { spin_lock(&vclock_hash_lock); hlist_del_init_rcu(&vclock->vclock_hash_node); spin_unlock(&vclock_hash_lock); synchronize_rcu(); } static int ptp_vclock_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) { struct ptp_vclock *vclock = info_to_vclock(ptp); s64 adj; adj = (s64)scaled_ppm << PTP_VCLOCK_FADJ_SHIFT; adj = div_s64(adj, PTP_VCLOCK_FADJ_DENOMINATOR); if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; timecounter_read(&vclock->tc); vclock->cc.mult = PTP_VCLOCK_CC_MULT + adj; mutex_unlock(&vclock->lock); return 0; } static int ptp_vclock_adjtime(struct ptp_clock_info *ptp, s64 delta) { struct ptp_vclock *vclock = info_to_vclock(ptp); if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; timecounter_adjtime(&vclock->tc, delta); mutex_unlock(&vclock->lock); return 0; } static int ptp_vclock_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) { struct ptp_vclock *vclock = info_to_vclock(ptp); u64 ns; if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; ns = timecounter_read(&vclock->tc); mutex_unlock(&vclock->lock); *ts = ns_to_timespec64(ns); return 0; } static int ptp_vclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, struct ptp_system_timestamp *sts) { struct ptp_vclock *vclock = info_to_vclock(ptp); struct ptp_clock *pptp = vclock->pclock; struct timespec64 pts; int err; u64 ns; err = pptp->info->getcyclesx64(pptp->info, &pts, sts); if (err) return err; if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; ns = timecounter_cyc2time(&vclock->tc, timespec64_to_ns(&pts)); mutex_unlock(&vclock->lock); *ts = ns_to_timespec64(ns); return 0; } static
int ptp_vclock_settime(struct ptp_clock_info *ptp, const struct timespec64 *ts) { struct ptp_vclock *vclock = info_to_vclock(ptp); u64 ns = timespec64_to_ns(ts); if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; timecounter_init(&vclock->tc, &vclock->cc, ns); mutex_unlock(&vclock->lock); return 0; } static int ptp_vclock_getcrosststamp(struct ptp_clock_info *ptp, struct system_device_crosststamp *xtstamp) { struct ptp_vclock *vclock = info_to_vclock(ptp); struct ptp_clock *pptp = vclock->pclock; int err; u64 ns; err = pptp->info->getcrosscycles(pptp->info, xtstamp); if (err) return err; if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; ns = timecounter_cyc2time(&vclock->tc, ktime_to_ns(xtstamp->device)); mutex_unlock(&vclock->lock); xtstamp->device = ns_to_ktime(ns); return 0; } static long ptp_vclock_refresh(struct ptp_clock_info *ptp) { struct ptp_vclock *vclock = info_to_vclock(ptp); struct timespec64 ts; ptp_vclock_gettime(&vclock->info, &ts); return PTP_VCLOCK_REFRESH_INTERVAL; } static void ptp_vclock_set_subclass(struct ptp_clock *ptp) { lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL); } static const struct ptp_clock_info ptp_vclock_info = { .owner = THIS_MODULE, .name = "ptp virtual clock", .max_adj = 500000000, .adjfine = ptp_vclock_adjfine, .adjtime = ptp_vclock_adjtime, .settime64 = ptp_vclock_settime, .do_aux_work = ptp_vclock_refresh, }; static u64 ptp_vclock_read(struct cyclecounter *cc) { struct ptp_vclock *vclock = cc_to_vclock(cc); struct ptp_clock *ptp = vclock->pclock; struct timespec64 ts = {}; ptp->info->getcycles64(ptp->info, &ts); return timespec64_to_ns(&ts); } static const struct cyclecounter ptp_vclock_cc = { .read = ptp_vclock_read, .mask = CYCLECOUNTER_MASK(32), .mult = PTP_VCLOCK_CC_MULT, .shift = PTP_VCLOCK_CC_SHIFT, }; struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock) { struct ptp_vclock *vclock; vclock = kzalloc(sizeof(*vclock), GFP_KERNEL); if (!vclock) return NULL; vclock->pclock = pclock; vclock->info = ptp_vclock_info; if (pclock->info->getcyclesx64) vclock->info.gettimex64 = ptp_vclock_gettimex; else vclock->info.gettime64 = ptp_vclock_gettime; if (pclock->info->getcrosscycles) vclock->info.getcrosststamp = ptp_vclock_getcrosststamp; vclock->cc = ptp_vclock_cc; snprintf(vclock->info.name, PTP_CLOCK_NAME_LEN, "ptp%d_virt", pclock->index); INIT_HLIST_NODE(&vclock->vclock_hash_node); mutex_init(&vclock->lock); vclock->clock = ptp_clock_register(&vclock->info, &pclock->dev); if (IS_ERR_OR_NULL(vclock->clock)) { kfree(vclock); return NULL; } ptp_vclock_set_subclass(vclock->clock); timecounter_init(&vclock->tc, &vclock->cc, 0); ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL); ptp_vclock_hash_add(vclock); return vclock; } void ptp_vclock_unregister(struct ptp_vclock *vclock) { ptp_vclock_hash_del(vclock); ptp_clock_unregister(vclock->clock); kfree(vclock); } #if IS_BUILTIN(CONFIG_PTP_1588_CLOCK) int ptp_get_vclocks_index(int pclock_index, int **vclock_index) { char name[PTP_CLOCK_NAME_LEN] = ""; struct ptp_clock *ptp; struct device *dev; int num = 0; if (pclock_index < 0) return num; snprintf(name, PTP_CLOCK_NAME_LEN, "ptp%d", pclock_index); dev = class_find_device_by_name(&ptp_class, name); if (!dev) return num; ptp = dev_get_drvdata(dev); if (mutex_lock_interruptible(&ptp->n_vclocks_mux)) { put_device(dev); return num; } *vclock_index = kzalloc(sizeof(int) * ptp->n_vclocks, GFP_KERNEL); if (!(*vclock_index)) goto out; memcpy(*vclock_index, ptp->vclock_index, sizeof(int) * ptp->n_vclocks); num = 
ptp->n_vclocks; out: mutex_unlock(&ptp->n_vclocks_mux); put_device(dev); return num; } EXPORT_SYMBOL(ptp_get_vclocks_index); ktime_t ptp_convert_timestamp(const ktime_t *hwtstamp, int vclock_index) { unsigned int hash = vclock_index % HASH_SIZE(vclock_hash); struct ptp_vclock *vclock; u64 ns; u64 vclock_ns = 0; ns = ktime_to_ns(*hwtstamp); rcu_read_lock(); hlist_for_each_entry_rcu(vclock, &vclock_hash[hash], vclock_hash_node) { if (vclock->clock->index != vclock_index) continue; if (mutex_lock_interruptible(&vclock->lock)) break; vclock_ns = timecounter_cyc2time(&vclock->tc, ns); mutex_unlock(&vclock->lock); break; } rcu_read_unlock(); return ns_to_ktime(vclock_ns); } EXPORT_SYMBOL(ptp_convert_timestamp); #endif
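A note on the adjfine constants used above: scaled_ppm carries parts-per-million with a 16-bit binary fraction, so the multiplier correction is CC_MULT * scaled_ppm / (2^16 * 10^6) = 2^31 * scaled_ppm / (2^16 * 10^6) = 2^15 * scaled_ppm / 10^6, and since 10^6 = 2^6 * 15625 this reduces to (scaled_ppm << 9) / 15625, which is exactly what PTP_VCLOCK_FADJ_SHIFT and PTP_VCLOCK_FADJ_DENOMINATOR encode. A tiny user-space sketch of the arithmetic (not driver code):

/* Check the adjfine scaling: +1 ppm (scaled_ppm == 65536) should adjust
 * the 2^31 multiplier by roughly 2^31 / 10^6 ~= 2147. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	long scaled_ppm = 65536;			/* exactly +1 ppm */
	int64_t adj = ((int64_t)scaled_ppm << 9) / 15625;

	printf("%lld\n", (long long)adj);		/* prints 2147 */
	return 0;
}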
3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 
4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 
5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 
5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 
6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 
7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 
8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 
8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 
9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 
10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 
10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 
// SPDX-License-Identifier: GPL-2.0
/*
 * ring buffer based function tracer
 *
 * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com>
 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
 *
 * Originally taken from the RT patch by:
 *    Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Based on code from the latency_tracer, that is:
 *  Copyright (C) 2004-2006 Ingo Molnar
 *  Copyright (C) 2004 Nadia Yvette Chambers
 */
#include <linux/ring_buffer.h>
#include <linux/utsname.h>
#include <linux/stacktrace.h>
#include <linux/writeback.h>
#include <linux/kallsyms.h>
#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/irqflags.h>
#include <linux/debugfs.h>
#include <linux/tracefs.h>
#include <linux/pagemap.h>
#include <linux/hardirq.h>
#include <linux/linkage.h>
#include <linux/uaccess.h>
#include <linux/cleanup.h>
#include <linux/vmalloc.h>
#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/splice.h>
#include <linux/kdebug.h>
#include <linux/string.h>
#include <linux/mount.h>
#include <linux/rwsem.h>
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/panic_notifier.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/fs.h>
#include <linux/trace.h>
#include <linux/sched/clock.h>
#include <linux/sched/rt.h>
#include <linux/fsnotify.h>
#include <linux/irq_work.h>
#include <linux/workqueue.h>
#include <linux/sort.h>
#include <linux/io.h> /* vmap_page_range() */
#include <linux/fs_context.h>

#include <asm/setup.h> /* COMMAND_LINE_SIZE */

#include "trace.h"
#include "trace_output.h"

#ifdef CONFIG_FTRACE_STARTUP_TEST
/*
 * We need to change this state when a selftest is running.
 * A selftest will lurk into the ring-buffer to count the
 * entries inserted during the selftest although some concurrent
 * insertions into the ring-buffer such as trace_printk could occur
 * at the same time, giving false positive or negative results.
 */
static bool __read_mostly tracing_selftest_running;

/*
 * If boot-time tracing including tracers/events via kernel cmdline
 * is running, we do not want to run SELFTEST.
 */
bool __read_mostly tracing_selftest_disabled;

void __init disable_tracing_selftest(const char *reason)
{
	if (!tracing_selftest_disabled) {
		tracing_selftest_disabled = true;
		pr_info("Ftrace startup test is disabled due to %s\n", reason);
	}
}
#else
#define tracing_selftest_running	0
#define tracing_selftest_disabled	0
#endif

/* Pipe tracepoints to printk */
static struct trace_iterator *tracepoint_print_iter;
int tracepoint_printk;
static bool tracepoint_printk_stop_on_boot __initdata;
static bool traceoff_after_boot __initdata;
static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);

/* For tracers that don't implement custom flags */
static struct tracer_opt dummy_tracer_opt[] = {
	{ }
};

static int
dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
	return 0;
}

/*
 * To prevent the comm cache from being overwritten when no
 * tracing is active, only save the comm when a trace event
 * occurred.
 */
DEFINE_PER_CPU(bool, trace_taskinfo_save);

/*
 * Kill all tracing for good (never come back).
 * It is initialized to 1 but will turn to zero if the initialization
 * of the tracer is successful. But that is the only place that sets
 * this back to zero.
 */
static int tracing_disabled = 1;

cpumask_var_t __read_mostly tracing_buffer_mask;

#define MAX_TRACER_SIZE		100

/*
 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
 *
 * If there is an oops (or kernel panic) and the ftrace_dump_on_oops
 * is set, then ftrace_dump is called. This will output the contents
 * of the ftrace buffers to the console. This is very useful for
 * capturing traces that lead to crashes and outputting them to a
 * serial console.
 *
 * It is off by default, but you can enable it either by specifying
 * "ftrace_dump_on_oops" on the kernel command line, or by setting
 * /proc/sys/kernel/ftrace_dump_on_oops.
 * Set it to 1 to dump the buffers of all CPUs.
 * Set it to 2 to dump only the buffer of the CPU that triggered the oops.
 * Set it to an instance name to dump that specific trace instance.
 * Dumping multiple instances is also supported; instance names are separated
 * by commas.
 */

/* Set to string format zero to disable by default */
char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0";

/* When set, tracing will stop when a WARN*() is hit */
static int __disable_trace_on_warning;

int tracepoint_printk_sysctl(const struct ctl_table *table, int write,
			     void *buffer, size_t *lenp, loff_t *ppos);

static const struct ctl_table trace_sysctl_table[] = {
	{
		.procname	= "ftrace_dump_on_oops",
		.data		= &ftrace_dump_on_oops,
		.maxlen		= MAX_TRACER_SIZE,
		.mode		= 0644,
		.proc_handler	= proc_dostring,
	},
	{
		.procname	= "traceoff_on_warning",
		.data		= &__disable_trace_on_warning,
		.maxlen		= sizeof(__disable_trace_on_warning),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "tracepoint_printk",
		.data		= &tracepoint_printk,
		.maxlen		= sizeof(tracepoint_printk),
		.mode		= 0644,
		.proc_handler	= tracepoint_printk_sysctl,
	},
};

static int __init init_trace_sysctls(void)
{
	register_sysctl_init("kernel", trace_sysctl_table);
	return 0;
}
subsys_initcall(init_trace_sysctls);

#ifdef CONFIG_TRACE_EVAL_MAP_FILE
/* Map of enums to their values, for "eval_map" file */
struct trace_eval_map_head {
	struct module			*mod;
	unsigned long			length;
};

union trace_eval_map_item;

struct trace_eval_map_tail {
	/*
	 * "end" is first and points to NULL as it must be different
	 * than "mod" or "eval_string"
	 */
	union trace_eval_map_item	*next;
	const char			*end;	/* points to NULL */
};

static DEFINE_MUTEX(trace_eval_mutex);

/*
 * The trace_eval_maps are saved in an array with two extra elements,
 * one at the beginning, and one at the end. The beginning item contains
 * the count of the saved maps (head.length), and the module they
 * belong to if not built in (head.mod). The ending item contains a
 * pointer to the next array of saved eval_map items.
 */
union trace_eval_map_item {
	struct trace_eval_map		map;
	struct trace_eval_map_head	head;
	struct trace_eval_map_tail	tail;
};

static union trace_eval_map_item *trace_eval_maps;
#endif /* CONFIG_TRACE_EVAL_MAP_FILE */

int tracing_set_tracer(struct trace_array *tr, const char *buf);
static void ftrace_trace_userstack(struct trace_array *tr,
				   struct trace_buffer *buffer,
				   unsigned int trace_ctx);

static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;

static bool allocate_snapshot;
static bool snapshot_at_boot;

static char boot_instance_info[COMMAND_LINE_SIZE] __initdata;
static int boot_instance_index;

static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata;
static int boot_snapshot_index;

static int __init set_cmdline_ftrace(char *str)
{
	strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
	default_bootup_tracer = bootup_tracer_buf;
	/* We are using ftrace early, expand it */
	trace_set_ring_buffer_expanded(NULL);
	return 1;
}
__setup("ftrace=", set_cmdline_ftrace);

int ftrace_dump_on_oops_enabled(void)
{
	if (!strcmp("0", ftrace_dump_on_oops))
		return 0;
	else
		return 1;
}

static int __init set_ftrace_dump_on_oops(char *str)
{
	if (!*str) {
		strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE);
		return 1;
	}

	if (*str == ',') {
		strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE);
		strscpy(ftrace_dump_on_oops + 1, str, MAX_TRACER_SIZE - 1);
		return 1;
	}

	if (*str++ == '=') {
		strscpy(ftrace_dump_on_oops, str, MAX_TRACER_SIZE);
		return 1;
	}

	return 0;
}
__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);

static int __init stop_trace_on_warning(char *str)
{
	if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
		__disable_trace_on_warning = 1;
	return 1;
}
__setup("traceoff_on_warning", stop_trace_on_warning);
static int __init boot_alloc_snapshot(char *str)
{
	char *slot = boot_snapshot_info + boot_snapshot_index;
	int left = sizeof(boot_snapshot_info) - boot_snapshot_index;
	int ret;

	if (str[0] == '=') {
		str++;
		if (strlen(str) >= left)
			return -1;

		ret = snprintf(slot, left, "%s\t", str);
		boot_snapshot_index += ret;
	} else {
		allocate_snapshot = true;
		/* We also need the main ring buffer expanded */
		trace_set_ring_buffer_expanded(NULL);
	}
	return 1;
}
__setup("alloc_snapshot", boot_alloc_snapshot);

static int __init boot_snapshot(char *str)
{
	snapshot_at_boot = true;
	boot_alloc_snapshot(str);
	return 1;
}
__setup("ftrace_boot_snapshot", boot_snapshot);

static int __init boot_instance(char *str)
{
	char *slot = boot_instance_info + boot_instance_index;
	int left = sizeof(boot_instance_info) - boot_instance_index;
	int ret;

	if (strlen(str) >= left)
		return -1;

	ret = snprintf(slot, left, "%s\t", str);
	boot_instance_index += ret;

	return 1;
}
__setup("trace_instance=", boot_instance);

static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;

static int __init set_trace_boot_options(char *str)
{
	strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
	return 1;
}
__setup("trace_options=", set_trace_boot_options);

static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata;
static char *trace_boot_clock __initdata;

static int __init set_trace_boot_clock(char *str)
{
	strscpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE);
	trace_boot_clock = trace_boot_clock_buf;
	return 1;
}
__setup("trace_clock=", set_trace_boot_clock);

static int __init set_tracepoint_printk(char *str)
{
	/* Ignore the "tp_printk_stop_on_boot" param */
	if (*str == '_')
		return 0;

	if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
		tracepoint_printk = 1;
	return 1;
}
__setup("tp_printk", set_tracepoint_printk);

static int __init set_tracepoint_printk_stop(char *str)
{
	tracepoint_printk_stop_on_boot = true;
	return 1;
}
__setup("tp_printk_stop_on_boot", set_tracepoint_printk_stop);

static int __init set_traceoff_after_boot(char *str)
{
	traceoff_after_boot = true;
	return 1;
}
__setup("traceoff_after_boot", set_traceoff_after_boot);

unsigned long long ns2usecs(u64 nsec)
{
	nsec += 500;
	do_div(nsec, 1000);
	return nsec;
}

static void
trace_process_export(struct trace_export *export,
		     struct ring_buffer_event *event, int flag)
{
	struct trace_entry *entry;
	unsigned int size = 0;

	if (export->flags & flag) {
		entry = ring_buffer_event_data(event);
		size = ring_buffer_event_length(event);
		export->write(export, entry, size);
	}
}

static DEFINE_MUTEX(ftrace_export_lock);

static struct trace_export __rcu *ftrace_exports_list __read_mostly;

static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled);
static DEFINE_STATIC_KEY_FALSE(trace_event_exports_enabled);
static DEFINE_STATIC_KEY_FALSE(trace_marker_exports_enabled);

static inline void ftrace_exports_enable(struct trace_export *export)
{
	if (export->flags & TRACE_EXPORT_FUNCTION)
		static_branch_inc(&trace_function_exports_enabled);

	if (export->flags & TRACE_EXPORT_EVENT)
		static_branch_inc(&trace_event_exports_enabled);

	if (export->flags & TRACE_EXPORT_MARKER)
		static_branch_inc(&trace_marker_exports_enabled);
}

static inline void ftrace_exports_disable(struct trace_export *export)
{
	if (export->flags & TRACE_EXPORT_FUNCTION)
		static_branch_dec(&trace_function_exports_enabled);

	if (export->flags & TRACE_EXPORT_EVENT)
		static_branch_dec(&trace_event_exports_enabled);

	if (export->flags & TRACE_EXPORT_MARKER)
		static_branch_dec(&trace_marker_exports_enabled);
}

static void ftrace_exports(struct ring_buffer_event *event, int flag)
{
	struct trace_export *export;

	guard(preempt_notrace)();

	export = rcu_dereference_raw_check(ftrace_exports_list);
	while (export) {
		trace_process_export(export, event, flag);
		export = rcu_dereference_raw_check(export->next);
	}
}

static inline void
add_trace_export(struct trace_export **list, struct trace_export *export)
{
	rcu_assign_pointer(export->next, *list);
	/*
	 * We are entering export into the list but another
	 * CPU might be walking that list. We need to make sure
	 * the export->next pointer is valid before another CPU sees
	 * the export pointer included into the list.
	 */
	rcu_assign_pointer(*list, export);
}

static inline int
rm_trace_export(struct trace_export **list, struct trace_export *export)
{
	struct trace_export **p;

	for (p = list; *p != NULL; p = &(*p)->next)
		if (*p == export)
			break;

	if (*p != export)
		return -1;

	rcu_assign_pointer(*p, (*p)->next);

	return 0;
}

static inline void
add_ftrace_export(struct trace_export **list, struct trace_export *export)
{
	ftrace_exports_enable(export);

	add_trace_export(list, export);
}

static inline int
rm_ftrace_export(struct trace_export **list, struct trace_export *export)
{
	int ret;

	ret = rm_trace_export(list, export);
	ftrace_exports_disable(export);

	return ret;
}

int register_ftrace_export(struct trace_export *export)
{
	if (WARN_ON_ONCE(!export->write))
		return -1;

	guard(mutex)(&ftrace_export_lock);

	add_ftrace_export(&ftrace_exports_list, export);

	return 0;
}
EXPORT_SYMBOL_GPL(register_ftrace_export);

int unregister_ftrace_export(struct trace_export *export)
{
	guard(mutex)(&ftrace_export_lock);

	return rm_ftrace_export(&ftrace_exports_list, export);
}
EXPORT_SYMBOL_GPL(unregister_ftrace_export);

/* trace_flags holds trace_options default values */
#define TRACE_DEFAULT_FLAGS						\
	(FUNCTION_DEFAULT_FLAGS |					\
	 TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |			\
	 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO |		\
	 TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |			\
	 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS |			\
	 TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK |		\
	 TRACE_ITER_COPY_MARKER)

/* trace_options that are only supported by global_trace */
#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK |			\
	       TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD)

/* trace_flags that are default zero for instances */
#define ZEROED_TRACE_FLAGS \
	(TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK | \
	 TRACE_ITER_COPY_MARKER)

/*
 * The global_trace is the descriptor that holds the top-level tracing
 * buffers for the live tracing.
 */
static struct trace_array global_trace = {
	.trace_flags = TRACE_DEFAULT_FLAGS,
};

static struct trace_array *printk_trace = &global_trace;

/* List of trace_arrays interested in the top level trace_marker */
static LIST_HEAD(marker_copies);

static __always_inline bool printk_binsafe(struct trace_array *tr)
{
	/*
	 * The binary format of traceprintk can cause a crash if used
	 * by a buffer from another boot. Force the use of the
	 * non binary version of trace_printk if the trace_printk
	 * buffer is a boot mapped ring buffer.
*/ return !(tr->flags & TRACE_ARRAY_FL_BOOT); } static void update_printk_trace(struct trace_array *tr) { if (printk_trace == tr) return; printk_trace->trace_flags &= ~TRACE_ITER_TRACE_PRINTK; printk_trace = tr; tr->trace_flags |= TRACE_ITER_TRACE_PRINTK; } /* Returns true if the status of tr changed */ static bool update_marker_trace(struct trace_array *tr, int enabled) { lockdep_assert_held(&event_mutex); if (enabled) { if (!list_empty(&tr->marker_list)) return false; list_add_rcu(&tr->marker_list, &marker_copies); tr->trace_flags |= TRACE_ITER_COPY_MARKER; return true; } if (list_empty(&tr->marker_list)) return false; list_del_init(&tr->marker_list); tr->trace_flags &= ~TRACE_ITER_COPY_MARKER; return true; } void trace_set_ring_buffer_expanded(struct trace_array *tr) { if (!tr) tr = &global_trace; tr->ring_buffer_expanded = true; } LIST_HEAD(ftrace_trace_arrays); int trace_array_get(struct trace_array *this_tr) { struct trace_array *tr; guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr == this_tr) { tr->ref++; return 0; } } return -ENODEV; } static void __trace_array_put(struct trace_array *this_tr) { WARN_ON(!this_tr->ref); this_tr->ref--; } /** * trace_array_put - Decrement the reference counter for this trace array. * @this_tr : pointer to the trace array * * NOTE: Use this when we no longer need the trace array returned by * trace_array_get_by_name(). This ensures the trace array can be later * destroyed. * */ void trace_array_put(struct trace_array *this_tr) { if (!this_tr) return; guard(mutex)(&trace_types_lock); __trace_array_put(this_tr); } EXPORT_SYMBOL_GPL(trace_array_put); int tracing_check_open_get_tr(struct trace_array *tr) { int ret; ret = security_locked_down(LOCKDOWN_TRACEFS); if (ret) return ret; if (tracing_disabled) return -ENODEV; if (tr && trace_array_get(tr) < 0) return -ENODEV; return 0; } /** * trace_find_filtered_pid - check if a pid exists in a filtered_pid list * @filtered_pids: The list of pids to check * @search_pid: The PID to find in @filtered_pids * * Returns true if @search_pid is found in @filtered_pids, and false otherwise. */ bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) { return trace_pid_list_is_set(filtered_pids, search_pid); } /** * trace_ignore_this_task - should a task be ignored for tracing * @filtered_pids: The list of pids to check * @filtered_no_pids: The list of pids not to be traced * @task: The task that should be ignored if not filtered * * Checks if @task should be traced or not from @filtered_pids. * Returns true if @task should *NOT* be traced. * Returns false if @task should be traced. */ bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct trace_pid_list *filtered_no_pids, struct task_struct *task) { /* * If filtered_no_pids is not empty, and the task's pid is listed * in filtered_no_pids, then return true. * Otherwise, if filtered_pids is empty, that means we can * trace all tasks. If it has content, then only trace pids * within filtered_pids. */ return (filtered_pids && !trace_find_filtered_pid(filtered_pids, task->pid)) || (filtered_no_pids && trace_find_filtered_pid(filtered_no_pids, task->pid)); } /** * trace_filter_add_remove_task - Add or remove a task from a pid_list * @pid_list: The list to modify * @self: The current task for fork or NULL for exit * @task: The task to add or remove * * If adding a task, if @self is defined, the task is only added if @self * is also included in @pid_list. 
This happens on fork and tasks should * only be added when the parent is listed. If @self is NULL, then the * @task pid will be removed from the list, which would happen on exit * of a task. */ void trace_filter_add_remove_task(struct trace_pid_list *pid_list, struct task_struct *self, struct task_struct *task) { if (!pid_list) return; /* For forks, we only add if the forking task is listed */ if (self) { if (!trace_find_filtered_pid(pid_list, self->pid)) return; } /* "self" is set for forks, and NULL for exits */ if (self) trace_pid_list_set(pid_list, task->pid); else trace_pid_list_clear(pid_list, task->pid); } /** * trace_pid_next - Used for seq_file to get to the next pid of a pid_list * @pid_list: The pid list to show * @v: The last pid that was shown (+1 the actual pid to let zero be displayed) * @pos: The position of the file * * This is used by the seq_file "next" operation to iterate the pids * listed in a trace_pid_list structure. * * Returns the pid+1 as we want to display pid of zero, but NULL would * stop the iteration. */ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) { long pid = (unsigned long)v; unsigned int next; (*pos)++; /* pid already is +1 of the actual previous bit */ if (trace_pid_list_next(pid_list, pid, &next) < 0) return NULL; pid = next; /* Return pid + 1 to allow zero to be represented */ return (void *)(pid + 1); } /** * trace_pid_start - Used for seq_file to start reading pid lists * @pid_list: The pid list to show * @pos: The position of the file * * This is used by seq_file "start" operation to start the iteration * of listing pids. * * Returns the pid+1 as we want to display pid of zero, but NULL would * stop the iteration. */ void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) { unsigned long pid; unsigned int first; loff_t l = 0; if (trace_pid_list_first(pid_list, &first) < 0) return NULL; pid = first; /* Return pid + 1 so that zero can be the exit value */ for (pid++; pid && l < *pos; pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) ; return (void *)pid; } /** * trace_pid_show - show the current pid in seq_file processing * @m: The seq_file structure to write into * @v: A void pointer of the pid (+1) value to display * * Can be directly used by seq_file operations to display the current * pid value. */ int trace_pid_show(struct seq_file *m, void *v) { unsigned long pid = (unsigned long)v - 1; seq_printf(m, "%lu\n", pid); return 0; } /* 128 should be much more than enough */ #define PID_BUF_SIZE 127 int trace_pid_write(struct trace_pid_list *filtered_pids, struct trace_pid_list **new_pid_list, const char __user *ubuf, size_t cnt) { struct trace_pid_list *pid_list; struct trace_parser parser; unsigned long val; int nr_pids = 0; ssize_t read = 0; ssize_t ret; loff_t pos; pid_t pid; if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1)) return -ENOMEM; /* * Always recreate a new array. The write is an all or nothing * operation. Always create a new array when adding new pids by * the user. If the operation fails, then the current list is * not modified. 
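 *
 * As an illustration of that copy-then-replace pattern (the helper
 * name is hypothetical; the trace_pid_list calls are the ones used
 * right below):
 *
 *    static struct trace_pid_list *copy_pids(struct trace_pid_list *old)
 *    {
 *        struct trace_pid_list *copy = trace_pid_list_alloc();
 *        unsigned int pid;
 *        int ret;
 *
 *        if (!copy)
 *            return NULL;
 *
 *        ret = old ? trace_pid_list_first(old, &pid) : -1;
 *        while (!ret) {
 *            if (trace_pid_list_set(copy, pid) < 0) {
 *                trace_pid_list_free(copy);
 *                return NULL;
 *            }
 *            ret = trace_pid_list_next(old, pid + 1, &pid);
 *        }
 *        // new pids from user space are then added to @copy and the
 *        // old list is swapped out in a single assignment
 *        return copy;
 *    }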
*/ pid_list = trace_pid_list_alloc(); if (!pid_list) { trace_parser_put(&parser); return -ENOMEM; } if (filtered_pids) { /* copy the current bits to the new max */ ret = trace_pid_list_first(filtered_pids, &pid); while (!ret) { ret = trace_pid_list_set(pid_list, pid); if (ret < 0) goto out; ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); nr_pids++; } } ret = 0; while (cnt > 0) { pos = 0; ret = trace_get_user(&parser, ubuf, cnt, &pos); if (ret < 0) break; read += ret; ubuf += ret; cnt -= ret; if (!trace_parser_loaded(&parser)) break; ret = -EINVAL; if (kstrtoul(parser.buffer, 0, &val)) break; pid = (pid_t)val; if (trace_pid_list_set(pid_list, pid) < 0) { ret = -1; break; } nr_pids++; trace_parser_clear(&parser); ret = 0; } out: trace_parser_put(&parser); if (ret < 0) { trace_pid_list_free(pid_list); return ret; } if (!nr_pids) { /* Cleared the list of pids */ trace_pid_list_free(pid_list); pid_list = NULL; } *new_pid_list = pid_list; return read; } static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu) { u64 ts; /* Early boot up does not have a buffer yet */ if (!buf->buffer) return trace_clock_local(); ts = ring_buffer_time_stamp(buf->buffer); ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts); return ts; } u64 ftrace_now(int cpu) { return buffer_ftrace_now(&global_trace.array_buffer, cpu); } /** * tracing_is_enabled - Show if global_trace has been enabled * * Shows if the global trace has been enabled or not. It uses the * mirror flag "buffer_disabled" to be used in fast paths such as for * the irqsoff tracer. But it may be inaccurate due to races. If you * need to know the accurate state, use tracing_is_on() which is a little * slower, but accurate. */ int tracing_is_enabled(void) { /* * For quick access (irqsoff uses this in fast path), just * return the mirror variable of the state of the ring buffer. * It's a little racy, but we don't really care. */ return !global_trace.buffer_disabled; } /* * trace_buf_size is the size in bytes that is allocated * for a buffer. Note, the number of bytes is always rounded * to page size. * * This number is purposely set to a low number of 16384. * If the dump on oops happens, it will be much appreciated * to not have to wait for all that output. Anyway this can be * boot time and run time configurable. */ #define TRACE_BUF_SIZE_DEFAULT 1441792UL /* 16384 * 88 (sizeof(entry)) */ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; /* trace_types holds a link list of available tracers. */ static struct tracer *trace_types __read_mostly; /* * trace_types_lock is used to protect the trace_types list. */ DEFINE_MUTEX(trace_types_lock); /* * serialize the access of the ring buffer * * ring buffer serializes readers, but it is low level protection. * The validity of the events (which returns by ring_buffer_peek() ..etc) * are not protected by ring buffer. * * The content of events may become garbage if we allow other process consumes * these events concurrently: * A) the page of the consumed events may become a normal page * (not reader page) in ring buffer, and this page will be rewritten * by events producer. * B) The page of the consumed events may become a page for splice_read, * and this page will be returned to system. * * These primitives allow multi process access to different cpu ring buffer * concurrently. * * These primitives don't distinguish read-only and read-consume access. * Multi read-only access are also serialized. 
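 *
 * A sketch of how the readers in this file use these primitives (the
 * helper name is hypothetical and the actual consuming of events is
 * elided):
 *
 *    static void read_one_cpu(struct trace_iterator *iter, int cpu)
 *    {
 *        trace_access_lock(cpu);
 *        // events of @cpu can be consumed here: a concurrent
 *        // trace_access_lock(RING_BUFFER_ALL_CPUS) is excluded, and
 *        // so is a second reader of this same @cpu
 *        trace_access_unlock(cpu);
 *    }
 *
 * Passing RING_BUFFER_ALL_CPUS instead takes the write side of the
 * rwsem defined below and thus excludes every per-cpu reader at once.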
*/ #ifdef CONFIG_SMP static DECLARE_RWSEM(all_cpu_access_lock); static DEFINE_PER_CPU(struct mutex, cpu_access_lock); static inline void trace_access_lock(int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { /* gain it for accessing the whole ring buffer. */ down_write(&all_cpu_access_lock); } else { /* gain it for accessing a cpu ring buffer. */ /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */ down_read(&all_cpu_access_lock); /* Secondly block other access to this @cpu ring buffer. */ mutex_lock(&per_cpu(cpu_access_lock, cpu)); } } static inline void trace_access_unlock(int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { up_write(&all_cpu_access_lock); } else { mutex_unlock(&per_cpu(cpu_access_lock, cpu)); up_read(&all_cpu_access_lock); } } static inline void trace_access_lock_init(void) { int cpu; for_each_possible_cpu(cpu) mutex_init(&per_cpu(cpu_access_lock, cpu)); } #else static DEFINE_MUTEX(access_lock); static inline void trace_access_lock(int cpu) { (void)cpu; mutex_lock(&access_lock); } static inline void trace_access_unlock(int cpu) { (void)cpu; mutex_unlock(&access_lock); } static inline void trace_access_lock_init(void) { } #endif #ifdef CONFIG_STACKTRACE static void __ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs); static inline void ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs); #else static inline void __ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { } static inline void ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, unsigned long trace_ctx, int skip, struct pt_regs *regs) { } #endif static __always_inline void trace_event_setup(struct ring_buffer_event *event, int type, unsigned int trace_ctx) { struct trace_entry *ent = ring_buffer_event_data(event); tracing_generic_entry_update(ent, type, trace_ctx); } static __always_inline struct ring_buffer_event * __trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, unsigned int trace_ctx) { struct ring_buffer_event *event; event = ring_buffer_lock_reserve(buffer, len); if (event != NULL) trace_event_setup(event, type, trace_ctx); return event; } void tracer_tracing_on(struct trace_array *tr) { if (tr->array_buffer.buffer) ring_buffer_record_on(tr->array_buffer.buffer); /* * This flag is looked at when buffers haven't been allocated * yet, or by some tracers (like irqsoff), that just want to * know if the ring buffer has been disabled, but it can handle * races of where it gets disabled but we still do a record. * As the check is in the fast path of the tracers, it is more * important to be fast than accurate. */ tr->buffer_disabled = 0; } /** * tracing_on - enable tracing buffers * * This function enables tracing buffers that may have been * disabled with tracing_off. 
*/ void tracing_on(void) { tracer_tracing_on(&global_trace); } EXPORT_SYMBOL_GPL(tracing_on); static __always_inline void __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { __this_cpu_write(trace_taskinfo_save, true); /* If this is the temp buffer, we need to commit fully */ if (this_cpu_read(trace_buffered_event) == event) { /* Length is in event->array[0] */ ring_buffer_write(buffer, event->array[0], &event->array[1]); /* Release the temp buffer */ this_cpu_dec(trace_buffered_event_cnt); /* ring_buffer_unlock_commit() enables preemption */ preempt_enable_notrace(); } else ring_buffer_unlock_commit(buffer); } int __trace_array_puts(struct trace_array *tr, unsigned long ip, const char *str, int size) { struct ring_buffer_event *event; struct trace_buffer *buffer; struct print_entry *entry; unsigned int trace_ctx; int alloc; if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; if (unlikely(tracing_selftest_running && tr == &global_trace)) return 0; if (unlikely(tracing_disabled)) return 0; alloc = sizeof(*entry) + size + 2; /* possible \n added */ trace_ctx = tracing_gen_ctx(); buffer = tr->array_buffer.buffer; guard(ring_buffer_nest)(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, trace_ctx); if (!event) return 0; entry = ring_buffer_event_data(event); entry->ip = ip; memcpy(&entry->buf, str, size); /* Add a newline if necessary */ if (entry->buf[size - 1] != '\n') { entry->buf[size] = '\n'; entry->buf[size + 1] = '\0'; } else entry->buf[size] = '\0'; __buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); return size; } EXPORT_SYMBOL_GPL(__trace_array_puts); /** * __trace_puts - write a constant string into the trace buffer. * @ip: The address of the caller * @str: The constant string to write * @size: The size of the string. */ int __trace_puts(unsigned long ip, const char *str, int size) { return __trace_array_puts(printk_trace, ip, str, size); } EXPORT_SYMBOL_GPL(__trace_puts); /** * __trace_bputs - write the pointer to a constant string into trace buffer * @ip: The address of the caller * @str: The constant string to write to the buffer to */ int __trace_bputs(unsigned long ip, const char *str) { struct trace_array *tr = READ_ONCE(printk_trace); struct ring_buffer_event *event; struct trace_buffer *buffer; struct bputs_entry *entry; unsigned int trace_ctx; int size = sizeof(struct bputs_entry); if (!printk_binsafe(tr)) return __trace_puts(ip, str, strlen(str)); if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; trace_ctx = tracing_gen_ctx(); buffer = tr->array_buffer.buffer; guard(ring_buffer_nest)(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, trace_ctx); if (!event) return 0; entry = ring_buffer_event_data(event); entry->ip = ip; entry->str = str; __buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); return 1; } EXPORT_SYMBOL_GPL(__trace_bputs); #ifdef CONFIG_TRACER_SNAPSHOT static void tracing_snapshot_instance_cond(struct trace_array *tr, void *cond_data) { struct tracer *tracer = tr->current_trace; unsigned long flags; if (in_nmi()) { trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); trace_array_puts(tr, "*** snapshot is being ignored ***\n"); return; } if (!tr->allocated_snapshot) { trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n"); trace_array_puts(tr, "*** stopping trace here! 
***\n"); tracer_tracing_off(tr); return; } /* Note, snapshot can not be used when the tracer uses it */ if (tracer->use_max_tr) { trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n"); trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); return; } if (tr->mapped) { trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n"); trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); return; } local_irq_save(flags); update_max_tr(tr, current, smp_processor_id(), cond_data); local_irq_restore(flags); } void tracing_snapshot_instance(struct trace_array *tr) { tracing_snapshot_instance_cond(tr, NULL); } /** * tracing_snapshot - take a snapshot of the current buffer. * * This causes a swap between the snapshot buffer and the current live * tracing buffer. You can use this to take snapshots of the live * trace when some condition is triggered, but continue to trace. * * Note, make sure to allocate the snapshot with either * a tracing_snapshot_alloc(), or by doing it manually * with: echo 1 > /sys/kernel/tracing/snapshot * * If the snapshot buffer is not allocated, it will stop tracing. * Basically making a permanent snapshot. */ void tracing_snapshot(void) { struct trace_array *tr = &global_trace; tracing_snapshot_instance(tr); } EXPORT_SYMBOL_GPL(tracing_snapshot); /** * tracing_snapshot_cond - conditionally take a snapshot of the current buffer. * @tr: The tracing instance to snapshot * @cond_data: The data to be tested conditionally, and possibly saved * * This is the same as tracing_snapshot() except that the snapshot is * conditional - the snapshot will only happen if the * cond_snapshot.update() implementation receiving the cond_data * returns true, which means that the trace array's cond_snapshot * update() operation used the cond_data to determine whether the * snapshot should be taken, and if it was, presumably saved it along * with the snapshot. */ void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) { tracing_snapshot_instance_cond(tr, cond_data); } EXPORT_SYMBOL_GPL(tracing_snapshot_cond); /** * tracing_cond_snapshot_data - get the user data associated with a snapshot * @tr: The tracing instance * * When the user enables a conditional snapshot using * tracing_snapshot_cond_enable(), the user-defined cond_data is saved * with the snapshot. This accessor is used to retrieve it. * * Should not be called from cond_snapshot.update(), since it takes * the tr->max_lock lock, which the code calling * cond_snapshot.update() has already done. * * Returns the cond_data associated with the trace array's snapshot. 
*/ void *tracing_cond_snapshot_data(struct trace_array *tr) { void *cond_data = NULL; local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) cond_data = tr->cond_snapshot->cond_data; arch_spin_unlock(&tr->max_lock); local_irq_enable(); return cond_data; } EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, struct array_buffer *size_buf, int cpu_id); static void set_buffer_entries(struct array_buffer *buf, unsigned long val); int tracing_alloc_snapshot_instance(struct trace_array *tr) { int order; int ret; if (!tr->allocated_snapshot) { /* Make the snapshot buffer have the same order as main buffer */ order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order); if (ret < 0) return ret; /* allocate spare buffer */ ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->array_buffer, RING_BUFFER_ALL_CPUS); if (ret < 0) return ret; tr->allocated_snapshot = true; } return 0; } static void free_snapshot(struct trace_array *tr) { /* * We don't free the ring buffer. instead, resize it because * The max_tr ring buffer has some state (e.g. ring->clock) and * we want preserve it. */ ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0); ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); set_buffer_entries(&tr->max_buffer, 1); tracing_reset_online_cpus(&tr->max_buffer); tr->allocated_snapshot = false; } static int tracing_arm_snapshot_locked(struct trace_array *tr) { int ret; lockdep_assert_held(&trace_types_lock); spin_lock(&tr->snapshot_trigger_lock); if (tr->snapshot == UINT_MAX || tr->mapped) { spin_unlock(&tr->snapshot_trigger_lock); return -EBUSY; } tr->snapshot++; spin_unlock(&tr->snapshot_trigger_lock); ret = tracing_alloc_snapshot_instance(tr); if (ret) { spin_lock(&tr->snapshot_trigger_lock); tr->snapshot--; spin_unlock(&tr->snapshot_trigger_lock); } return ret; } int tracing_arm_snapshot(struct trace_array *tr) { guard(mutex)(&trace_types_lock); return tracing_arm_snapshot_locked(tr); } void tracing_disarm_snapshot(struct trace_array *tr) { spin_lock(&tr->snapshot_trigger_lock); if (!WARN_ON(!tr->snapshot)) tr->snapshot--; spin_unlock(&tr->snapshot_trigger_lock); } /** * tracing_alloc_snapshot - allocate snapshot buffer. * * This only allocates the snapshot buffer if it isn't already * allocated - it doesn't also take a snapshot. * * This is meant to be used in cases where the snapshot buffer needs * to be set up for events that can't sleep but need to be able to * trigger a snapshot. */ int tracing_alloc_snapshot(void) { struct trace_array *tr = &global_trace; int ret; ret = tracing_alloc_snapshot_instance(tr); WARN_ON(ret < 0); return ret; } EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); /** * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. * * This is similar to tracing_snapshot(), but it will allocate the * snapshot buffer if it isn't already allocated. Use this only * where it is safe to sleep, as the allocation may sleep. * * This causes a swap between the snapshot buffer and the current live * tracing buffer. You can use this to take snapshots of the live * trace when some condition is triggered, but continue to trace. 
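 *
 * Two usage patterns are possible (my_oneshot(), my_setup() and
 * my_trigger() are only illustrative):
 *
 *    // one-shot convenience where sleeping is allowed:
 *    static void my_oneshot(void)
 *    {
 *        tracing_snapshot_alloc();
 *    }
 *
 *    // or split the sleeping allocation from an atomic trigger point:
 *    static int __init my_setup(void)
 *    {
 *        return tracing_alloc_snapshot();
 *    }
 *
 *    static void my_trigger(void)
 *    {
 *        tracing_snapshot();
 *    }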
*/ void tracing_snapshot_alloc(void) { int ret; ret = tracing_alloc_snapshot(); if (ret < 0) return; tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); /** * tracing_snapshot_cond_enable - enable conditional snapshot for an instance * @tr: The tracing instance * @cond_data: User data to associate with the snapshot * @update: Implementation of the cond_snapshot update function * * Check whether the conditional snapshot for the given instance has * already been enabled, or if the current tracer is already using a * snapshot; if so, return -EBUSY, else create a cond_snapshot and * save the cond_data and update function inside. * * Returns 0 if successful, error otherwise. */ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) { struct cond_snapshot *cond_snapshot __free(kfree) = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL); int ret; if (!cond_snapshot) return -ENOMEM; cond_snapshot->cond_data = cond_data; cond_snapshot->update = update; guard(mutex)(&trace_types_lock); if (tr->current_trace->use_max_tr) return -EBUSY; /* * The cond_snapshot can only change to NULL without the * trace_types_lock. We don't care if we race with it going * to NULL, but we want to make sure that it's not set to * something other than NULL when we get here, which we can * do safely with only holding the trace_types_lock and not * having to take the max_lock. */ if (tr->cond_snapshot) return -EBUSY; ret = tracing_arm_snapshot_locked(tr); if (ret) return ret; local_irq_disable(); arch_spin_lock(&tr->max_lock); tr->cond_snapshot = no_free_ptr(cond_snapshot); arch_spin_unlock(&tr->max_lock); local_irq_enable(); return 0; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); /** * tracing_snapshot_cond_disable - disable conditional snapshot for an instance * @tr: The tracing instance * * Check whether the conditional snapshot for the given instance is * enabled; if so, free the cond_snapshot associated with it, * otherwise return -EINVAL. * * Returns 0 if successful, error otherwise. 
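 *
 * Together with tracing_snapshot_cond_enable() this brackets the
 * lifetime of a condition (my_data and my_threshold_update are the
 * illustrative names used in the sketch further above):
 *
 *    ret = tracing_snapshot_cond_enable(tr, &my_data, my_threshold_update);
 *    ...
 *    if (!ret)
 *        tracing_snapshot_cond_disable(tr);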
*/ int tracing_snapshot_cond_disable(struct trace_array *tr) { int ret = 0; local_irq_disable(); arch_spin_lock(&tr->max_lock); if (!tr->cond_snapshot) ret = -EINVAL; else { kfree(tr->cond_snapshot); tr->cond_snapshot = NULL; } arch_spin_unlock(&tr->max_lock); local_irq_enable(); tracing_disarm_snapshot(tr); return ret; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #else void tracing_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot); void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) { WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot_cond); int tracing_alloc_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); return -ENODEV; } EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); void tracing_snapshot_alloc(void) { /* Give warning */ tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); void *tracing_cond_snapshot_data(struct trace_array *tr) { return NULL; } EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) { return -ENODEV; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); int tracing_snapshot_cond_disable(struct trace_array *tr) { return false; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #define free_snapshot(tr) do { } while (0) #define tracing_arm_snapshot_locked(tr) ({ -EBUSY; }) #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) { if (tr->array_buffer.buffer) ring_buffer_record_off(tr->array_buffer.buffer); /* * This flag is looked at when buffers haven't been allocated * yet, or by some tracers (like irqsoff), that just want to * know if the ring buffer has been disabled, but it can handle * races of where it gets disabled but we still do a record. * As the check is in the fast path of the tracers, it is more * important to be fast than accurate. */ tr->buffer_disabled = 1; } /** * tracer_tracing_disable() - temporary disable the buffer from write * @tr: The trace array to disable its buffer for * * Expects trace_tracing_enable() to re-enable tracing. * The difference between this and tracer_tracing_off() is that this * is a counter and can nest, whereas, tracer_tracing_off() can * be called multiple times and a single trace_tracing_on() will * enable it. */ void tracer_tracing_disable(struct trace_array *tr) { if (WARN_ON_ONCE(!tr->array_buffer.buffer)) return; ring_buffer_record_disable(tr->array_buffer.buffer); } /** * tracer_tracing_enable() - counter part of tracer_tracing_disable() * @tr: The trace array that had tracer_tracincg_disable() called on it * * This is called after tracer_tracing_disable() has been called on @tr, * when it's safe to re-enable tracing. */ void tracer_tracing_enable(struct trace_array *tr) { if (WARN_ON_ONCE(!tr->array_buffer.buffer)) return; ring_buffer_record_enable(tr->array_buffer.buffer); } /** * tracing_off - turn off tracing buffers * * This function stops the tracing buffers from recording data. * It does not disable any overhead the tracers themselves may * be causing. This function simply causes all recording to * the ring buffers to fail. 
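 *
 * A common debugging pattern is to stop recording right where a problem
 * is detected, so the events leading up to it stay in the ring buffer
 * (struct my_dev and my_dev_error() are only illustrative):
 *
 *    static void my_check(struct my_dev *dev)
 *    {
 *        if (my_dev_error(dev)) {
 *            trace_printk("device error, freezing trace\n");
 *            tracing_off();
 *        }
 *    }
 *
 * Recording can later be resumed with tracing_on() or by writing 1 to
 * /sys/kernel/tracing/tracing_on.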
*/ void tracing_off(void) { tracer_tracing_off(&global_trace); } EXPORT_SYMBOL_GPL(tracing_off); void disable_trace_on_warning(void) { if (__disable_trace_on_warning) { trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_, "Disabling tracing due to warning\n"); tracing_off(); } } /** * tracer_tracing_is_on - show real state of ring buffer enabled * @tr : the trace array to know if ring buffer is enabled * * Shows real state of the ring buffer if it is enabled or not. */ bool tracer_tracing_is_on(struct trace_array *tr) { if (tr->array_buffer.buffer) return ring_buffer_record_is_set_on(tr->array_buffer.buffer); return !tr->buffer_disabled; } /** * tracing_is_on - show state of ring buffers enabled */ int tracing_is_on(void) { return tracer_tracing_is_on(&global_trace); } EXPORT_SYMBOL_GPL(tracing_is_on); static int __init set_buf_size(char *str) { unsigned long buf_size; if (!str) return 0; buf_size = memparse(str, &str); /* * nr_entries can not be zero and the startup * tests require some buffer space. Therefore * ensure we have at least 4096 bytes of buffer. */ trace_buf_size = max(4096UL, buf_size); return 1; } __setup("trace_buf_size=", set_buf_size); static int __init set_tracing_thresh(char *str) { unsigned long threshold; int ret; if (!str) return 0; ret = kstrtoul(str, 0, &threshold); if (ret < 0) return 0; tracing_thresh = threshold * 1000; return 1; } __setup("tracing_thresh=", set_tracing_thresh); unsigned long nsecs_to_usecs(unsigned long nsecs) { return nsecs / 1000; } /* * TRACE_FLAGS is defined as a tuple matching bit masks with strings. * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list * of strings in the order that the evals (enum) were defined. */ #undef C #define C(a, b) b /* These must match the bit positions in trace_iterator_flags */ static const char *trace_options[] = { TRACE_FLAGS NULL }; static struct { u64 (*func)(void); const char *name; int in_ns; /* is this clock in nanoseconds? */ } trace_clocks[] = { { trace_clock_local, "local", 1 }, { trace_clock_global, "global", 1 }, { trace_clock_counter, "counter", 0 }, { trace_clock_jiffies, "uptime", 0 }, { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, { ktime_get_boot_fast_ns, "boot", 1 }, { ktime_get_tai_fast_ns, "tai", 1 }, ARCH_TRACE_CLOCKS }; bool trace_clock_in_ns(struct trace_array *tr) { if (trace_clocks[tr->clock_id].in_ns) return true; return false; } /* * trace_parser_get_init - gets the buffer for trace parser */ int trace_parser_get_init(struct trace_parser *parser, int size) { memset(parser, 0, sizeof(*parser)); parser->buffer = kmalloc(size, GFP_KERNEL); if (!parser->buffer) return 1; parser->size = size; return 0; } /* * trace_parser_put - frees the buffer for trace parser */ void trace_parser_put(struct trace_parser *parser) { kfree(parser->buffer); parser->buffer = NULL; } /* * trace_get_user - reads the user input string separated by space * (matched by isspace(ch)) * * For each string found the 'struct trace_parser' is updated, * and the function returns. * * Returns number of bytes read. * * See kernel/trace/trace.h for 'struct trace_parser' details. 
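 *
 * Callers (see trace_pid_write() below) follow the same pattern,
 * sketched here with the error handling trimmed; MY_BUF_SIZE and the
 * ubuf/cnt variables stand for whatever the caller already has:
 *
 *    struct trace_parser parser;
 *    loff_t pos;
 *    ssize_t ret;
 *
 *    if (trace_parser_get_init(&parser, MY_BUF_SIZE))
 *        return -ENOMEM;
 *
 *    while (cnt > 0) {
 *        pos = 0;
 *        ret = trace_get_user(&parser, ubuf, cnt, &pos);
 *        if (ret < 0 || !trace_parser_loaded(&parser))
 *            break;
 *        ubuf += ret;
 *        cnt -= ret;
 *        // parser.buffer now holds one NUL-terminated word
 *        trace_parser_clear(&parser);
 *    }
 *    trace_parser_put(&parser);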
*/ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, size_t cnt, loff_t *ppos) { char ch; size_t read = 0; ssize_t ret; if (!*ppos) trace_parser_clear(parser); ret = get_user(ch, ubuf++); if (ret) goto fail; read++; cnt--; /* * The parser is not finished with the last write, * continue reading the user input without skipping spaces. */ if (!parser->cont) { /* skip white space */ while (cnt && isspace(ch)) { ret = get_user(ch, ubuf++); if (ret) goto fail; read++; cnt--; } parser->idx = 0; /* only spaces were written */ if (isspace(ch) || !ch) { *ppos += read; return read; } } /* read the non-space input */ while (cnt && !isspace(ch) && ch) { if (parser->idx < parser->size - 1) parser->buffer[parser->idx++] = ch; else { ret = -EINVAL; goto fail; } ret = get_user(ch, ubuf++); if (ret) goto fail; read++; cnt--; } /* We either got finished input or we have to wait for another call. */ if (isspace(ch) || !ch) { parser->buffer[parser->idx] = 0; parser->cont = false; } else if (parser->idx < parser->size - 1) { parser->cont = true; parser->buffer[parser->idx++] = ch; /* Make sure the parsed string always terminates with '\0'. */ parser->buffer[parser->idx] = 0; } else { ret = -EINVAL; goto fail; } *ppos += read; return read; fail: trace_parser_fail(parser); return ret; } /* TODO add a seq_buf_to_buffer() */ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; if (trace_seq_used(s) <= s->readpos) return -EBUSY; len = trace_seq_used(s) - s->readpos; if (cnt > len) cnt = len; memcpy(buf, s->buffer + s->readpos, cnt); s->readpos += cnt; return cnt; } unsigned long __read_mostly tracing_thresh; #ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops; #ifdef LATENCY_FS_NOTIFY static struct workqueue_struct *fsnotify_wq; static void latency_fsnotify_workfn(struct work_struct *work) { struct trace_array *tr = container_of(work, struct trace_array, fsnotify_work); fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY); } static void latency_fsnotify_workfn_irq(struct irq_work *iwork) { struct trace_array *tr = container_of(iwork, struct trace_array, fsnotify_irqwork); queue_work(fsnotify_wq, &tr->fsnotify_work); } static void trace_create_maxlat_file(struct trace_array *tr, struct dentry *d_tracer) { INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); tr->d_max_latency = trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, d_tracer, tr, &tracing_max_lat_fops); } __init static int latency_fsnotify_init(void) { fsnotify_wq = alloc_workqueue("tr_max_lat_wq", WQ_UNBOUND | WQ_HIGHPRI, 0); if (!fsnotify_wq) { pr_err("Unable to allocate tr_max_lat_wq\n"); return -ENOMEM; } return 0; } late_initcall_sync(latency_fsnotify_init); void latency_fsnotify(struct trace_array *tr) { if (!fsnotify_wq) return; /* * We cannot call queue_work(&tr->fsnotify_work) from here because it's * possible that we are called from __schedule() or do_idle(), which * could cause a deadlock. */ irq_work_queue(&tr->fsnotify_irqwork); } #else /* !LATENCY_FS_NOTIFY */ #define trace_create_maxlat_file(tr, d_tracer) \ trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ d_tracer, tr, &tracing_max_lat_fops) #endif /* * Copy the new maximum trace into the separate maximum-trace * structure. 
(this way the maximum trace is permanently saved, * for later retrieval via /sys/kernel/tracing/tracing_max_latency) */ static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct array_buffer *trace_buf = &tr->array_buffer; struct array_buffer *max_buf = &tr->max_buffer; struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); max_buf->cpu = cpu; max_buf->time_start = data->preempt_timestamp; max_data->saved_latency = tr->max_latency; max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; strscpy(max_data->comm, tsk->comm); max_data->pid = tsk->pid; /* * If tsk == current, then use current_uid(), as that does not use * RCU. The irq tracer can be called out of RCU scope. */ if (tsk == current) max_data->uid = current_uid(); else max_data->uid = task_uid(tsk); max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; max_data->policy = tsk->policy; max_data->rt_priority = tsk->rt_priority; /* record this tasks comm */ tracing_record_cmdline(tsk); latency_fsnotify(tr); } /** * update_max_tr - snapshot all trace buffers from global_trace to max_tr * @tr: tracer * @tsk: the task with the latency * @cpu: The cpu that initiated the trace. * @cond_data: User data associated with a conditional snapshot * * Flip the buffers between the @tr and the max_tr and record information * about which task was the cause of this latency. */ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, void *cond_data) { if (tr->stop_count) return; WARN_ON_ONCE(!irqs_disabled()); if (!tr->allocated_snapshot) { /* Only the nop tracer should hit this when disabling */ WARN_ON_ONCE(tr->current_trace != &nop_trace); return; } arch_spin_lock(&tr->max_lock); /* Inherit the recordable setting from array_buffer */ if (ring_buffer_record_is_set_on(tr->array_buffer.buffer)) ring_buffer_record_on(tr->max_buffer.buffer); else ring_buffer_record_off(tr->max_buffer.buffer); #ifdef CONFIG_TRACER_SNAPSHOT if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) { arch_spin_unlock(&tr->max_lock); return; } #endif swap(tr->array_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&tr->max_lock); /* Any waiters on the old snapshot buffer need to wake up */ ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS); } /** * update_max_tr_single - only copy one trace over, and reset the rest * @tr: tracer * @tsk: task with the latency * @cpu: the cpu of the buffer to copy. * * Flip the trace of a single CPU buffer between the @tr and the max_tr. */ void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { int ret; if (tr->stop_count) return; WARN_ON_ONCE(!irqs_disabled()); if (!tr->allocated_snapshot) { /* Only the nop tracer should hit this when disabling */ WARN_ON_ONCE(tr->current_trace != &nop_trace); return; } arch_spin_lock(&tr->max_lock); ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu); if (ret == -EBUSY) { /* * We failed to swap the buffer due to a commit taking * place on this CPU. We fail to record, but we reset * the max trace buffer (no one writes directly to it) * and flag that it failed. * Another reason is resize is in progress. 
*/ trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_, "Failed to swap buffers due to commit or resize in progress\n"); } WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&tr->max_lock); } #endif /* CONFIG_TRACER_MAX_TRACE */ struct pipe_wait { struct trace_iterator *iter; int wait_index; }; static bool wait_pipe_cond(void *data) { struct pipe_wait *pwait = data; struct trace_iterator *iter = pwait->iter; if (atomic_read_acquire(&iter->wait_index) != pwait->wait_index) return true; return iter->closed; } static int wait_on_pipe(struct trace_iterator *iter, int full) { struct pipe_wait pwait; int ret; /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return 0; pwait.wait_index = atomic_read_acquire(&iter->wait_index); pwait.iter = iter; ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full, wait_pipe_cond, &pwait); #ifdef CONFIG_TRACER_MAX_TRACE /* * Make sure this is still the snapshot buffer, as if a snapshot were * to happen, this would now be the main buffer. */ if (iter->snapshot) iter->array_buffer = &iter->tr->max_buffer; #endif return ret; } #ifdef CONFIG_FTRACE_STARTUP_TEST static bool selftests_can_run; struct trace_selftests { struct list_head list; struct tracer *type; }; static LIST_HEAD(postponed_selftests); static int save_selftest(struct tracer *type) { struct trace_selftests *selftest; selftest = kmalloc(sizeof(*selftest), GFP_KERNEL); if (!selftest) return -ENOMEM; selftest->type = type; list_add(&selftest->list, &postponed_selftests); return 0; } static int run_tracer_selftest(struct tracer *type) { struct trace_array *tr = &global_trace; struct tracer *saved_tracer = tr->current_trace; int ret; if (!type->selftest || tracing_selftest_disabled) return 0; /* * If a tracer registers early in boot up (before scheduling is * initialized and such), then do not run its selftests yet. * Instead, run it a little later in the boot process. */ if (!selftests_can_run) return save_selftest(type); if (!tracing_is_on()) { pr_warn("Selftest for tracer %s skipped due to tracing disabled\n", type->name); return 0; } /* * Run a selftest on this tracer. * Here we reset the trace buffer, and set the current * tracer to be this tracer. The tracer can then run some * internal tracing to verify that everything is in order. * If we fail, we do not register this tracer. 
*/ tracing_reset_online_cpus(&tr->array_buffer); tr->current_trace = type; #ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { /* If we expanded the buffers, make sure the max is expanded too */ if (tr->ring_buffer_expanded) ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, RING_BUFFER_ALL_CPUS); tr->allocated_snapshot = true; } #endif /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); /* the test is responsible for resetting too */ tr->current_trace = saved_tracer; if (ret) { printk(KERN_CONT "FAILED!\n"); /* Add the warning after printing 'FAILED' */ WARN_ON(1); return -1; } /* Only reset on passing, to avoid touching corrupted buffers */ tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { tr->allocated_snapshot = false; /* Shrink the max buffer again */ if (tr->ring_buffer_expanded) ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); } #endif printk(KERN_CONT "PASSED\n"); return 0; } static int do_run_tracer_selftest(struct tracer *type) { int ret; /* * Tests can take a long time, especially if they are run one after the * other, as does happen during bootup when all the tracers are * registered. This could cause the soft lockup watchdog to trigger. */ cond_resched(); tracing_selftest_running = true; ret = run_tracer_selftest(type); tracing_selftest_running = false; return ret; } static __init int init_trace_selftests(void) { struct trace_selftests *p, *n; struct tracer *t, **last; int ret; selftests_can_run = true; guard(mutex)(&trace_types_lock); if (list_empty(&postponed_selftests)) return 0; pr_info("Running postponed tracer tests:\n"); tracing_selftest_running = true; list_for_each_entry_safe(p, n, &postponed_selftests, list) { /* This loop can take minutes when sanitizers are enabled, so * lets make sure we allow RCU processing. */ cond_resched(); ret = run_tracer_selftest(p->type); /* If the test fails, then warn and remove from available_tracers */ if (ret < 0) { WARN(1, "tracer: %s failed selftest, disabling\n", p->type->name); last = &trace_types; for (t = trace_types; t; t = t->next) { if (t == p->type) { *last = t->next; break; } last = &t->next; } } list_del(&p->list); kfree(p); } tracing_selftest_running = false; return 0; } core_initcall(init_trace_selftests); #else static inline int do_run_tracer_selftest(struct tracer *type) { return 0; } #endif /* CONFIG_FTRACE_STARTUP_TEST */ static void add_tracer_options(struct trace_array *tr, struct tracer *t); static void __init apply_trace_boot_options(void); /** * register_tracer - register a tracer with the ftrace system. * @type: the plugin for the tracer * * Register a new plugin tracer. 
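 *
 * A minimal plugin looks roughly like this (my_tracer_init() and
 * my_tracer_reset() are only illustrative; see struct tracer in trace.h
 * for the full set of callbacks, which is assumed to include init and
 * reset):
 *
 *    static int my_tracer_init(struct trace_array *tr)
 *    {
 *        // arm whatever hooks this tracer needs
 *        return 0;
 *    }
 *
 *    static void my_tracer_reset(struct trace_array *tr)
 *    {
 *        // undo my_tracer_init()
 *    }
 *
 *    static struct tracer my_tracer __read_mostly = {
 *        .name  = "my_tracer",
 *        .init  = my_tracer_init,
 *        .reset = my_tracer_reset,
 *    };
 *
 *    static __init int my_tracer_register(void)
 *    {
 *        return register_tracer(&my_tracer);
 *    }
 *    core_initcall(my_tracer_register);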
*/ int __init register_tracer(struct tracer *type) { struct tracer *t; int ret = 0; if (!type->name) { pr_info("Tracer must have a name\n"); return -1; } if (strlen(type->name) >= MAX_TRACER_SIZE) { pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); return -1; } if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Can not register tracer %s due to lockdown\n", type->name); return -EPERM; } mutex_lock(&trace_types_lock); for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ pr_info("Tracer %s already registered\n", type->name); ret = -1; goto out; } } if (!type->set_flag) type->set_flag = &dummy_set_flag; if (!type->flags) { /*allocate a dummy tracer_flags*/ type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL); if (!type->flags) { ret = -ENOMEM; goto out; } type->flags->val = 0; type->flags->opts = dummy_tracer_opt; } else if (!type->flags->opts) type->flags->opts = dummy_tracer_opt; /* store the tracer for __set_tracer_option */ type->flags->trace = type; ret = do_run_tracer_selftest(type); if (ret < 0) goto out; type->next = trace_types; trace_types = type; add_tracer_options(&global_trace, type); out: mutex_unlock(&trace_types_lock); if (ret || !default_bootup_tracer) return ret; if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) return 0; printk(KERN_INFO "Starting tracer '%s'\n", type->name); /* Do we want this tracer to start on bootup? */ tracing_set_tracer(&global_trace, type->name); default_bootup_tracer = NULL; apply_trace_boot_options(); /* disable other selftests, since this will break it. */ disable_tracing_selftest("running a tracer"); return 0; } static void tracing_reset_cpu(struct array_buffer *buf, int cpu) { struct trace_buffer *buffer = buf->buffer; if (!buffer) return; ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ synchronize_rcu(); ring_buffer_reset_cpu(buffer, cpu); ring_buffer_record_enable(buffer); } void tracing_reset_online_cpus(struct array_buffer *buf) { struct trace_buffer *buffer = buf->buffer; if (!buffer) return; ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ synchronize_rcu(); buf->time_start = buffer_ftrace_now(buf, buf->cpu); ring_buffer_reset_online_cpus(buffer); ring_buffer_record_enable(buffer); } static void tracing_reset_all_cpus(struct array_buffer *buf) { struct trace_buffer *buffer = buf->buffer; if (!buffer) return; ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ synchronize_rcu(); buf->time_start = buffer_ftrace_now(buf, buf->cpu); ring_buffer_reset(buffer); ring_buffer_record_enable(buffer); } /* Must have trace_types_lock held */ void tracing_reset_all_online_cpus_unlocked(void) { struct trace_array *tr; lockdep_assert_held(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (!tr->clear_trace) continue; tr->clear_trace = false; tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE tracing_reset_online_cpus(&tr->max_buffer); #endif } } void tracing_reset_all_online_cpus(void) { guard(mutex)(&trace_types_lock); tracing_reset_all_online_cpus_unlocked(); } int is_tracing_stopped(void) { return global_trace.stop_count; } static void tracing_start_tr(struct trace_array *tr) { struct trace_buffer *buffer; if (tracing_disabled) return; guard(raw_spinlock_irqsave)(&tr->start_lock); if (--tr->stop_count) { if (WARN_ON_ONCE(tr->stop_count < 0)) { /* Someone screwed up their debugging */ tr->stop_count = 0; } return; } /* Prevent the 
buffers from switching */ arch_spin_lock(&tr->max_lock); buffer = tr->array_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); #ifdef CONFIG_TRACER_MAX_TRACE buffer = tr->max_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); #endif arch_spin_unlock(&tr->max_lock); } /** * tracing_start - quick start of the tracer * * If tracing is enabled but was stopped by tracing_stop, * this will start the tracer back up. */ void tracing_start(void) { return tracing_start_tr(&global_trace); } static void tracing_stop_tr(struct trace_array *tr) { struct trace_buffer *buffer; guard(raw_spinlock_irqsave)(&tr->start_lock); if (tr->stop_count++) return; /* Prevent the buffers from switching */ arch_spin_lock(&tr->max_lock); buffer = tr->array_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); #ifdef CONFIG_TRACER_MAX_TRACE buffer = tr->max_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); #endif arch_spin_unlock(&tr->max_lock); } /** * tracing_stop - quick stop of the tracer * * Light weight way to stop tracing. Use in conjunction with * tracing_start. */ void tracing_stop(void) { return tracing_stop_tr(&global_trace); } /* * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function * simplifies those functions and keeps them in sync. */ enum print_line_t trace_handle_return(struct trace_seq *s) { return trace_seq_has_overflowed(s) ? TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; } EXPORT_SYMBOL_GPL(trace_handle_return); static unsigned short migration_disable_value(void) { #if defined(CONFIG_SMP) return current->migration_disabled; #else return 0; #endif } unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) { unsigned int trace_flags = irqs_status; unsigned int pc; pc = preempt_count(); if (pc & NMI_MASK) trace_flags |= TRACE_FLAG_NMI; if (pc & HARDIRQ_MASK) trace_flags |= TRACE_FLAG_HARDIRQ; if (in_serving_softirq()) trace_flags |= TRACE_FLAG_SOFTIRQ; if (softirq_count() >> (SOFTIRQ_SHIFT + 1)) trace_flags |= TRACE_FLAG_BH_OFF; if (tif_need_resched()) trace_flags |= TRACE_FLAG_NEED_RESCHED; if (test_preempt_need_resched()) trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; if (IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY) && tif_test_bit(TIF_NEED_RESCHED_LAZY)) trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; } struct ring_buffer_event * trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, unsigned int trace_ctx) { return __trace_buffer_lock_reserve(buffer, type, len, trace_ctx); } DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); DEFINE_PER_CPU(int, trace_buffered_event_cnt); static int trace_buffered_event_ref; /** * trace_buffered_event_enable - enable buffering events * * When events are being filtered, it is quicker to use a temporary * buffer to write the event data into if there's a likely chance * that it will not be committed. The discard of the ring buffer * is not as fast as committing, and is much slower than copying * a commit. * * When an event is to be filtered, allocate per cpu buffers to * write the event data into, and if the event is filtered and discarded * it is simply dropped, otherwise, the entire data is to be committed * in one shot. 
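 *
 * Both calls are reference counted and expect event_mutex to be held;
 * they are typically made when a filter is attached to or detached from
 * an event file (attaching_filter is only an illustrative flag):
 *
 *    guard(mutex)(&event_mutex);
 *
 *    if (attaching_filter)
 *        trace_buffered_event_enable();
 *    else
 *        trace_buffered_event_disable();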
*/ void trace_buffered_event_enable(void) { struct ring_buffer_event *event; struct page *page; int cpu; WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); if (trace_buffered_event_ref++) return; for_each_tracing_cpu(cpu) { page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); /* This is just an optimization and can handle failures */ if (!page) { pr_err("Failed to allocate event buffer\n"); break; } event = page_address(page); memset(event, 0, sizeof(*event)); per_cpu(trace_buffered_event, cpu) = event; scoped_guard(preempt,) { if (cpu == smp_processor_id() && __this_cpu_read(trace_buffered_event) != per_cpu(trace_buffered_event, cpu)) WARN_ON_ONCE(1); } } } static void enable_trace_buffered_event(void *data) { this_cpu_dec(trace_buffered_event_cnt); } static void disable_trace_buffered_event(void *data) { this_cpu_inc(trace_buffered_event_cnt); } /** * trace_buffered_event_disable - disable buffering events * * When a filter is removed, it is faster to not use the buffered * events, and to commit directly into the ring buffer. Free up * the temp buffers when there are no more users. This requires * special synchronization with current events. */ void trace_buffered_event_disable(void) { int cpu; WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); if (WARN_ON_ONCE(!trace_buffered_event_ref)) return; if (--trace_buffered_event_ref) return; /* For each CPU, set the buffer as used. */ on_each_cpu_mask(tracing_buffer_mask, disable_trace_buffered_event, NULL, true); /* Wait for all current users to finish */ synchronize_rcu(); for_each_tracing_cpu(cpu) { free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); per_cpu(trace_buffered_event, cpu) = NULL; } /* * Wait for all CPUs that potentially started checking if they can use * their event buffer only after the previous synchronize_rcu() call and * they still read a valid pointer from trace_buffered_event. It must be * ensured they don't see cleared trace_buffered_event_cnt else they * could wrongly decide to use the pointed-to buffer which is now freed. */ synchronize_rcu(); /* For each CPU, relinquish the buffer */ on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL, true); } static struct trace_buffer *temp_buffer; struct ring_buffer_event * trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, struct trace_event_file *trace_file, int type, unsigned long len, unsigned int trace_ctx) { struct ring_buffer_event *entry; struct trace_array *tr = trace_file->tr; int val; *current_rb = tr->array_buffer.buffer; if (!tr->no_filter_buffering_ref && (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED))) { preempt_disable_notrace(); /* * Filtering is on, so try to use the per cpu buffer first. * This buffer will simulate a ring_buffer_event, * where the type_len is zero and the array[0] will * hold the full length. * (see include/linux/ring-buffer.h for details on * how the ring_buffer_event is structured). * * Using a temp buffer during filtering and copying it * on a matched filter is quicker than writing directly * into the ring buffer and then discarding it when * it doesn't match. That is because the discard * requires several atomic operations to get right. * Copying on match and doing nothing on a failed match * is still quicker than no copy on match, but having * to discard out of the ring buffer on a failed match. 
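 *
 * The per-cpu buffer is set up to look like a ring_buffer_event whose
 * type_len is zero, so only the stored length is needed at commit time:
 *
 *    entry->array[0] = len;       // full payload length
 *    // the payload itself starts at &entry->array[1]
 *
 * and __buffer_unlock_commit() later copies it into the real ring
 * buffer with:
 *
 *    ring_buffer_write(buffer, event->array[0], &event->array[1]);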
*/ if ((entry = __this_cpu_read(trace_buffered_event))) { int max_len = PAGE_SIZE - struct_size(entry, array, 1); val = this_cpu_inc_return(trace_buffered_event_cnt); /* * Preemption is disabled, but interrupts and NMIs * can still come in now. If that happens after * the above increment, then it will have to go * back to the old method of allocating the event * on the ring buffer, and if the filter fails, it * will have to call ring_buffer_discard_commit() * to remove it. * * Need to also check the unlikely case that the * length is bigger than the temp buffer size. * If that happens, then the reserve is pretty much * guaranteed to fail, as the ring buffer currently * only allows events less than a page. But that may * change in the future, so let the ring buffer reserve * handle the failure in that case. */ if (val == 1 && likely(len <= max_len)) { trace_event_setup(entry, type, trace_ctx); entry->array[0] = len; /* Return with preemption disabled */ return entry; } this_cpu_dec(trace_buffered_event_cnt); } /* __trace_buffer_lock_reserve() disables preemption */ preempt_enable_notrace(); } entry = __trace_buffer_lock_reserve(*current_rb, type, len, trace_ctx); /* * If tracing is off, but we have triggers enabled * we still need to look at the event data. Use the temp_buffer * to store the trace event for the trigger to use. It's recursive * safe and will not be recorded anywhere. */ if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { *current_rb = temp_buffer; entry = __trace_buffer_lock_reserve(*current_rb, type, len, trace_ctx); } return entry; } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); static DEFINE_RAW_SPINLOCK(tracepoint_iter_lock); static DEFINE_MUTEX(tracepoint_printk_mutex); static void output_printk(struct trace_event_buffer *fbuffer) { struct trace_event_call *event_call; struct trace_event_file *file; struct trace_event *event; unsigned long flags; struct trace_iterator *iter = tracepoint_print_iter; /* We should never get here if iter is NULL */ if (WARN_ON_ONCE(!iter)) return; event_call = fbuffer->trace_file->event_call; if (!event_call || !event_call->event.funcs || !event_call->event.funcs->trace) return; file = fbuffer->trace_file; if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && !filter_match_preds(file->filter, fbuffer->entry))) return; event = &fbuffer->trace_file->event_call->event; raw_spin_lock_irqsave(&tracepoint_iter_lock, flags); trace_seq_init(&iter->seq); iter->ent = fbuffer->entry; event_call->event.funcs->trace(iter, 0, event); trace_seq_putc(&iter->seq, 0); printk("%s", iter->seq.buffer); raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags); } int tracepoint_printk_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int save_tracepoint_printk; int ret; guard(mutex)(&tracepoint_printk_mutex); save_tracepoint_printk = tracepoint_printk; ret = proc_dointvec(table, write, buffer, lenp, ppos); /* * This will force exiting early, as tracepoint_printk * is always zero when tracepoint_printk_iter is not allocated */ if (!tracepoint_print_iter) tracepoint_printk = 0; if (save_tracepoint_printk == tracepoint_printk) return ret; if (tracepoint_printk) static_key_enable(&tracepoint_printk_key.key); else static_key_disable(&tracepoint_printk_key.key); return ret; } void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) { enum event_trigger_type tt = ETT_NONE; struct trace_event_file *file = fbuffer->trace_file; if 
(__event_trigger_test_discard(file, fbuffer->buffer, fbuffer->event, fbuffer->entry, &tt)) goto discard; if (static_key_false(&tracepoint_printk_key.key)) output_printk(fbuffer); if (static_branch_unlikely(&trace_event_exports_enabled)) ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT); trace_buffer_unlock_commit_regs(file->tr, fbuffer->buffer, fbuffer->event, fbuffer->trace_ctx, fbuffer->regs); discard: if (tt) event_triggers_post_call(file, tt); } EXPORT_SYMBOL_GPL(trace_event_buffer_commit); /* * Skip 3: * * trace_buffer_unlock_commit_regs() * trace_event_buffer_commit() * trace_event_raw_event_xxx() */ # define STACK_SKIP 3 void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct trace_buffer *buffer, struct ring_buffer_event *event, unsigned int trace_ctx, struct pt_regs *regs) { __buffer_unlock_commit(buffer, event); /* * If regs is not set, then skip the necessary functions. * Note, we can still get here via blktrace, wakeup tracer * and mmiotrace, but that's ok if they lose a function or * two. They are not that meaningful. */ ftrace_trace_stack(tr, buffer, trace_ctx, regs ? 0 : STACK_SKIP, regs); ftrace_trace_userstack(tr, buffer, trace_ctx); } /* * Similar to trace_buffer_unlock_commit_regs() but do not dump stack. */ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, struct ring_buffer_event *event) { __buffer_unlock_commit(buffer, event); } void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned int trace_ctx, struct ftrace_regs *fregs) { struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; int size = sizeof(*entry); size += FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long); event = __trace_buffer_lock_reserve(buffer, TRACE_FN, size, trace_ctx); if (!event) return; entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API if (fregs) { for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++) entry->args[i] = ftrace_regs_get_argument(fregs, i); } #endif if (static_branch_unlikely(&trace_function_exports_enabled)) ftrace_exports(event, TRACE_EXPORT_FUNCTION); __buffer_unlock_commit(buffer, event); } #ifdef CONFIG_STACKTRACE /* Allow 4 levels of nesting: normal, softirq, irq, NMI */ #define FTRACE_KSTACK_NESTING 4 #define FTRACE_KSTACK_ENTRIES (SZ_4K / FTRACE_KSTACK_NESTING) struct ftrace_stack { unsigned long calls[FTRACE_KSTACK_ENTRIES]; }; struct ftrace_stacks { struct ftrace_stack stacks[FTRACE_KSTACK_NESTING]; }; static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); static DEFINE_PER_CPU(int, ftrace_stack_reserve); static void __ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { struct ring_buffer_event *event; unsigned int size, nr_entries; struct ftrace_stack *fstack; struct stack_entry *entry; int stackidx; /* * Add one, for this function and the call to save_stack_trace() * If regs is set, then these functions will not be in the way. */ #ifndef CONFIG_UNWINDER_ORC if (!regs) skip++; #endif guard(preempt_notrace)(); stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; /* This should never happen. If it does, yell once and skip */ if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING)) goto out; /* * The above __this_cpu_inc_return() is 'atomic' cpu local. An * interrupt will either see the value pre increment or post * increment. 
If the interrupt happens pre increment it will have * restored the counter when it returns. We just need a barrier to * keep gcc from moving things around. */ barrier(); fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx; size = ARRAY_SIZE(fstack->calls); if (regs) { nr_entries = stack_trace_save_regs(regs, fstack->calls, size, skip); } else { nr_entries = stack_trace_save(fstack->calls, size, skip); } #ifdef CONFIG_DYNAMIC_FTRACE /* Mark entry of stack trace as trampoline code */ if (tr->ops && tr->ops->trampoline) { unsigned long tramp_start = tr->ops->trampoline; unsigned long tramp_end = tramp_start + tr->ops->trampoline_size; unsigned long *calls = fstack->calls; for (int i = 0; i < nr_entries; i++) { if (calls[i] >= tramp_start && calls[i] < tramp_end) calls[i] = FTRACE_TRAMPOLINE_MARKER; } } #endif event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, struct_size(entry, caller, nr_entries), trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); entry->size = nr_entries; memcpy(&entry->caller, fstack->calls, flex_array_size(entry, caller, nr_entries)); __buffer_unlock_commit(buffer, event); out: /* Again, don't let gcc optimize things here */ barrier(); __this_cpu_dec(ftrace_stack_reserve); } static inline void ftrace_trace_stack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) return; __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs); } void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip) { struct trace_buffer *buffer = tr->array_buffer.buffer; if (rcu_is_watching()) { __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL); return; } if (WARN_ON_ONCE(IS_ENABLED(CONFIG_GENERIC_ENTRY))) return; /* * When an NMI triggers, RCU is enabled via ct_nmi_enter(), * but if the above rcu_is_watching() failed, then the NMI * triggered someplace critical, and ct_irq_enter() should * not be called from NMI. */ if (unlikely(in_nmi())) return; ct_irq_enter_irqson(); __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL); ct_irq_exit_irqson(); } /** * trace_dump_stack - record a stack back trace in the trace buffer * @skip: Number of functions to skip (helper handlers) */ void trace_dump_stack(int skip) { if (tracing_disabled || tracing_selftest_running) return; #ifndef CONFIG_UNWINDER_ORC /* Skip 1 to skip this function. */ skip++; #endif __ftrace_trace_stack(printk_trace, printk_trace->array_buffer.buffer, tracing_gen_ctx(), skip, NULL); } EXPORT_SYMBOL_GPL(trace_dump_stack); #ifdef CONFIG_USER_STACKTRACE_SUPPORT static DEFINE_PER_CPU(int, user_stack_count); static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx) { struct ring_buffer_event *event; struct userstack_entry *entry; if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE)) return; /* * NMIs can not handle page faults, even with fix ups. * The save user stack can (and often does) fault. */ if (unlikely(in_nmi())) return; /* * prevent recursion, since the user stack tracing may * trigger other kernel events. 
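
/*
 * Hedged sketch of how the trace_dump_stack() export just above can be
 * used from a throwaway debug module (the module name is invented; this
 * assumes the usual declaration in <linux/kernel.h> and a kernel build
 * tree to compile against). The recorded stack shows up in
 * /sys/kernel/tracing/trace.
 */
#include <linux/module.h>
#include <linux/kernel.h>

static int __init stackdump_demo_init(void)
{
        trace_dump_stack(0);    /* record this call chain in the trace buffer */
        return 0;
}

static void __exit stackdump_demo_exit(void)
{
}

module_init(stackdump_demo_init);
module_exit(stackdump_demo_exit);
MODULE_DESCRIPTION("trace_dump_stack() illustration");
MODULE_LICENSE("GPL");
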
*/ guard(preempt)(); if (__this_cpu_read(user_stack_count)) return; __this_cpu_inc(user_stack_count); event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), trace_ctx); if (!event) goto out_drop_count; entry = ring_buffer_event_data(event); entry->tgid = current->tgid; memset(&entry->caller, 0, sizeof(entry->caller)); stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES); __buffer_unlock_commit(buffer, event); out_drop_count: __this_cpu_dec(user_stack_count); } #else /* CONFIG_USER_STACKTRACE_SUPPORT */ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx) { } #endif /* !CONFIG_USER_STACKTRACE_SUPPORT */ #endif /* CONFIG_STACKTRACE */ static inline void func_repeats_set_delta_ts(struct func_repeats_entry *entry, unsigned long long delta) { entry->bottom_delta_ts = delta & U32_MAX; entry->top_delta_ts = (delta >> 32); } void trace_last_func_repeats(struct trace_array *tr, struct trace_func_repeats *last_info, unsigned int trace_ctx) { struct trace_buffer *buffer = tr->array_buffer.buffer; struct func_repeats_entry *entry; struct ring_buffer_event *event; u64 delta; event = __trace_buffer_lock_reserve(buffer, TRACE_FUNC_REPEATS, sizeof(*entry), trace_ctx); if (!event) return; delta = ring_buffer_event_time_stamp(buffer, event) - last_info->ts_last_call; entry = ring_buffer_event_data(event); entry->ip = last_info->ip; entry->parent_ip = last_info->parent_ip; entry->count = last_info->count; func_repeats_set_delta_ts(entry, delta); __buffer_unlock_commit(buffer, event); } /* created for use with alloc_percpu */ struct trace_buffer_struct { int nesting; char buffer[4][TRACE_BUF_SIZE]; }; static struct trace_buffer_struct __percpu *trace_percpu_buffer; /* * This allows for lockless recording. If we're nested too deeply, then * this returns NULL. */ static char *get_trace_buf(void) { struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); if (!trace_percpu_buffer || buffer->nesting >= 4) return NULL; buffer->nesting++; /* Interrupts must see nesting incremented before we use the buffer */ barrier(); return &buffer->buffer[buffer->nesting - 1][0]; } static void put_trace_buf(void) { /* Don't let the decrement of nesting leak before this */ barrier(); this_cpu_dec(trace_percpu_buffer->nesting); } static int alloc_percpu_trace_buffer(void) { struct trace_buffer_struct __percpu *buffers; if (trace_percpu_buffer) return 0; buffers = alloc_percpu(struct trace_buffer_struct); if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer")) return -ENOMEM; trace_percpu_buffer = buffers; return 0; } static int buffers_allocated; void trace_printk_init_buffers(void) { if (buffers_allocated) return; if (alloc_percpu_trace_buffer()) return; /* trace_printk() is for debug use only. Don't use it in production. */ pr_warn("\n"); pr_warn("**********************************************************\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("** **\n"); pr_warn("** trace_printk() being used. Allocating extra memory. **\n"); pr_warn("** **\n"); pr_warn("** This means that this is a DEBUG kernel and it is **\n"); pr_warn("** unsafe for production use. **\n"); pr_warn("** **\n"); pr_warn("** If you see this message and you are not debugging **\n"); pr_warn("** the kernel, report this immediately to your vendor! 
**\n"); pr_warn("** **\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("**********************************************************\n"); /* Expand the buffers to set size */ tracing_update_buffers(&global_trace); buffers_allocated = 1; /* * trace_printk_init_buffers() can be called by modules. * If that happens, then we need to start cmdline recording * directly here. If the global_trace.buffer is already * allocated here, then this was called by module code. */ if (global_trace.array_buffer.buffer) tracing_start_cmdline_record(); } EXPORT_SYMBOL_GPL(trace_printk_init_buffers); void trace_printk_start_comm(void) { /* Start tracing comms if trace printk is set */ if (!buffers_allocated) return; tracing_start_cmdline_record(); } static void trace_printk_start_stop_comm(int enabled) { if (!buffers_allocated) return; if (enabled) tracing_start_cmdline_record(); else tracing_stop_cmdline_record(); } /** * trace_vbprintk - write binary msg to tracing buffer * @ip: The address of the caller * @fmt: The string format to write to the buffer * @args: Arguments for @fmt */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { struct ring_buffer_event *event; struct trace_buffer *buffer; struct trace_array *tr = READ_ONCE(printk_trace); struct bprint_entry *entry; unsigned int trace_ctx; char *tbuffer; int len = 0, size; if (!printk_binsafe(tr)) return trace_vprintk(ip, fmt, args); if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; /* Don't pollute graph traces with trace_vprintk internals */ pause_graph_tracing(); trace_ctx = tracing_gen_ctx(); guard(preempt_notrace)(); tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; goto out_nobuffer; } len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) goto out_put; size = sizeof(*entry) + sizeof(u32) * len; buffer = tr->array_buffer.buffer; scoped_guard(ring_buffer_nest, buffer) { event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, trace_ctx); if (!event) goto out_put; entry = ring_buffer_event_data(event); entry->ip = ip; entry->fmt = fmt; memcpy(entry->buf, tbuffer, sizeof(u32) * len); __buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); } out_put: put_trace_buf(); out_nobuffer: unpause_graph_tracing(); return len; } EXPORT_SYMBOL_GPL(trace_vbprintk); static __printf(3, 0) int __trace_array_vprintk(struct trace_buffer *buffer, unsigned long ip, const char *fmt, va_list args) { struct ring_buffer_event *event; int len = 0, size; struct print_entry *entry; unsigned int trace_ctx; char *tbuffer; if (tracing_disabled) return 0; /* Don't pollute graph traces with trace_vprintk internals */ pause_graph_tracing(); trace_ctx = tracing_gen_ctx(); guard(preempt_notrace)(); tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; goto out_nobuffer; } len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); size = sizeof(*entry) + len + 1; scoped_guard(ring_buffer_nest, buffer) { event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); entry->ip = ip; memcpy(&entry->buf, tbuffer, len + 1); __buffer_unlock_commit(buffer, event); ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); } out: put_trace_buf(); out_nobuffer: unpause_graph_tracing(); return len; } int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { if (tracing_selftest_running && tr == &global_trace) 
return 0; return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); } /** * trace_array_printk - Print a message to a specific instance * @tr: The instance trace_array descriptor * @ip: The instruction pointer that this is called from. * @fmt: The format to print (printf format) * * If a subsystem sets up its own instance, they have the right to * printk strings into their tracing instance buffer using this * function. Note, this function will not write into the top level * buffer (use trace_printk() for that), as writing into the top level * buffer should only have events that can be individually disabled. * trace_printk() is only used for debugging a kernel, and should not * be ever incorporated in normal use. * * trace_array_printk() can be used, as it will not add noise to the * top level tracing buffer. * * Note, trace_array_init_printk() must be called on @tr before this * can be used. */ int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { int ret; va_list ap; if (!tr) return -ENOENT; /* This is only allowed for created instances */ if (tr == &global_trace) return 0; if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; va_start(ap, fmt); ret = trace_array_vprintk(tr, ip, fmt, ap); va_end(ap); return ret; } EXPORT_SYMBOL_GPL(trace_array_printk); /** * trace_array_init_printk - Initialize buffers for trace_array_printk() * @tr: The trace array to initialize the buffers for * * As trace_array_printk() only writes into instances, they are OK to * have in the kernel (unlike trace_printk()). This needs to be called * before trace_array_printk() can be used on a trace_array. */ int trace_array_init_printk(struct trace_array *tr) { if (!tr) return -ENOENT; /* This is only allowed for created instances */ if (tr == &global_trace) return -EINVAL; return alloc_percpu_trace_buffer(); } EXPORT_SYMBOL_GPL(trace_array_init_printk); int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...) { int ret; va_list ap; if (!(printk_trace->trace_flags & TRACE_ITER_PRINTK)) return 0; va_start(ap, fmt); ret = __trace_array_vprintk(buffer, ip, fmt, ap); va_end(ap); return ret; } int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(printk_trace, ip, fmt, args); } EXPORT_SYMBOL_GPL(trace_vprintk); static void trace_iterator_increment(struct trace_iterator *iter) { struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); iter->idx++; if (buf_iter) ring_buffer_iter_advance(buf_iter); } static struct trace_entry * peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) { event = ring_buffer_iter_peek(buf_iter, ts); if (lost_events) *lost_events = ring_buffer_iter_dropped(buf_iter) ? 
(unsigned long)-1 : 0; } else { event = ring_buffer_peek(iter->array_buffer->buffer, cpu, ts, lost_events); } if (event) { iter->ent_size = ring_buffer_event_length(event); return ring_buffer_event_data(event); } iter->ent_size = 0; return NULL; } static struct trace_entry * __find_next_entry(struct trace_iterator *iter, int *ent_cpu, unsigned long *missing_events, u64 *ent_ts) { struct trace_buffer *buffer = iter->array_buffer->buffer; struct trace_entry *ent, *next = NULL; unsigned long lost_events = 0, next_lost = 0; int cpu_file = iter->cpu_file; u64 next_ts = 0, ts; int next_cpu = -1; int next_size = 0; int cpu; /* * If we are in a per_cpu trace file, don't bother by iterating over * all cpu and peek directly. */ if (cpu_file > RING_BUFFER_ALL_CPUS) { if (ring_buffer_empty_cpu(buffer, cpu_file)) return NULL; ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); if (ent_cpu) *ent_cpu = cpu_file; return ent; } for_each_tracing_cpu(cpu) { if (ring_buffer_empty_cpu(buffer, cpu)) continue; ent = peek_next_entry(iter, cpu, &ts, &lost_events); /* * Pick the entry with the smallest timestamp: */ if (ent && (!next || ts < next_ts)) { next = ent; next_cpu = cpu; next_ts = ts; next_lost = lost_events; next_size = iter->ent_size; } } iter->ent_size = next_size; if (ent_cpu) *ent_cpu = next_cpu; if (ent_ts) *ent_ts = next_ts; if (missing_events) *missing_events = next_lost; return next; } #define STATIC_FMT_BUF_SIZE 128 static char static_fmt_buf[STATIC_FMT_BUF_SIZE]; char *trace_iter_expand_format(struct trace_iterator *iter) { char *tmp; /* * iter->tr is NULL when used with tp_printk, which makes * this get called where it is not safe to call krealloc(). */ if (!iter->tr || iter->fmt == static_fmt_buf) return NULL; tmp = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE, GFP_KERNEL); if (tmp) { iter->fmt_size += STATIC_FMT_BUF_SIZE; iter->fmt = tmp; } return tmp; } /* Returns true if the string is safe to dereference from an event */ static bool trace_safe_str(struct trace_iterator *iter, const char *str) { unsigned long addr = (unsigned long)str; struct trace_event *trace_event; struct trace_event_call *event; /* OK if part of the event data */ if ((addr >= (unsigned long)iter->ent) && (addr < (unsigned long)iter->ent + iter->ent_size)) return true; /* OK if part of the temp seq buffer */ if ((addr >= (unsigned long)iter->tmp_seq.buffer) && (addr < (unsigned long)iter->tmp_seq.buffer + TRACE_SEQ_BUFFER_SIZE)) return true; /* Core rodata can not be freed */ if (is_kernel_rodata(addr)) return true; if (trace_is_tracepoint_string(str)) return true; /* * Now this could be a module event, referencing core module * data, which is OK. */ if (!iter->ent) return false; trace_event = ftrace_find_event(iter->ent->type); if (!trace_event) return false; event = container_of(trace_event, struct trace_event_call, event); if ((event->flags & TRACE_EVENT_FL_DYNAMIC) || !event->module) return false; /* Would rather have rodata, but this will suffice */ if (within_module_core(addr, event->module)) return true; return false; } /** * ignore_event - Check dereferenced fields while writing to the seq buffer * @iter: The iterator that holds the seq buffer and the event being printed * * At boot up, test_event_printk() will flag any event that dereferences * a string with "%s" that does exist in the ring buffer. It may still * be valid, as the string may point to a static string in the kernel * rodata that never gets freed. 
But if the string pointer is pointing * to something that was allocated, there's a chance that it can be freed * by the time the user reads the trace. This would cause a bad memory * access by the kernel and possibly crash the system. * * This function will check if the event has any fields flagged as needing * to be checked at runtime and perform those checks. * * If it is found that a field is unsafe, it will write into the @iter->seq * a message stating what was found to be unsafe. * * @return: true if the event is unsafe and should be ignored, * false otherwise. */ bool ignore_event(struct trace_iterator *iter) { struct ftrace_event_field *field; struct trace_event *trace_event; struct trace_event_call *event; struct list_head *head; struct trace_seq *seq; const void *ptr; trace_event = ftrace_find_event(iter->ent->type); seq = &iter->seq; if (!trace_event) { trace_seq_printf(seq, "EVENT ID %d NOT FOUND?\n", iter->ent->type); return true; } event = container_of(trace_event, struct trace_event_call, event); if (!(event->flags & TRACE_EVENT_FL_TEST_STR)) return false; head = trace_get_fields(event); if (!head) { trace_seq_printf(seq, "FIELDS FOR EVENT '%s' NOT FOUND?\n", trace_event_name(event)); return true; } /* Offsets are from the iter->ent that points to the raw event */ ptr = iter->ent; list_for_each_entry(field, head, link) { const char *str; bool good; if (!field->needs_test) continue; str = *(const char **)(ptr + field->offset); good = trace_safe_str(iter, str); /* * If you hit this warning, it is likely that the * trace event in question used %s on a string that * was saved at the time of the event, but may not be * around when the trace is read. Use __string(), * __assign_str() and __get_str() helpers in the TRACE_EVENT() * instead. See samples/trace_events/trace-events-sample.h * for reference. */ if (WARN_ONCE(!good, "event '%s' has unsafe pointer field '%s'", trace_event_name(event), field->name)) { trace_seq_printf(seq, "EVENT %s: HAS UNSAFE POINTER FIELD '%s'\n", trace_event_name(event), field->name); return true; } } return false; } const char *trace_event_format(struct trace_iterator *iter, const char *fmt) { const char *p, *new_fmt; char *q; if (WARN_ON_ONCE(!fmt)) return fmt; if (!iter->tr || iter->tr->trace_flags & TRACE_ITER_HASH_PTR) return fmt; p = fmt; new_fmt = q = iter->fmt; while (*p) { if (unlikely(q - new_fmt + 3 > iter->fmt_size)) { if (!trace_iter_expand_format(iter)) return fmt; q += iter->fmt - new_fmt; new_fmt = iter->fmt; } *q++ = *p++; /* Replace %p with %px */ if (p[-1] == '%') { if (p[0] == '%') { *q++ = *p++; } else if (p[0] == 'p' && !isalnum(p[1])) { *q++ = *p++; *q++ = 'x'; } } } *q = '\0'; return new_fmt; } #define STATIC_TEMP_BUF_SIZE 128 static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4); /* Find the next real entry, without updating the iterator itself */ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { /* __find_next_entry will reset ent_size */ int ent_size = iter->ent_size; struct trace_entry *entry; /* * If called from ftrace_dump(), then the iter->temp buffer * will be the static_temp_buf and not created from kmalloc. * If the entry size is greater than the buffer, we can * not save it. Just return NULL in that case. This is only * used to add markers when two consecutive events' time * stamps have a large delta. 
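
/*
 * A small userspace sketch of the format rewrite done by
 * trace_event_format() above (function and buffer names below are made
 * up for the illustration): "%p" becomes "%px" so pointers print
 * unhashed, "%%" is left alone, and extensions such as "%ps" are not
 * touched because the character after the 'p' is alphanumeric.
 */
#include <ctype.h>
#include <stdio.h>

static void demo_expand_p(const char *p, char *q, size_t qsize)
{
        char *end = q + qsize - 4;      /* worst case adds "px" plus NUL */

        while (*p && q < end) {
                *q++ = *p++;
                if (q[-1] == '%') {
                        if (p[0] == '%') {
                                *q++ = *p++;                    /* literal "%%" */
                        } else if (p[0] == 'p' && !isalnum((unsigned char)p[1])) {
                                *q++ = *p++;
                                *q++ = 'x';                     /* "%p" -> "%px" */
                        }
                }
        }
        *q = '\0';
}

int main(void)
{
        char out[64];

        demo_expand_p("addr=%p done=100%% sym=%ps", out, sizeof(out));
        printf("%s\n", out);    /* addr=%px done=100%% sym=%ps */
        return 0;
}
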
See trace_print_lat_context() */ if (iter->temp == static_temp_buf && STATIC_TEMP_BUF_SIZE < ent_size) return NULL; /* * The __find_next_entry() may call peek_next_entry(), which may * call ring_buffer_peek() that may make the contents of iter->ent * undefined. Need to copy iter->ent now. */ if (iter->ent && iter->ent != iter->temp) { if ((!iter->temp || iter->temp_size < iter->ent_size) && !WARN_ON_ONCE(iter->temp == static_temp_buf)) { void *temp; temp = kmalloc(iter->ent_size, GFP_KERNEL); if (!temp) return NULL; kfree(iter->temp); iter->temp = temp; iter->temp_size = iter->ent_size; } memcpy(iter->temp, iter->ent, iter->ent_size); iter->ent = iter->temp; } entry = __find_next_entry(iter, ent_cpu, NULL, ent_ts); /* Put back the original ent_size */ iter->ent_size = ent_size; return entry; } /* Find the next real entry, and increment the iterator to the next entry */ void *trace_find_next_entry_inc(struct trace_iterator *iter) { iter->ent = __find_next_entry(iter, &iter->cpu, &iter->lost_events, &iter->ts); if (iter->ent) trace_iterator_increment(iter); return iter->ent ? iter : NULL; } static void trace_consume(struct trace_iterator *iter) { ring_buffer_consume(iter->array_buffer->buffer, iter->cpu, &iter->ts, &iter->lost_events); } static void *s_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_iterator *iter = m->private; int i = (int)*pos; void *ent; WARN_ON_ONCE(iter->leftover); (*pos)++; /* can't go backwards */ if (iter->idx > i) return NULL; if (iter->idx < 0) ent = trace_find_next_entry_inc(iter); else ent = iter; while (ent && iter->idx < i) ent = trace_find_next_entry_inc(iter); iter->pos = *pos; return ent; } void tracing_iter_reset(struct trace_iterator *iter, int cpu) { struct ring_buffer_iter *buf_iter; unsigned long entries = 0; u64 ts; per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = 0; buf_iter = trace_buffer_iter(iter, cpu); if (!buf_iter) return; ring_buffer_iter_reset(buf_iter); /* * We could have the case with the max latency tracers * that a reset never took place on a cpu. This is evident * by the timestamp being before the start of the buffer. */ while (ring_buffer_iter_peek(buf_iter, &ts)) { if (ts >= iter->array_buffer->time_start) break; entries++; ring_buffer_iter_advance(buf_iter); /* This could be a big loop */ cond_resched(); } per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries; } /* * The current tracer is copied to avoid a global locking * all around. */ static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_iterator *iter = m->private; struct trace_array *tr = iter->tr; int cpu_file = iter->cpu_file; void *p = NULL; loff_t l = 0; int cpu; mutex_lock(&trace_types_lock); if (unlikely(tr->current_trace != iter->trace)) { /* Close iter->trace before switching to the new current tracer */ if (iter->trace->close) iter->trace->close(iter); iter->trace = tr->current_trace; /* Reopen the new current tracer */ if (iter->trace->open) iter->trace->open(iter); } mutex_unlock(&trace_types_lock); #ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->trace->use_max_tr) return ERR_PTR(-EBUSY); #endif if (*pos != iter->pos) { iter->ent = NULL; iter->cpu = 0; iter->idx = -1; if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) tracing_iter_reset(iter, cpu); } else tracing_iter_reset(iter, cpu_file); iter->leftover = 0; for (p = iter; p && l < *pos; p = s_next(m, p, &l)) ; } else { /* * If we overflowed the seq_file before, then we want * to just reuse the trace_seq buffer again. 
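
/*
 * Userspace sketch of the merge that __find_next_entry() (used by
 * trace_find_next_entry() and trace_find_next_entry_inc() above) performs
 * across the per-CPU ring buffers: the reader always takes the pending
 * entry with the smallest timestamp, so output is globally ordered even
 * though each CPU buffer is only ordered locally. All names and data are
 * invented.
 */
#include <stdio.h>

#define DEMO_CPUS       3
#define DEMO_ENTRIES    3

static const unsigned long long demo_ts[DEMO_CPUS][DEMO_ENTRIES] = {
        { 10, 40, 70 },
        { 20, 30, 90 },
        { 15, 60, 80 },
};
static int demo_pos[DEMO_CPUS];

int main(void)
{
        for (;;) {
                unsigned long long next_ts = 0;
                int next_cpu = -1;

                for (int cpu = 0; cpu < DEMO_CPUS; cpu++) {
                        if (demo_pos[cpu] >= DEMO_ENTRIES)
                                continue;       /* this per-CPU buffer is drained */
                        if (next_cpu < 0 || demo_ts[cpu][demo_pos[cpu]] < next_ts) {
                                next_cpu = cpu;
                                next_ts = demo_ts[cpu][demo_pos[cpu]];
                        }
                }
                if (next_cpu < 0)
                        break;                  /* every buffer is empty */
                printf("CPU:%d ts:%llu\n", next_cpu, next_ts);
                demo_pos[next_cpu]++;
        }
        return 0;
}
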
*/ if (iter->leftover) p = iter; else { l = *pos - 1; p = s_next(m, p, &l); } } trace_event_read_lock(); trace_access_lock(cpu_file); return p; } static void s_stop(struct seq_file *m, void *p) { struct trace_iterator *iter = m->private; #ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->trace->use_max_tr) return; #endif trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } static void get_total_entries_cpu(struct array_buffer *buf, unsigned long *total, unsigned long *entries, int cpu) { unsigned long count; count = ring_buffer_entries_cpu(buf->buffer, cpu); /* * If this buffer has skipped entries, then we hold all * entries for the trace and we need to ignore the * ones before the time stamp. */ if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; /* total is the same as the entries */ *total = count; } else *total = count + ring_buffer_overrun_cpu(buf->buffer, cpu); *entries = count; } static void get_total_entries(struct array_buffer *buf, unsigned long *total, unsigned long *entries) { unsigned long t, e; int cpu; *total = 0; *entries = 0; for_each_tracing_cpu(cpu) { get_total_entries_cpu(buf, &t, &e, cpu); *total += t; *entries += e; } } unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu) { unsigned long total, entries; if (!tr) tr = &global_trace; get_total_entries_cpu(&tr->array_buffer, &total, &entries, cpu); return entries; } unsigned long trace_total_entries(struct trace_array *tr) { unsigned long total, entries; if (!tr) tr = &global_trace; get_total_entries(&tr->array_buffer, &total, &entries); return entries; } static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n" "# / _-----=> irqs-off/BH-disabled\n" "# | / _----=> need-resched \n" "# || / _---=> hardirq/softirq \n" "# ||| / _--=> preempt-depth \n" "# |||| / _-=> migrate-disable \n" "# ||||| / delay \n" "# cmd pid |||||| time | caller \n" "# \\ / |||||| \\ | / \n"); } static void print_event_info(struct array_buffer *buf, struct seq_file *m) { unsigned long total; unsigned long entries; get_total_entries(buf, &total, &entries); seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", entries, total, num_online_cpus()); seq_puts(m, "#\n"); } static void print_func_help_header(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; print_event_info(buf, m); seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? " TGID " : ""); seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); } static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; static const char space[] = " "; int prec = tgid ? 
12 : 2; print_event_info(buf, m); seq_printf(m, "# %.*s _-----=> irqs-off/BH-disabled\n", prec, space); seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space); seq_printf(m, "# %.*s|||| / delay\n", prec, space); seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID "); seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | "); } void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (global_trace.trace_flags & TRACE_ITER_SYM_MASK); struct array_buffer *buf = iter->array_buffer; struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); struct tracer *type = iter->trace; unsigned long entries; unsigned long total; const char *name = type->name; get_total_entries(buf, &total, &entries); seq_printf(m, "# %s latency trace v1.1.5 on %s\n", name, init_utsname()->release); seq_puts(m, "# -----------------------------------" "---------------------------------\n"); seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |" " (M:%s VP:%d, KP:%d, SP:%d HP:%d", nsecs_to_usecs(data->saved_latency), entries, total, buf->cpu, preempt_model_str(), /* These are reserved for later use */ 0, 0, 0, 0); #ifdef CONFIG_SMP seq_printf(m, " #P:%d)\n", num_online_cpus()); #else seq_puts(m, ")\n"); #endif seq_puts(m, "# -----------------\n"); seq_printf(m, "# | task: %.16s-%d " "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", data->comm, data->pid, from_kuid_munged(seq_user_ns(m), data->uid), data->nice, data->policy, data->rt_priority); seq_puts(m, "# -----------------\n"); if (data->critical_start) { seq_puts(m, "# => started at: "); seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); trace_print_seq(m, &iter->seq); seq_puts(m, "\n# => ended at: "); seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); trace_print_seq(m, &iter->seq); seq_puts(m, "\n#\n"); } seq_puts(m, "#\n"); } static void test_cpu_buff_start(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_array *tr = iter->tr; if (!(tr->trace_flags & TRACE_ITER_ANNOTATE)) return; if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) return; if (cpumask_available(iter->started) && cpumask_test_cpu(iter->cpu, iter->started)) return; if (per_cpu_ptr(iter->array_buffer->data, iter->cpu)->skipped_entries) return; if (cpumask_available(iter->started)) cpumask_set_cpu(iter->cpu, iter->started); /* Don't print started cpu buffer for the first entry of the trace */ if (iter->idx > 1) trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); } static enum print_line_t print_trace_fmt(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; unsigned long sym_flags = (tr->trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *entry; struct trace_event *event; entry = iter->ent; test_cpu_buff_start(iter); event = ftrace_find_event(entry->type); if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { if (iter->iter_flags & TRACE_FILE_LAT_FMT) trace_print_lat_context(iter); else trace_print_context(iter); } if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; if (event) { if (tr->trace_flags & TRACE_ITER_FIELDS) return print_event_fields(iter, event); /* * For TRACE_EVENT() events, the print_fmt is not * safe to use if the array has delta offsets * Force printing via the fields. 
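
/*
 * Userspace sketch of the "%.*s" trick used by print_func_help_header_irq()
 * above: the precision argument caps how many characters of a constant
 * blank pad are printed, so the same header text shifts right by exactly
 * the width of the optional TGID column. Values below are invented.
 */
#include <stdio.h>

static void demo_header(int show_tgid)
{
        static const char space[] = "            ";     /* 12 spaces of padding */
        int prec = show_tgid ? 12 : 2;

        printf("# %.*s _-----=> irqs-off\n", prec, space);
        printf("# %.*s/ _----=> need-resched\n", prec, space);
        printf("# TASK-PID %.*sCPU#\n", prec, show_tgid ? "   TGID     " : "  ");
}

int main(void)
{
        demo_header(0);
        demo_header(1);
        return 0;
}
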
*/ if ((tr->text_delta) && event->type > __TRACE_LAST_TYPE) return print_event_fields(iter, event); return event->funcs->trace(iter, sym_flags, event); } trace_seq_printf(s, "Unknown type %d\n", entry->type); return trace_handle_return(s); } static enum print_line_t print_raw_fmt(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry; struct trace_event *event; entry = iter->ent; if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) trace_seq_printf(s, "%d %d %llu ", entry->pid, iter->cpu, iter->ts); if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; event = ftrace_find_event(entry->type); if (event) return event->funcs->raw(iter, 0, event); trace_seq_printf(s, "%d ?\n", entry->type); return trace_handle_return(s); } static enum print_line_t print_hex_fmt(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; struct trace_entry *entry; struct trace_event *event; entry = iter->ent; if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { SEQ_PUT_HEX_FIELD(s, entry->pid); SEQ_PUT_HEX_FIELD(s, iter->cpu); SEQ_PUT_HEX_FIELD(s, iter->ts); if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; } event = ftrace_find_event(entry->type); if (event) { enum print_line_t ret = event->funcs->hex(iter, 0, event); if (ret != TRACE_TYPE_HANDLED) return ret; } SEQ_PUT_FIELD(s, newline); return trace_handle_return(s); } static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry; struct trace_event *event; entry = iter->ent; if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { SEQ_PUT_FIELD(s, entry->pid); SEQ_PUT_FIELD(s, iter->cpu); SEQ_PUT_FIELD(s, iter->ts); if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; } event = ftrace_find_event(entry->type); return event ? event->funcs->binary(iter, 0, event) : TRACE_TYPE_HANDLED; } int trace_empty(struct trace_iterator *iter) { struct ring_buffer_iter *buf_iter; int cpu; /* If we are looking at one CPU buffer, only check that one */ if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { cpu = iter->cpu_file; buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) { if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { if (!ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu)) return 0; } return 1; } for_each_tracing_cpu(cpu) { buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) { if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { if (!ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu)) return 0; } } return 1; } /* Called with trace_event_read_lock() held. 
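
/*
 * Userspace sketch of the option-driven dispatch that print_trace_line()
 * below performs once the per-tracer print_line() callback has declined:
 * the binary format wins over hex, hex over raw, and the default pretty
 * printer runs only when none of those option bits are set. The flag
 * names here are invented for the illustration.
 */
#include <stdio.h>

#define DEMO_ITER_BIN   (1u << 0)
#define DEMO_ITER_HEX   (1u << 1)
#define DEMO_ITER_RAW   (1u << 2)

static const char *demo_pick_format(unsigned int flags)
{
        if (flags & DEMO_ITER_BIN)
                return "bin";
        if (flags & DEMO_ITER_HEX)
                return "hex";
        if (flags & DEMO_ITER_RAW)
                return "raw";
        return "default";
}

int main(void)
{
        printf("%s\n", demo_pick_format(DEMO_ITER_HEX | DEMO_ITER_RAW));        /* hex */
        printf("%s\n", demo_pick_format(0));                                    /* default */
        return 0;
}
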
*/ enum print_line_t print_trace_line(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; unsigned long trace_flags = tr->trace_flags; enum print_line_t ret; if (iter->lost_events) { if (iter->lost_events == (unsigned long)-1) trace_seq_printf(&iter->seq, "CPU:%d [LOST EVENTS]\n", iter->cpu); else trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", iter->cpu, iter->lost_events); if (trace_seq_has_overflowed(&iter->seq)) return TRACE_TYPE_PARTIAL_LINE; } if (iter->trace && iter->trace->print_line) { ret = iter->trace->print_line(iter); if (ret != TRACE_TYPE_UNHANDLED) return ret; } if (iter->ent->type == TRACE_BPUTS && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) return trace_print_bputs_msg_only(iter); if (iter->ent->type == TRACE_BPRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) return trace_print_bprintk_msg_only(iter); if (iter->ent->type == TRACE_PRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) return trace_print_printk_msg_only(iter); if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); if (trace_flags & TRACE_ITER_HEX) return print_hex_fmt(iter); if (trace_flags & TRACE_ITER_RAW) return print_raw_fmt(iter); return print_trace_fmt(iter); } void trace_latency_header(struct seq_file *m) { struct trace_iterator *iter = m->private; struct trace_array *tr = iter->tr; /* print nothing if the buffers are empty */ if (trace_empty(iter)) return; if (iter->iter_flags & TRACE_FILE_LAT_FMT) print_trace_header(m, iter); if (!(tr->trace_flags & TRACE_ITER_VERBOSE)) print_lat_help_header(m); } void trace_default_header(struct seq_file *m) { struct trace_iterator *iter = m->private; struct trace_array *tr = iter->tr; unsigned long trace_flags = tr->trace_flags; if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) return; if (iter->iter_flags & TRACE_FILE_LAT_FMT) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) return; print_trace_header(m, iter); if (!(trace_flags & TRACE_ITER_VERBOSE)) print_lat_help_header(m); } else { if (!(trace_flags & TRACE_ITER_VERBOSE)) { if (trace_flags & TRACE_ITER_IRQ_INFO) print_func_help_header_irq(iter->array_buffer, m, trace_flags); else print_func_help_header(iter->array_buffer, m, trace_flags); } } } static void test_ftrace_alive(struct seq_file *m) { if (!ftrace_is_dead()) return; seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n" "# MAY BE MISSING FUNCTION EVENTS\n"); } #ifdef CONFIG_TRACER_MAX_TRACE static void show_snapshot_main_help(struct seq_file *m) { seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n" "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" "# Takes a snapshot of the main buffer.\n" "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n" "# (Doesn't have to be '2' works with any number that\n" "# is not a '0' or '1')\n"); } static void show_snapshot_percpu_help(struct seq_file *m) { seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" "# Takes a snapshot of the main buffer for this cpu.\n"); #else seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n" "# Must use main snapshot file to allocate.\n"); #endif seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n" "# (Doesn't have to be '2' works with any number that\n" "# is 
not a '0' or '1')\n"); } static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { if (iter->tr->allocated_snapshot) seq_puts(m, "#\n# * Snapshot is allocated *\n#\n"); else seq_puts(m, "#\n# * Snapshot is freed *\n#\n"); seq_puts(m, "# Snapshot commands:\n"); if (iter->cpu_file == RING_BUFFER_ALL_CPUS) show_snapshot_main_help(m); else show_snapshot_percpu_help(m); } #else /* Should never be called */ static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } #endif static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; int ret; if (iter->ent == NULL) { if (iter->tr) { seq_printf(m, "# tracer: %s\n", iter->trace->name); seq_puts(m, "#\n"); test_ftrace_alive(m); } if (iter->snapshot && trace_empty(iter)) print_snapshot_help(m, iter); else if (iter->trace && iter->trace->print_header) iter->trace->print_header(m); else trace_default_header(m); } else if (iter->leftover) { /* * If we filled the seq_file buffer earlier, we * want to just show it now. */ ret = trace_print_seq(m, &iter->seq); /* ret should this time be zero, but you never know */ iter->leftover = ret; } else { ret = print_trace_line(iter); if (ret == TRACE_TYPE_PARTIAL_LINE) { iter->seq.full = 0; trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); } ret = trace_print_seq(m, &iter->seq); /* * If we overflow the seq_file buffer, then it will * ask us for this data again at start up. * Use that instead. * ret is 0 if seq_file write succeeded. * -1 otherwise. */ iter->leftover = ret; } return 0; } /* * Should be used after trace_array_get(), trace_types_lock * ensures that i_cdev was already initialized. */ static inline int tracing_get_cpu(struct inode *inode) { if (inode->i_cdev) /* See trace_create_cpu_file() */ return (long)inode->i_cdev - 1; return RING_BUFFER_ALL_CPUS; } static const struct seq_operations tracer_seq_ops = { .start = s_start, .next = s_next, .stop = s_stop, .show = s_show, }; /* * Note, as iter itself can be allocated and freed in different * ways, this function is only used to free its content, and not * the iterator itself. The only requirement to all the allocations * is that it must zero all fields (kzalloc), as freeing works with * ethier allocated content or NULL. */ static void free_trace_iter_content(struct trace_iterator *iter) { /* The fmt is either NULL, allocated or points to static_fmt_buf */ if (iter->fmt != static_fmt_buf) kfree(iter->fmt); kfree(iter->temp); kfree(iter->buffer_iter); mutex_destroy(&iter->mutex); free_cpumask_var(iter->started); } static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, bool snapshot) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; int cpu; if (tracing_disabled) return ERR_PTR(-ENODEV); iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter)); if (!iter) return ERR_PTR(-ENOMEM); iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter), GFP_KERNEL); if (!iter->buffer_iter) goto release; /* * trace_find_next_entry() may need to save off iter->ent. * It will place it into the iter->temp buffer. As most * events are less than 128, allocate a buffer of that size. * If one is greater, then trace_find_next_entry() will * allocate a new buffer to adjust for the bigger iter->ent. * It's not critical if it fails to get allocated here. 
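
/*
 * Userspace sketch of the encoding used by tracing_get_cpu() above: the
 * per-CPU files stash "cpu + 1" in an otherwise unused pointer-sized
 * field, so the value 0 (a NULL i_cdev) keeps meaning "all CPUs". Names
 * below are invented for the illustration.
 */
#include <stdio.h>

#define DEMO_ALL_CPUS   (-1L)

static void *demo_encode_cpu(long cpu)
{
        return (void *)(cpu + 1);       /* bias by one so 0 stays "not set" */
}

static long demo_decode_cpu(void *cookie)
{
        if (cookie)
                return (long)cookie - 1;
        return DEMO_ALL_CPUS;
}

int main(void)
{
        void *cpu3 = demo_encode_cpu(3);

        printf("decoded=%ld all=%ld\n", demo_decode_cpu(cpu3),
               demo_decode_cpu(NULL));
        return 0;
}
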
*/ iter->temp = kmalloc(128, GFP_KERNEL); if (iter->temp) iter->temp_size = 128; /* * trace_event_printf() may need to modify the given format * string to replace %p with %px so that it shows real address * instead of hash value. However, that is only for the event * tracing, other tracers may not need it. Defer the allocation * until it is needed. */ iter->fmt = NULL; iter->fmt_size = 0; mutex_lock(&trace_types_lock); iter->trace = tr->current_trace; if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; iter->tr = tr; #ifdef CONFIG_TRACER_MAX_TRACE /* Currently only the top directory has a snapshot */ if (tr->current_trace->print_max || snapshot) iter->array_buffer = &tr->max_buffer; else #endif iter->array_buffer = &tr->array_buffer; iter->snapshot = snapshot; iter->pos = -1; iter->cpu_file = tracing_get_cpu(inode); mutex_init(&iter->mutex); /* Notify the tracer early; before we stop tracing. */ if (iter->trace->open) iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ if (ring_buffer_overruns(iter->array_buffer->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; /* * If pause-on-trace is enabled, then stop the trace while * dumping, unless this is the "snapshot" file */ if (!iter->snapshot && (tr->trace_flags & TRACE_ITER_PAUSE_ON_TRACE)) tracing_stop_tr(tr); if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = ring_buffer_read_start(iter->array_buffer->buffer, cpu, GFP_KERNEL); tracing_iter_reset(iter, cpu); } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = ring_buffer_read_start(iter->array_buffer->buffer, cpu, GFP_KERNEL); tracing_iter_reset(iter, cpu); } mutex_unlock(&trace_types_lock); return iter; fail: mutex_unlock(&trace_types_lock); free_trace_iter_content(iter); release: seq_release_private(inode, file); return ERR_PTR(-ENOMEM); } int tracing_open_generic(struct inode *inode, struct file *filp) { int ret; ret = tracing_check_open_get_tr(NULL); if (ret) return ret; filp->private_data = inode->i_private; return 0; } bool tracing_is_disabled(void) { return (tracing_disabled) ? true: false; } /* * Open and update trace_array ref count. * Must have the current trace_array passed to it. */ int tracing_open_generic_tr(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; filp->private_data = inode->i_private; return 0; } /* * The private pointer of the inode is the trace_event_file. * Update the tr ref count associated to it. 
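
/*
 * Userspace sketch of the open/release pairing used by
 * tracing_open_generic_tr() above and tracing_release_generic_tr() below:
 * every successful open takes a reference on the trace array and the
 * matching release drops it, so the instance cannot go away while one of
 * its files is still open. Structure and function names are invented.
 */
#include <stdio.h>

struct demo_tr {
        int ref;
};

static int demo_open(struct demo_tr *tr)
{
        tr->ref++;              /* stands in for trace_array_get() */
        return 0;
}

static void demo_release(struct demo_tr *tr)
{
        tr->ref--;              /* stands in for trace_array_put() */
}

int main(void)
{
        struct demo_tr tr = { .ref = 0 };

        demo_open(&tr);
        demo_open(&tr);
        demo_release(&tr);
        printf("references still held: %d\n", tr.ref);  /* 1 */
        demo_release(&tr);
        return 0;
}
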
*/ int tracing_open_file_tr(struct inode *inode, struct file *filp) { struct trace_event_file *file = inode->i_private; int ret; ret = tracing_check_open_get_tr(file->tr); if (ret) return ret; guard(mutex)(&event_mutex); /* Fail if the file is marked for removal */ if (file->flags & EVENT_FILE_FL_FREED) { trace_array_put(file->tr); return -ENODEV; } else { event_file_get(file); } filp->private_data = inode->i_private; return 0; } int tracing_release_file_tr(struct inode *inode, struct file *filp) { struct trace_event_file *file = inode->i_private; trace_array_put(file->tr); event_file_put(file); return 0; } int tracing_single_release_file_tr(struct inode *inode, struct file *filp) { tracing_release_file_tr(inode, filp); return single_release(inode, filp); } static int tracing_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct seq_file *m = file->private_data; struct trace_iterator *iter; int cpu; if (!(file->f_mode & FMODE_READ)) { trace_array_put(tr); return 0; } /* Writes do not use seq_file */ iter = m->private; mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { if (iter->buffer_iter[cpu]) ring_buffer_read_finish(iter->buffer_iter[cpu]); } if (iter->trace && iter->trace->close) iter->trace->close(iter); if (!iter->snapshot && tr->stop_count) /* reenable tracing if it was previously enabled */ tracing_start_tr(tr); __trace_array_put(tr); mutex_unlock(&trace_types_lock); free_trace_iter_content(iter); seq_release_private(inode, file); return 0; } int tracing_release_generic_tr(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; trace_array_put(tr); return 0; } static int tracing_single_release_tr(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; trace_array_put(tr); return single_release(inode, file); } static int tracing_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { int cpu = tracing_get_cpu(inode); struct array_buffer *trace_buf = &tr->array_buffer; #ifdef CONFIG_TRACER_MAX_TRACE if (tr->current_trace->print_max) trace_buf = &tr->max_buffer; #endif if (cpu == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(trace_buf); else tracing_reset_cpu(trace_buf, cpu); } if (file->f_mode & FMODE_READ) { iter = __tracing_open(inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); else if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; } if (ret < 0) trace_array_put(tr); return ret; } /* * Some tracers are not suitable for instance buffers. * A tracer is always available for the global array (toplevel) * or if it explicitly states that it is. 
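
/*
 * Userspace sketch of the truncate-on-open behaviour implemented by
 * tracing_open() above: opening the "trace" file for writing with
 * O_TRUNC resets the ring buffer, which is exactly what "echo > trace"
 * does from a shell. The path assumes tracefs is mounted in its usual
 * place and that the caller has permission to write there.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/sys/kernel/tracing/trace", O_WRONLY | O_TRUNC);

        if (fd < 0) {
                perror("open"); /* typically needs root and a mounted tracefs */
                return 1;
        }
        close(fd);              /* the buffer was cleared by the open itself */
        return 0;
}
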
*/ static bool trace_ok_for_array(struct tracer *t, struct trace_array *tr) { #ifdef CONFIG_TRACER_SNAPSHOT /* arrays with mapped buffer range do not have snapshots */ if (tr->range_addr_start && t->use_max_tr) return false; #endif return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances; } /* Find the next tracer that this trace array may use */ static struct tracer * get_tracer_for_array(struct trace_array *tr, struct tracer *t) { while (t && !trace_ok_for_array(t, tr)) t = t->next; return t; } static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_array *tr = m->private; struct tracer *t = v; (*pos)++; if (t) t = get_tracer_for_array(tr, t->next); return t; } static void *t_start(struct seq_file *m, loff_t *pos) { struct trace_array *tr = m->private; struct tracer *t; loff_t l = 0; mutex_lock(&trace_types_lock); t = get_tracer_for_array(tr, trace_types); for (; t && l < *pos; t = t_next(m, t, &l)) ; return t; } static void t_stop(struct seq_file *m, void *p) { mutex_unlock(&trace_types_lock); } static int t_show(struct seq_file *m, void *v) { struct tracer *t = v; if (!t) return 0; seq_puts(m, t->name); if (t->next) seq_putc(m, ' '); else seq_putc(m, '\n'); return 0; } static const struct seq_operations show_traces_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, .show = t_show, }; static int show_traces_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct seq_file *m; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; ret = seq_open(file, &show_traces_seq_ops); if (ret) { trace_array_put(tr); return ret; } m = file->private_data; m->private = tr; return 0; } static int tracing_seq_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; trace_array_put(tr); return seq_release(inode, file); } static ssize_t tracing_write_stub(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { return count; } loff_t tracing_lseek(struct file *file, loff_t offset, int whence) { int ret; if (file->f_mode & FMODE_READ) ret = seq_lseek(file, offset, whence); else file->f_pos = ret = 0; return ret; } static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, .read_iter = seq_read_iter, .splice_read = copy_splice_read, .write = tracing_write_stub, .llseek = tracing_lseek, .release = tracing_release, }; static const struct file_operations show_traces_fops = { .open = show_traces_open, .read = seq_read, .llseek = seq_lseek, .release = tracing_seq_release, }; static ssize_t tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct trace_array *tr = file_inode(filp)->i_private; char *mask_str __free(kfree) = NULL; int len; len = snprintf(NULL, 0, "%*pb\n", cpumask_pr_args(tr->tracing_cpumask)) + 1; mask_str = kmalloc(len, GFP_KERNEL); if (!mask_str) return -ENOMEM; len = snprintf(mask_str, len, "%*pb\n", cpumask_pr_args(tr->tracing_cpumask)); if (len >= count) return -EINVAL; return simple_read_from_buffer(ubuf, count, ppos, mask_str, len); } int tracing_set_cpumask(struct trace_array *tr, cpumask_var_t tracing_cpumask_new) { int cpu; if (!tr) return -EINVAL; local_irq_disable(); arch_spin_lock(&tr->max_lock); for_each_tracing_cpu(cpu) { /* * Increase/decrease the disabled counter if we are * about to flip a bit in the cpumask: */ if (cpumask_test_cpu(cpu, tr->tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { ring_buffer_record_disable_cpu(tr->array_buffer.buffer, 
cpu); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu); #endif } if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu); #endif } } arch_spin_unlock(&tr->max_lock); local_irq_enable(); cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); return 0; } static ssize_t tracing_cpumask_write(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { struct trace_array *tr = file_inode(filp)->i_private; cpumask_var_t tracing_cpumask_new; int err; if (count == 0 || count > KMALLOC_MAX_SIZE) return -EINVAL; if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) return -ENOMEM; err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); if (err) goto err_free; err = tracing_set_cpumask(tr, tracing_cpumask_new); if (err) goto err_free; free_cpumask_var(tracing_cpumask_new); return count; err_free: free_cpumask_var(tracing_cpumask_new); return err; } static const struct file_operations tracing_cpumask_fops = { .open = tracing_open_generic_tr, .read = tracing_cpumask_read, .write = tracing_cpumask_write, .release = tracing_release_generic_tr, .llseek = generic_file_llseek, }; static int tracing_trace_options_show(struct seq_file *m, void *v) { struct tracer_opt *trace_opts; struct trace_array *tr = m->private; u32 tracer_flags; int i; guard(mutex)(&trace_types_lock); tracer_flags = tr->current_trace->flags->val; trace_opts = tr->current_trace->flags->opts; for (i = 0; trace_options[i]; i++) { if (tr->trace_flags & (1 << i)) seq_printf(m, "%s\n", trace_options[i]); else seq_printf(m, "no%s\n", trace_options[i]); } for (i = 0; trace_opts[i].name; i++) { if (tracer_flags & trace_opts[i].bit) seq_printf(m, "%s\n", trace_opts[i].name); else seq_printf(m, "no%s\n", trace_opts[i].name); } return 0; } static int __set_tracer_option(struct trace_array *tr, struct tracer_flags *tracer_flags, struct tracer_opt *opts, int neg) { struct tracer *trace = tracer_flags->trace; int ret; ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); if (ret) return ret; if (neg) tracer_flags->val &= ~opts->bit; else tracer_flags->val |= opts->bit; return 0; } /* Try to assign a tracer specific option */ static int set_tracer_option(struct trace_array *tr, char *cmp, int neg) { struct tracer *trace = tr->current_trace; struct tracer_flags *tracer_flags = trace->flags; struct tracer_opt *opts = NULL; int i; for (i = 0; tracer_flags->opts[i].name; i++) { opts = &tracer_flags->opts[i]; if (strcmp(cmp, opts->name) == 0) return __set_tracer_option(tr, trace->flags, opts, neg); } return -EINVAL; } /* Some tracers require overwrite to stay enabled */ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) { if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) return -1; return 0; } int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { if ((mask == TRACE_ITER_RECORD_TGID) || (mask == TRACE_ITER_RECORD_CMD) || (mask == TRACE_ITER_TRACE_PRINTK) || (mask == TRACE_ITER_COPY_MARKER)) lockdep_assert_held(&event_mutex); /* do nothing if flag is already set */ if (!!(tr->trace_flags & mask) == !!enabled) return 0; /* Give the tracer a chance to approve the change */ if (tr->current_trace->flag_changed) if (tr->current_trace->flag_changed(tr, mask, !!enabled)) return -EINVAL; if (mask == TRACE_ITER_TRACE_PRINTK) { if (enabled) { 
update_printk_trace(tr); } else { /* * The global_trace cannot clear this. * Its flag only gets cleared if another instance sets it. */ if (printk_trace == &global_trace) return -EINVAL; /* * An instance must always have it set. * By default, that's the global_trace instance. */ if (printk_trace == tr) update_printk_trace(&global_trace); } } if (mask == TRACE_ITER_COPY_MARKER) update_marker_trace(tr, enabled); if (enabled) tr->trace_flags |= mask; else tr->trace_flags &= ~mask; if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); if (mask == TRACE_ITER_RECORD_TGID) { if (trace_alloc_tgid_map() < 0) { tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; return -ENOMEM; } trace_event_enable_tgid_record(enabled); } if (mask == TRACE_ITER_EVENT_FORK) trace_event_follow_fork(tr, enabled); if (mask == TRACE_ITER_FUNC_FORK) ftrace_pid_follow_fork(tr, enabled); if (mask == TRACE_ITER_OVERWRITE) { ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); #endif } if (mask == TRACE_ITER_PRINTK) { trace_printk_start_stop_comm(enabled); trace_printk_control(enabled); } return 0; } int trace_set_options(struct trace_array *tr, char *option) { char *cmp; int neg = 0; int ret; size_t orig_len = strlen(option); int len; cmp = strstrip(option); len = str_has_prefix(cmp, "no"); if (len) neg = 1; cmp += len; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); ret = match_string(trace_options, -1, cmp); /* If no option could be set, test the specific tracer options */ if (ret < 0) ret = set_tracer_option(tr, cmp, neg); else ret = set_tracer_flag(tr, 1 << ret, !neg); mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); /* * If the first trailing whitespace is replaced with '\0' by strstrip, * turn it back into a space. */ if (orig_len > strlen(option)) option[strlen(option)] = ' '; return ret; } static void __init apply_trace_boot_options(void) { char *buf = trace_boot_options_buf; char *option; while (true) { option = strsep(&buf, ","); if (!option) break; if (*option) trace_set_options(&global_trace, option); /* Put back the comma to allow this to be called again */ if (buf) *(buf - 1) = ','; } } static ssize_t tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; char buf[64]; int ret; if (cnt >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; ret = trace_set_options(tr, buf); if (ret < 0) return ret; *ppos += cnt; return cnt; } static int tracing_trace_options_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; ret = single_open(file, tracing_trace_options_show, inode->i_private); if (ret < 0) trace_array_put(tr); return ret; } static const struct file_operations tracing_iter_fops = { .open = tracing_trace_options_open, .read = seq_read, .llseek = seq_lseek, .release = tracing_single_release_tr, .write = tracing_trace_options_write, }; static const char readme_msg[] = "tracing mini-HOWTO:\n\n" "By default tracefs removes all OTH file permission bits.\n" "When mounting tracefs an optional group id can be specified\n" "which adds the group to every directory and file in tracefs:\n\n" "\t e.g. 
mount -t tracefs [-o [gid=<gid>]] nodev /sys/kernel/tracing\n\n" "# echo 0 > tracing_on : quick way to disable tracing\n" "# echo 1 > tracing_on : quick way to re-enable tracing\n\n" " Important files:\n" " trace\t\t\t- The static contents of the buffer\n" "\t\t\t To clear the buffer write into this file: echo > trace\n" " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" " current_tracer\t- function and latency tracers\n" " available_tracers\t- list of configured tracers for current_tracer\n" " error_log\t- error log for failed commands (that support it)\n" " buffer_size_kb\t- view and modify size of per cpu buffer\n" " buffer_total_size_kb - view total size of all cpu buffers\n\n" " trace_clock\t\t- change the clock used to order events\n" " local: Per cpu clock but may not be synced across CPUs\n" " global: Synced across CPUs but slows tracing down.\n" " counter: Not a clock, but just an increment\n" " uptime: Jiffy counter from time of boot\n" " perf: Same clock that perf events use\n" #ifdef CONFIG_X86_64 " x86-tsc: TSC cycle counter\n" #endif "\n timestamp_mode\t- view the mode used to timestamp events\n" " delta: Delta difference against a buffer-wide timestamp\n" " absolute: Absolute (standalone) timestamp\n" "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n" " tracing_cpumask\t- Limit which CPUs to trace\n" " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" "\t\t\t Remove sub-buffer with rmdir\n" " trace_options\t\t- Set format or modify how tracing happens\n" "\t\t\t Disable an option by prefixing 'no' to the\n" "\t\t\t option name\n" " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" #ifdef CONFIG_DYNAMIC_FTRACE "\n available_filter_functions - list of functions that can be filtered on\n" " set_ftrace_filter\t- echo function name in here to only trace these\n" "\t\t\t functions\n" "\t accepts: func_full_name or glob-matching-pattern\n" "\t modules: Can select a group via module\n" "\t Format: :mod:<module-name>\n" "\t example: echo :mod:ext3 > set_ftrace_filter\n" "\t triggers: a command to perform when function is hit\n" "\t Format: <function>:<trigger>[:count]\n" "\t trigger: traceon, traceoff\n" "\t\t enable_event:<system>:<event>\n" "\t\t disable_event:<system>:<event>\n" #ifdef CONFIG_STACKTRACE "\t\t stacktrace\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT "\t\t snapshot\n" #endif "\t\t dump\n" "\t\t cpudump\n" "\t example: echo do_fault:traceoff > set_ftrace_filter\n" "\t echo do_trap:traceoff:3 > set_ftrace_filter\n" "\t The first one will disable tracing every time do_fault is hit\n" "\t The second will disable tracing at most 3 times when do_trap is hit\n" "\t The first time do trap is hit and it disables tracing, the\n" "\t counter will decrement to 2. If tracing is already disabled,\n" "\t the counter will not decrement. 
It only decrements when the\n" "\t trigger did work\n" "\t To remove trigger without count:\n" "\t echo '!<function>:<trigger> > set_ftrace_filter\n" "\t To remove trigger with a count:\n" "\t echo '!<function>:<trigger>:0 > set_ftrace_filter\n" " set_ftrace_notrace\t- echo function name in here to never trace.\n" "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" "\t modules: Can select a group via module command :mod:\n" "\t Does not accept triggers\n" #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_TRACER " set_ftrace_pid\t- Write pid(s) to only function trace those pids\n" "\t\t (function)\n" " set_ftrace_notrace_pid\t- Write pid(s) to not function trace those pids\n" "\t\t (function)\n" #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" " set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n" " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT "\n snapshot\t\t- Like 'trace' but shows the content of the static\n" "\t\t\t snapshot buffer. Read the contents for more\n" "\t\t\t information\n" #endif #ifdef CONFIG_STACK_TRACER " stack_trace\t\t- Shows the max stack trace when active\n" " stack_max_size\t- Shows current max stack size that was traced\n" "\t\t\t Write into this file to reset the max size (trigger a\n" "\t\t\t new trace)\n" #ifdef CONFIG_DYNAMIC_FTRACE " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace\n" "\t\t\t traces\n" #endif #endif /* CONFIG_STACK_TRACER */ #ifdef CONFIG_DYNAMIC_EVENTS " dynamic_events\t\t- Create/append/remove/show the generic dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #ifdef CONFIG_KPROBE_EVENTS " kprobe_events\t\t- Create/append/remove/show the kernel dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #ifdef CONFIG_UPROBE_EVENTS " uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) || \ defined(CONFIG_FPROBE_EVENTS) "\t accepts: event-definitions (one definition per line)\n" #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) "\t Format: p[:[<group>/][<event>]] <place> [<args>]\n" "\t r[maxactive][:[<group>/][<event>]] <place> [<args>]\n" #endif #ifdef CONFIG_FPROBE_EVENTS "\t f[:[<group>/][<event>]] <func-name>[%return] [<args>]\n" "\t t[:[<group>/][<event>]] <tracepoint> [<args>]\n" #endif #ifdef CONFIG_HIST_TRIGGERS "\t s:[synthetic/]<event> <field> [<field>]\n" #endif "\t e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>] [if <filter>]\n" "\t -:[<group>/][<event>]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" "place (kretprobe): [<module>:]<symbol>[+<offset>]%return|<memaddr>\n" #endif #ifdef CONFIG_UPROBE_EVENTS " place (uprobe): <path>:<offset>[%return][(ref_ctr_offset)]\n" #endif "\t args: <name>=fetcharg[:type]\n" "\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n" #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n" #ifdef CONFIG_PROBE_EVENTS_BTF_ARGS "\t <argname>[->field[->field|.field...]],\n" #endif #else "\t $stack<index>, $stack, $retval, $comm,\n" #endif "\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n" "\t 
kernel return probes support: $retval, $arg<N>, $comm\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n" "\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n" "\t symstr, %pd/%pD, <type>\\[<array-size>\\]\n" #ifdef CONFIG_HIST_TRIGGERS "\t field: <stype> <name>;\n" "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" "\t [unsigned] char/int/long\n" #endif "\t efield: For event probes ('e' types), the field is on of the fields\n" "\t of the <attached-group>/<attached-event>.\n" #endif " set_event\t\t- Enables events by name written into it\n" "\t\t\t Can enable module events via: :mod:<module>\n" " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" " events/<system>/\t- Directory containing all trace events for <system>:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all <system>\n" "\t\t\t events\n" " filter\t\t- If set, only events passing filter are traced\n" " events/<system>/<event>/\t- Directory containing control files for\n" "\t\t\t <event>:\n" " enable\t\t- Write 0/1 to enable/disable tracing of <event>\n" " filter\t\t- If set, only events passing filter are traced\n" " trigger\t\t- If set, a command to perform when event is hit\n" "\t Format: <trigger>[:count][if <filter>]\n" "\t trigger: traceon, traceoff\n" "\t enable_event:<system>:<event>\n" "\t disable_event:<system>:<event>\n" #ifdef CONFIG_HIST_TRIGGERS "\t enable_hist:<system>:<event>\n" "\t disable_hist:<system>:<event>\n" #endif #ifdef CONFIG_STACKTRACE "\t\t stacktrace\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT "\t\t snapshot\n" #endif #ifdef CONFIG_HIST_TRIGGERS "\t\t hist (see below)\n" #endif "\t example: echo traceoff > events/block/block_unplug/trigger\n" "\t echo traceoff:3 > events/block/block_unplug/trigger\n" "\t echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n" "\t events/block/block_unplug/trigger\n" "\t The first disables tracing every time block_unplug is hit.\n" "\t The second disables tracing the first 3 times block_unplug is hit.\n" "\t The third enables the kmalloc event the first 3 times block_unplug\n" "\t is hit and has value of greater than 1 for the 'nr_rq' event field.\n" "\t Like function triggers, the counter is only decremented if it\n" "\t enabled or disabled tracing.\n" "\t To remove a trigger without a count:\n" "\t echo '!<trigger> > <system>/<event>/trigger\n" "\t To remove a trigger with a count:\n" "\t echo '!<trigger>:0 > <system>/<event>/trigger\n" "\t Filters can be ignored when removing a trigger.\n" #ifdef CONFIG_HIST_TRIGGERS " hist trigger\t- If set, event hits are aggregated into a hash table\n" "\t Format: hist:keys=<field1[,field2,...]>\n" "\t [:<var1>=<field|var_ref|numeric_literal>[,<var2>=...]]\n" "\t [:values=<field1[,field2,...]>]\n" "\t [:sort=<field1[,field2,...]>]\n" "\t [:size=#entries]\n" "\t [:pause][:continue][:clear]\n" "\t [:name=histname1]\n" "\t [:nohitcount]\n" "\t [:<handler>.<action>]\n" "\t [if <filter>]\n\n" "\t Note, special fields can be used as well:\n" "\t common_timestamp - to record current timestamp\n" "\t common_cpu - to record the CPU the event happened on\n" "\n" "\t A hist trigger variable can be:\n" "\t - a reference to a field e.g. x=current_timestamp,\n" "\t - a reference to another variable e.g. y=$x,\n" "\t - a numeric literal: e.g. ms_per_sec=1000,\n" "\t - an arithmetic expression: e.g. 
time_secs=current_timestamp/1000\n" "\n" "\t hist trigger arithmetic expressions support addition(+), subtraction(-),\n" "\t multiplication(*) and division(/) operators. An operand can be either a\n" "\t variable reference, field or numeric literal.\n" "\n" "\t When a matching event is hit, an entry is added to a hash\n" "\t table using the key(s) and value(s) named, and the value of a\n" "\t sum called 'hitcount' is incremented. Keys and values\n" "\t correspond to fields in the event's format description. Keys\n" "\t can be any field, or the special string 'common_stacktrace'.\n" "\t Compound keys consisting of up to two fields can be specified\n" "\t by the 'keys' keyword. Values must correspond to numeric\n" "\t fields. Sort keys consisting of up to two fields can be\n" "\t specified using the 'sort' keyword. The sort direction can\n" "\t be modified by appending '.descending' or '.ascending' to a\n" "\t sort field. The 'size' parameter can be used to specify more\n" "\t or fewer than the default 2048 entries for the hashtable size.\n" "\t If a hist trigger is given a name using the 'name' parameter,\n" "\t its histogram data will be shared with other triggers of the\n" "\t same name, and trigger hits will update this common data.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" "\t table in its entirety to stdout. If there are multiple hist\n" "\t triggers attached to an event, there will be a table for each\n" "\t trigger in the output. The table displayed for a named\n" "\t trigger will be the same as any other instance having the\n" "\t same name. The default format used to display a given field\n" "\t can be modified by appending any of the following modifiers\n" "\t to the field name, as applicable:\n\n" "\t .hex display a number as a hex value\n" "\t .sym display an address as a symbol\n" "\t .sym-offset display an address as a symbol and offset\n" "\t .execname display a common_pid as a program name\n" "\t .syscall display a syscall id as a syscall name\n" "\t .log2 display log2 value rather than raw number\n" "\t .buckets=size display values in groups of size rather than raw number\n" "\t .usecs display a common_timestamp in microseconds\n" "\t .percent display a number of percentage value\n" "\t .graph display a bar-graph of a value\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" "\t restart a paused hist trigger.\n\n" "\t The 'clear' parameter will clear the contents of a running\n" "\t hist trigger and leave its current paused/active state\n" "\t unchanged.\n\n" "\t The 'nohitcount' (or NOHC) parameter will suppress display of\n" "\t raw hitcount in the histogram.\n\n" "\t The enable_hist and disable_hist triggers can be used to\n" "\t have one event conditionally start and stop another event's\n" "\t already-attached hist trigger. The syntax is analogous to\n" "\t the enable_event and disable_event triggers.\n\n" "\t Hist trigger handlers and actions are executed whenever a\n" "\t a histogram entry is added or updated. They take the form:\n\n" "\t <handler>.<action>\n\n" "\t The available handlers are:\n\n" "\t onmatch(matching.event) - invoke on addition or update\n" "\t onmax(var) - invoke if var exceeds current max\n" "\t onchange(var) - invoke action if var changes\n\n" "\t The available actions are:\n\n" "\t trace(<synthetic_event>,param list) - generate synthetic event\n" "\t save(field,...) 
- save current event fields\n" #ifdef CONFIG_TRACER_SNAPSHOT "\t snapshot() - snapshot the trace buffer\n\n" #endif #ifdef CONFIG_SYNTH_EVENTS " events/synthetic_events\t- Create/append/remove/show synthetic events\n" "\t Write into this file to define/undefine new synthetic events.\n" "\t example: echo 'myevent u64 lat; char name[]; long[] stack' >> synthetic_events\n" #endif #endif ; static ssize_t tracing_readme_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { return simple_read_from_buffer(ubuf, cnt, ppos, readme_msg, strlen(readme_msg)); } static const struct file_operations tracing_readme_fops = { .open = tracing_open_generic, .read = tracing_readme_read, .llseek = generic_file_llseek, }; #ifdef CONFIG_TRACE_EVAL_MAP_FILE static union trace_eval_map_item * update_eval_map(union trace_eval_map_item *ptr) { if (!ptr->map.eval_string) { if (ptr->tail.next) { ptr = ptr->tail.next; /* Set ptr to the next real item (skip head) */ ptr++; } else return NULL; } return ptr; } static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos) { union trace_eval_map_item *ptr = v; /* * Paranoid! If ptr points to end, we don't want to increment past it. * This really should never happen. */ (*pos)++; ptr = update_eval_map(ptr); if (WARN_ON_ONCE(!ptr)) return NULL; ptr++; ptr = update_eval_map(ptr); return ptr; } static void *eval_map_start(struct seq_file *m, loff_t *pos) { union trace_eval_map_item *v; loff_t l = 0; mutex_lock(&trace_eval_mutex); v = trace_eval_maps; if (v) v++; while (v && l < *pos) { v = eval_map_next(m, v, &l); } return v; } static void eval_map_stop(struct seq_file *m, void *v) { mutex_unlock(&trace_eval_mutex); } static int eval_map_show(struct seq_file *m, void *v) { union trace_eval_map_item *ptr = v; seq_printf(m, "%s %ld (%s)\n", ptr->map.eval_string, ptr->map.eval_value, ptr->map.system); return 0; } static const struct seq_operations tracing_eval_map_seq_ops = { .start = eval_map_start, .next = eval_map_next, .stop = eval_map_stop, .show = eval_map_show, }; static int tracing_eval_map_open(struct inode *inode, struct file *filp) { int ret; ret = tracing_check_open_get_tr(NULL); if (ret) return ret; return seq_open(filp, &tracing_eval_map_seq_ops); } static const struct file_operations tracing_eval_map_fops = { .open = tracing_eval_map_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static inline union trace_eval_map_item * trace_eval_jmp_to_tail(union trace_eval_map_item *ptr) { /* Return tail of array given the head */ return ptr + ptr->head.length + 1; } static void trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { struct trace_eval_map **stop; struct trace_eval_map **map; union trace_eval_map_item *map_array; union trace_eval_map_item *ptr; stop = start + len; /* * The trace_eval_maps contains the map plus a head and tail item, * where the head holds the module and length of array, and the * tail holds a pointer to the next list. 
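 *
 * Illustrative sketch of the block allocated below (len + 2 items; the
 * field names come from the code that follows, the layout itself is
 * only a reading aid):
 *
 *   map_array[0]          head:  { .mod, .length = len }
 *   map_array[1 .. len]   map:   one trace_eval_map copied from *map each
 *   map_array[len + 1]    tail:  zeroed; its .next links the next block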
*/ map_array = kmalloc_array(len + 2, sizeof(*map_array), GFP_KERNEL); if (!map_array) { pr_warn("Unable to allocate trace eval mapping\n"); return; } guard(mutex)(&trace_eval_mutex); if (!trace_eval_maps) trace_eval_maps = map_array; else { ptr = trace_eval_maps; for (;;) { ptr = trace_eval_jmp_to_tail(ptr); if (!ptr->tail.next) break; ptr = ptr->tail.next; } ptr->tail.next = map_array; } map_array->head.mod = mod; map_array->head.length = len; map_array++; for (map = start; (unsigned long)map < (unsigned long)stop; map++) { map_array->map = **map; map_array++; } memset(map_array, 0, sizeof(*map_array)); } static void trace_create_eval_file(struct dentry *d_tracer) { trace_create_file("eval_map", TRACE_MODE_READ, d_tracer, NULL, &tracing_eval_map_fops); } #else /* CONFIG_TRACE_EVAL_MAP_FILE */ static inline void trace_create_eval_file(struct dentry *d_tracer) { } static inline void trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { } #endif /* !CONFIG_TRACE_EVAL_MAP_FILE */ static void trace_event_update_with_eval_map(struct module *mod, struct trace_eval_map **start, int len) { struct trace_eval_map **map; /* Always run sanitizer only if btf_type_tag attr exists. */ if (len <= 0) { if (!(IS_ENABLED(CONFIG_DEBUG_INFO_BTF) && IS_ENABLED(CONFIG_PAHOLE_HAS_BTF_TAG) && __has_attribute(btf_type_tag))) return; } map = start; trace_event_update_all(map, len); if (len <= 0) return; trace_insert_eval_map_file(mod, start, len); } static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+2]; int r; scoped_guard(mutex, &trace_types_lock) { r = sprintf(buf, "%s\n", tr->current_trace->name); } return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } int tracer_init(struct tracer *t, struct trace_array *tr) { tracing_reset_online_cpus(&tr->array_buffer); return t->init(tr); } static void set_buffer_entries(struct array_buffer *buf, unsigned long val) { int cpu; for_each_tracing_cpu(cpu) per_cpu_ptr(buf->data, cpu)->entries = val; } static void update_buffer_entries(struct array_buffer *buf, int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0)); } else { per_cpu_ptr(buf->data, cpu)->entries = ring_buffer_size(buf->buffer, cpu); } } #ifdef CONFIG_TRACER_MAX_TRACE /* resize @tr's buffer to the size of @size_tr's entries */ static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, struct array_buffer *size_buf, int cpu_id) { int cpu, ret = 0; if (cpu_id == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { ret = ring_buffer_resize(trace_buf->buffer, per_cpu_ptr(size_buf->data, cpu)->entries, cpu); if (ret < 0) break; per_cpu_ptr(trace_buf->data, cpu)->entries = per_cpu_ptr(size_buf->data, cpu)->entries; } } else { ret = ring_buffer_resize(trace_buf->buffer, per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id); if (ret == 0) per_cpu_ptr(trace_buf->data, cpu_id)->entries = per_cpu_ptr(size_buf->data, cpu_id)->entries; } return ret; } #endif /* CONFIG_TRACER_MAX_TRACE */ static int __tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu) { int ret; /* * If kernel or user changes the size of the ring buffer * we use the size that was given, and we can forget about * expanding it later. 
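 * Once this flag is set (or the buffer is explicitly resized), later
 * callers such as tracing_update_buffers() skip the boot-time
 * "expand on first use" resize and leave the chosen size alone.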
*/ trace_set_ring_buffer_expanded(tr); /* May be called before buffers are initialized */ if (!tr->array_buffer.buffer) return 0; /* Do not allow tracing while resizing ring buffer */ tracing_stop_tr(tr); ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu); if (ret < 0) goto out_start; #ifdef CONFIG_TRACER_MAX_TRACE if (!tr->allocated_snapshot) goto out; ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); if (ret < 0) { int r = resize_buffer_duplicate_size(&tr->array_buffer, &tr->array_buffer, cpu); if (r < 0) { /* * AARGH! We are left with different * size max buffer!!!! * The max buffer is our "snapshot" buffer. * When a tracer needs a snapshot (one of the * latency tracers), it swaps the max buffer * with the saved snapshot. We succeeded in * updating the size of the main buffer, but failed to * update the size of the max buffer. But when we tried * to reset the main buffer to the original size, we * failed there too. This is very unlikely to * happen, but if it does, warn and kill all * tracing. */ WARN_ON(1); tracing_disabled = 1; } goto out_start; } update_buffer_entries(&tr->max_buffer, cpu); out: #endif /* CONFIG_TRACER_MAX_TRACE */ update_buffer_entries(&tr->array_buffer, cpu); out_start: tracing_start_tr(tr); return ret; } ssize_t tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu_id) { guard(mutex)(&trace_types_lock); if (cpu_id != RING_BUFFER_ALL_CPUS) { /* make sure this cpu is enabled in the mask */ if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) return -EINVAL; } return __tracing_resize_ring_buffer(tr, size, cpu_id); } struct trace_mod_entry { unsigned long mod_addr; char mod_name[MODULE_NAME_LEN]; }; struct trace_scratch { unsigned int clock_id; unsigned long text_addr; unsigned long nr_entries; struct trace_mod_entry entries[]; }; static DEFINE_MUTEX(scratch_mutex); static int cmp_mod_entry(const void *key, const void *pivot) { unsigned long addr = (unsigned long)key; const struct trace_mod_entry *ent = pivot; if (addr >= ent[0].mod_addr && addr < ent[1].mod_addr) return 0; else return addr - ent->mod_addr; } /** * trace_adjust_address() - Adjust prev boot address to current address. * @tr: Persistent ring buffer's trace_array. * @addr: Address in @tr which is adjusted. */ unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr) { struct trace_module_delta *module_delta; struct trace_scratch *tscratch; struct trace_mod_entry *entry; unsigned long raddr; int idx = 0, nr_entries; /* If we don't have last boot delta, return the address */ if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) return addr; /* tr->module_delta must be protected by rcu. */ guard(rcu)(); tscratch = tr->scratch; /* if there is no tscratch, module_delta must be NULL. */ module_delta = READ_ONCE(tr->module_delta); if (!module_delta || !tscratch->nr_entries || tscratch->entries[0].mod_addr > addr) { raddr = addr + tr->text_delta; return __is_kernel(raddr) || is_kernel_core_data(raddr) || is_kernel_rodata(raddr) ? raddr : addr; } /* Note that entries must be sorted. 
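 * cmp_mod_entry() above treats each neighbouring pair of entries as the
 * half-open range [ent[0].mod_addr, ent[1].mod_addr), so the bsearch
 * resolves an address to the module whose base is nearest below it.
 * Purely illustrative example: with mod_addr values { 0x1000, 0x5000,
 * 0x9000 }, an addr of 0x6200 matches index 1 and is shifted by
 * module_delta->delta[1].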
*/ nr_entries = tscratch->nr_entries; if (nr_entries == 1 || tscratch->entries[nr_entries - 1].mod_addr < addr) idx = nr_entries - 1; else { entry = __inline_bsearch((void *)addr, tscratch->entries, nr_entries - 1, sizeof(tscratch->entries[0]), cmp_mod_entry); if (entry) idx = entry - tscratch->entries; } return addr + module_delta->delta[idx]; } #ifdef CONFIG_MODULES static int save_mod(struct module *mod, void *data) { struct trace_array *tr = data; struct trace_scratch *tscratch; struct trace_mod_entry *entry; unsigned int size; tscratch = tr->scratch; if (!tscratch) return -1; size = tr->scratch_size; if (struct_size(tscratch, entries, tscratch->nr_entries + 1) > size) return -1; entry = &tscratch->entries[tscratch->nr_entries]; tscratch->nr_entries++; entry->mod_addr = (unsigned long)mod->mem[MOD_TEXT].base; strscpy(entry->mod_name, mod->name); return 0; } #else static int save_mod(struct module *mod, void *data) { return 0; } #endif static void update_last_data(struct trace_array *tr) { struct trace_module_delta *module_delta; struct trace_scratch *tscratch; if (!(tr->flags & TRACE_ARRAY_FL_BOOT)) return; if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) return; /* Only if the buffer has previous boot data clear and update it. */ tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT; /* Reset the module list and reload them */ if (tr->scratch) { struct trace_scratch *tscratch = tr->scratch; tscratch->clock_id = tr->clock_id; memset(tscratch->entries, 0, flex_array_size(tscratch, entries, tscratch->nr_entries)); tscratch->nr_entries = 0; guard(mutex)(&scratch_mutex); module_for_each_mod(save_mod, tr); } /* * Need to clear all CPU buffers as there cannot be events * from the previous boot mixed with events with this boot * as that will cause a confusing trace. Need to clear all * CPU buffers, even for those that may currently be offline. */ tracing_reset_all_cpus(&tr->array_buffer); /* Using current data now */ tr->text_delta = 0; if (!tr->scratch) return; tscratch = tr->scratch; module_delta = READ_ONCE(tr->module_delta); WRITE_ONCE(tr->module_delta, NULL); kfree_rcu(module_delta, rcu); /* Set the persistent ring buffer meta data to this address */ tscratch->text_addr = (unsigned long)_text; } /** * tracing_update_buffers - used by tracing facility to expand ring buffers * @tr: The tracing instance * * To save on memory when the tracing is never used on a system with it * configured in. The ring buffers are set to a minimum size. But once * a user starts to use the tracing facility, then they need to grow * to their default size. * * This function is to be called when a tracer is about to be used. */ int tracing_update_buffers(struct trace_array *tr) { int ret = 0; guard(mutex)(&trace_types_lock); update_last_data(tr); if (!tr->ring_buffer_expanded) ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); return ret; } struct trace_option_dentry; static void create_trace_option_files(struct trace_array *tr, struct tracer *tracer); /* * Used to clear out the tracer before deletion of an instance. * Must have trace_types_lock held. */ static void tracing_set_nop(struct trace_array *tr) { if (tr->current_trace == &nop_trace) return; tr->current_trace->enabled--; if (tr->current_trace->reset) tr->current_trace->reset(tr); tr->current_trace = &nop_trace; } static bool tracer_options_updated; static void add_tracer_options(struct trace_array *tr, struct tracer *t) { /* Only enable if the directory has been created already. 
*/ if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL)) return; /* Only create trace option files after update_tracer_options finish */ if (!tracer_options_updated) return; create_trace_option_files(tr, t); } int tracing_set_tracer(struct trace_array *tr, const char *buf) { struct tracer *t; #ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; #endif int ret; guard(mutex)(&trace_types_lock); update_last_data(tr); if (!tr->ring_buffer_expanded) { ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); if (ret < 0) return ret; ret = 0; } for (t = trace_types; t; t = t->next) { if (strcmp(t->name, buf) == 0) break; } if (!t) return -EINVAL; if (t == tr->current_trace) return 0; #ifdef CONFIG_TRACER_SNAPSHOT if (t->use_max_tr) { local_irq_disable(); arch_spin_lock(&tr->max_lock); ret = tr->cond_snapshot ? -EBUSY : 0; arch_spin_unlock(&tr->max_lock); local_irq_enable(); if (ret) return ret; } #endif /* Some tracers won't work on kernel command line */ if (system_state < SYSTEM_RUNNING && t->noboot) { pr_warn("Tracer '%s' is not allowed on command line, ignored\n", t->name); return -EINVAL; } /* Some tracers are only allowed for the top level buffer */ if (!trace_ok_for_array(t, tr)) return -EINVAL; /* If trace pipe files are being read, we can't change the tracer */ if (tr->trace_ref) return -EBUSY; trace_branch_disable(); tr->current_trace->enabled--; if (tr->current_trace->reset) tr->current_trace->reset(tr); #ifdef CONFIG_TRACER_MAX_TRACE had_max_tr = tr->current_trace->use_max_tr; /* Current trace needs to be nop_trace before synchronize_rcu */ tr->current_trace = &nop_trace; if (had_max_tr && !t->use_max_tr) { /* * We need to make sure that the update_max_tr sees that * current_trace changed to nop_trace to keep it from * swapping the buffers after we resize it. * The update_max_tr is called from interrupts disabled * so a synchronized_sched() is sufficient. */ synchronize_rcu(); free_snapshot(tr); tracing_disarm_snapshot(tr); } if (!had_max_tr && t->use_max_tr) { ret = tracing_arm_snapshot_locked(tr); if (ret) return ret; } #else tr->current_trace = &nop_trace; #endif if (t->init) { ret = tracer_init(t, tr); if (ret) { #ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr) tracing_disarm_snapshot(tr); #endif return ret; } } tr->current_trace = t; tr->current_trace->enabled++; trace_branch_enable(tr); return 0; } static ssize_t tracing_set_trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+1]; char *name; size_t ret; int err; ret = cnt; if (cnt > MAX_TRACER_SIZE) cnt = MAX_TRACER_SIZE; if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; name = strim(buf); err = tracing_set_tracer(tr, name); if (err) return err; *ppos += ret; return ret; } static ssize_t tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; int r; r = snprintf(buf, sizeof(buf), "%ld\n", *ptr == (unsigned long)-1 ? 
-1 : nsecs_to_usecs(*ptr)); if (r > sizeof(buf)) r = sizeof(buf); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; *ptr = val * 1000; return cnt; } static ssize_t tracing_thresh_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos); } static ssize_t tracing_thresh_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; int ret; guard(mutex)(&trace_types_lock); ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos); if (ret < 0) return ret; if (tr->current_trace->update_thresh) { ret = tr->current_trace->update_thresh(tr); if (ret < 0) return ret; } return cnt; } #ifdef CONFIG_TRACER_MAX_TRACE static ssize_t tracing_max_lat_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos); } static ssize_t tracing_max_lat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos); } #endif static int open_pipe_on_cpu(struct trace_array *tr, int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { if (cpumask_empty(tr->pipe_cpumask)) { cpumask_setall(tr->pipe_cpumask); return 0; } } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) { cpumask_set_cpu(cpu, tr->pipe_cpumask); return 0; } return -EBUSY; } static void close_pipe_on_cpu(struct trace_array *tr, int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { WARN_ON(!cpumask_full(tr->pipe_cpumask)); cpumask_clear(tr->pipe_cpumask); } else { WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask)); cpumask_clear_cpu(cpu, tr->pipe_cpumask); } } static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; int cpu; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; guard(mutex)(&trace_types_lock); cpu = tracing_get_cpu(inode); ret = open_pipe_on_cpu(tr, cpu); if (ret) goto fail_pipe_on_cpu; /* create a buffer to store the information to pass to userspace */ iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { ret = -ENOMEM; goto fail_alloc_iter; } trace_seq_init(&iter->seq); iter->trace = tr->current_trace; if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { ret = -ENOMEM; goto fail; } /* trace pipe does not show start of buffer */ cpumask_setall(iter->started); if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; /* Output in nanoseconds only if we are using a clock in nanoseconds. 
*/ if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; iter->tr = tr; iter->array_buffer = &tr->array_buffer; iter->cpu_file = cpu; mutex_init(&iter->mutex); filp->private_data = iter; if (iter->trace->pipe_open) iter->trace->pipe_open(iter); nonseekable_open(inode, filp); tr->trace_ref++; return ret; fail: kfree(iter); fail_alloc_iter: close_pipe_on_cpu(tr, cpu); fail_pipe_on_cpu: __trace_array_put(tr); return ret; } static int tracing_release_pipe(struct inode *inode, struct file *file) { struct trace_iterator *iter = file->private_data; struct trace_array *tr = inode->i_private; scoped_guard(mutex, &trace_types_lock) { tr->trace_ref--; if (iter->trace->pipe_close) iter->trace->pipe_close(iter); close_pipe_on_cpu(tr, iter->cpu_file); } free_trace_iter_content(iter); kfree(iter); trace_array_put(tr); return 0; } static __poll_t trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) { struct trace_array *tr = iter->tr; /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return EPOLLIN | EPOLLRDNORM; if (tr->trace_flags & TRACE_ITER_BLOCK) /* * Always select as readable when in blocking mode */ return EPOLLIN | EPOLLRDNORM; else return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file, filp, poll_table, iter->tr->buffer_percent); } static __poll_t tracing_poll_pipe(struct file *filp, poll_table *poll_table) { struct trace_iterator *iter = filp->private_data; return trace_poll(iter, filp, poll_table); } /* Must be called with iter->mutex held. */ static int tracing_wait_pipe(struct file *filp) { struct trace_iterator *iter = filp->private_data; int ret; while (trace_empty(iter)) { if ((filp->f_flags & O_NONBLOCK)) { return -EAGAIN; } /* * We block until we read something and tracing is disabled. * We still block if tracing is disabled, but we have never * read anything. This allows a user to cat this file, and * then enable tracing. But after we have read something, * we give an EOF when tracing is again disabled. * * iter->pos will be 0 if we haven't read anything. */ if (!tracer_tracing_is_on(iter->tr) && iter->pos) break; mutex_unlock(&iter->mutex); ret = wait_on_pipe(iter, 0); mutex_lock(&iter->mutex); if (ret) return ret; } return 1; } static bool update_last_data_if_empty(struct trace_array *tr) { if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) return false; if (!ring_buffer_empty(tr->array_buffer.buffer)) return false; /* * If the buffer contains the last boot data and all per-cpu * buffers are empty, reset it from the kernel side. */ update_last_data(tr); return true; } /* * Consumer reader. */ static ssize_t tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; ssize_t sret; /* * Avoid more than one consumer on a single file descriptor * This is just a matter of traces coherency, the ring buffer itself * is protected. 
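 * Readers sharing this struct file serialize on iter->mutex below;
 * separate opens are already limited by open_pipe_on_cpu() setting the
 * per-CPU bit in tr->pipe_cpumask, so a second open of the same CPU (or
 * of the all-CPU pipe) fails with -EBUSY.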
*/ guard(mutex)(&iter->mutex); /* return any leftover data */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (sret != -EBUSY) return sret; trace_seq_init(&iter->seq); if (iter->trace->read) { sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); if (sret) return sret; } waitagain: if (update_last_data_if_empty(iter->tr)) return 0; sret = tracing_wait_pipe(filp); if (sret <= 0) return sret; /* stop when tracing is finished */ if (trace_empty(iter)) return 0; if (cnt >= TRACE_SEQ_BUFFER_SIZE) cnt = TRACE_SEQ_BUFFER_SIZE - 1; /* reset all but tr, trace, and overruns */ trace_iterator_reset(iter); cpumask_clear(iter->started); trace_seq_init(&iter->seq); trace_event_read_lock(); trace_access_lock(iter->cpu_file); while (trace_find_next_entry_inc(iter) != NULL) { enum print_line_t ret; int save_len = iter->seq.seq.len; ret = print_trace_line(iter); if (ret == TRACE_TYPE_PARTIAL_LINE) { /* * If one print_trace_line() fills entire trace_seq in one shot, * trace_seq_to_user() will returns -EBUSY because save_len == 0, * In this case, we need to consume it, otherwise, loop will peek * this event next time, resulting in an infinite loop. */ if (save_len == 0) { iter->seq.full = 0; trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); trace_consume(iter); break; } /* In other cases, don't print partial lines */ iter->seq.seq.len = save_len; break; } if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(iter); if (trace_seq_used(&iter->seq) >= cnt) break; /* * Setting the full flag means we reached the trace_seq buffer * size and we should leave by partial output condition above. * One of the trace_seq_* functions is not used properly. */ WARN_ONCE(iter->seq.full, "full flag set for trace type %d", iter->ent->type); } trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (iter->seq.readpos >= trace_seq_used(&iter->seq)) trace_seq_init(&iter->seq); /* * If there was nothing to send to user, in spite of consuming trace * entries, go back to wait for more entries. */ if (sret == -EBUSY) goto waitagain; return sret; } static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, unsigned int idx) { __free_page(spd->pages[idx]); } static size_t tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) { size_t count; int save_len; int ret; /* Seq buffer is page-sized, exactly what we need. */ for (;;) { save_len = iter->seq.seq.len; ret = print_trace_line(iter); if (trace_seq_has_overflowed(&iter->seq)) { iter->seq.seq.len = save_len; break; } /* * This should not be hit, because it should only * be set if the iter->seq overflowed. But check it * anyway to be safe. */ if (ret == TRACE_TYPE_PARTIAL_LINE) { iter->seq.seq.len = save_len; break; } count = trace_seq_used(&iter->seq) - save_len; if (rem < count) { rem = 0; iter->seq.seq.len = save_len; break; } if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(iter); rem -= count; if (!trace_find_next_entry_inc(iter)) { rem = 0; iter->ent = NULL; break; } } return rem; } static ssize_t tracing_splice_read_pipe(struct file *filp, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct page *pages_def[PIPE_DEF_BUFFERS]; struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct trace_iterator *iter = filp->private_data; struct splice_pipe_desc spd = { .pages = pages_def, .partial = partial_def, .nr_pages = 0, /* This gets updated below. 
*/ .nr_pages_max = PIPE_DEF_BUFFERS, .ops = &default_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; ssize_t ret; size_t rem; unsigned int i; if (splice_grow_spd(pipe, &spd)) return -ENOMEM; mutex_lock(&iter->mutex); if (iter->trace->splice_read) { ret = iter->trace->splice_read(iter, filp, ppos, pipe, len, flags); if (ret) goto out_err; } ret = tracing_wait_pipe(filp); if (ret <= 0) goto out_err; if (!iter->ent && !trace_find_next_entry_inc(iter)) { ret = -EFAULT; goto out_err; } trace_event_read_lock(); trace_access_lock(iter->cpu_file); /* Fill as many pages as possible. */ for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) { spd.pages[i] = alloc_page(GFP_KERNEL); if (!spd.pages[i]) break; rem = tracing_fill_pipe_page(rem, iter); /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), min((size_t)trace_seq_used(&iter->seq), (size_t)PAGE_SIZE)); if (ret < 0) { __free_page(spd.pages[i]); break; } spd.partial[i].offset = 0; spd.partial[i].len = ret; trace_seq_init(&iter->seq); } trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); mutex_unlock(&iter->mutex); spd.nr_pages = i; if (i) ret = splice_to_pipe(pipe, &spd); else ret = 0; out: splice_shrink_spd(&spd); return ret; out_err: mutex_unlock(&iter->mutex); goto out; } static ssize_t tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct inode *inode = file_inode(filp); struct trace_array *tr = inode->i_private; int cpu = tracing_get_cpu(inode); char buf[64]; int r = 0; ssize_t ret; mutex_lock(&trace_types_lock); if (cpu == RING_BUFFER_ALL_CPUS) { int cpu, buf_size_same; unsigned long size; size = 0; buf_size_same = 1; /* check if all cpu sizes are same */ for_each_tracing_cpu(cpu) { /* fill in the size from first enabled cpu */ if (size == 0) size = per_cpu_ptr(tr->array_buffer.data, cpu)->entries; if (size != per_cpu_ptr(tr->array_buffer.data, cpu)->entries) { buf_size_same = 0; break; } } if (buf_size_same) { if (!tr->ring_buffer_expanded) r = sprintf(buf, "%lu (expanded: %lu)\n", size >> 10, trace_buf_size >> 10); else r = sprintf(buf, "%lu\n", size >> 10); } else r = sprintf(buf, "X\n"); } else r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10); mutex_unlock(&trace_types_lock); ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); return ret; } static ssize_t tracing_entries_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct inode *inode = file_inode(filp); struct trace_array *tr = inode->i_private; unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; /* must have at least 1 entry */ if (!val) return -EINVAL; /* value is in KB */ val <<= 10; ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode)); if (ret < 0) return ret; *ppos += cnt; return cnt; } static ssize_t tracing_total_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; char buf[64]; int r, cpu; unsigned long size = 0, expanded_size = 0; mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { size += per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10; if (!tr->ring_buffer_expanded) expanded_size += trace_buf_size >> 10; } if (tr->ring_buffer_expanded) r = sprintf(buf, "%lu\n", size); else r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size); mutex_unlock(&trace_types_lock); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } #define 
LAST_BOOT_HEADER ((void *)1) static void *l_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_array *tr = m->private; struct trace_scratch *tscratch = tr->scratch; unsigned int index = *pos; (*pos)++; if (*pos == 1) return LAST_BOOT_HEADER; /* Only show offsets of the last boot data */ if (!tscratch || !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) return NULL; /* *pos 0 is for the header, 1 is for the first module */ index--; if (index >= tscratch->nr_entries) return NULL; return &tscratch->entries[index]; } static void *l_start(struct seq_file *m, loff_t *pos) { mutex_lock(&scratch_mutex); return l_next(m, NULL, pos); } static void l_stop(struct seq_file *m, void *p) { mutex_unlock(&scratch_mutex); } static void show_last_boot_header(struct seq_file *m, struct trace_array *tr) { struct trace_scratch *tscratch = tr->scratch; /* * Do not leak KASLR address. This only shows the KASLR address of * the last boot. When the ring buffer is started, the LAST_BOOT * flag gets cleared, and this should only report "current". * Otherwise it shows the KASLR address from the previous boot which * should not be the same as the current boot. */ if (tscratch && (tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) seq_printf(m, "%lx\t[kernel]\n", tscratch->text_addr); else seq_puts(m, "# Current\n"); } static int l_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; struct trace_mod_entry *entry = v; if (v == LAST_BOOT_HEADER) { show_last_boot_header(m, tr); return 0; } seq_printf(m, "%lx\t%s\n", entry->mod_addr, entry->mod_name); return 0; } static const struct seq_operations last_boot_seq_ops = { .start = l_start, .next = l_next, .stop = l_stop, .show = l_show, }; static int tracing_last_boot_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct seq_file *m; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; ret = seq_open(file, &last_boot_seq_ops); if (ret) { trace_array_put(tr); return ret; } m = file->private_data; m->private = tr; return 0; } static int tracing_buffer_meta_open(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; int cpu = tracing_get_cpu(inode); int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; ret = ring_buffer_meta_seq_init(filp, tr->array_buffer.buffer, cpu); if (ret < 0) __trace_array_put(tr); return ret; } static ssize_t tracing_free_buffer_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { /* * There is no need to read what the user has written, this function * is just to make sure that there is no error when "echo" is used */ *ppos += cnt; return cnt; } static int tracing_free_buffer_release(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; /* disable tracing ? 
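 * Releasing this file first honours the stop-on-free option below and
 * then shrinks every per-CPU buffer to zero. For example, assuming the
 * file is created as "free_buffer" in an instance directory (the name is
 * not defined here), "echo > free_buffer" releases the trace memory once
 * a capture is finished.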
*/ if (tr->trace_flags & TRACE_ITER_STOP_ON_FREE) tracer_tracing_off(tr); /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); trace_array_put(tr); return 0; } #define TRACE_MARKER_MAX_SIZE 4096 static ssize_t write_marker_to_buffer(struct trace_array *tr, const char *buf, size_t cnt, unsigned long ip) { struct ring_buffer_event *event; enum event_trigger_type tt = ETT_NONE; struct trace_buffer *buffer; struct print_entry *entry; int meta_size; ssize_t written; size_t size; meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */ again: size = cnt + meta_size; buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, tracing_gen_ctx()); if (unlikely(!event)) { /* * If the size was greater than what was allowed, then * make it smaller and try again. */ if (size > ring_buffer_max_event_size(buffer)) { cnt = ring_buffer_max_event_size(buffer) - meta_size; /* The above should only happen once */ if (WARN_ON_ONCE(cnt + meta_size == size)) return -EBADF; goto again; } /* Ring buffer disabled, return as if not open for write */ return -EBADF; } entry = ring_buffer_event_data(event); entry->ip = ip; memcpy(&entry->buf, buf, cnt); written = cnt; if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) { /* do not add \n before testing triggers, but add \0 */ entry->buf[cnt] = '\0'; tt = event_triggers_call(tr->trace_marker_file, buffer, entry, event); } if (entry->buf[cnt - 1] != '\n') { entry->buf[cnt] = '\n'; entry->buf[cnt + 1] = '\0'; } else entry->buf[cnt] = '\0'; if (static_branch_unlikely(&trace_marker_exports_enabled)) ftrace_exports(event, TRACE_EXPORT_MARKER); __buffer_unlock_commit(buffer, event); if (tt) event_triggers_post_call(tr->trace_marker_file, tt); return written; } struct trace_user_buf { char *buf; }; struct trace_user_buf_info { struct trace_user_buf __percpu *tbuf; int ref; }; static DEFINE_MUTEX(trace_user_buffer_mutex); static struct trace_user_buf_info *trace_user_buffer; static void trace_user_fault_buffer_free(struct trace_user_buf_info *tinfo) { char *buf; int cpu; for_each_possible_cpu(cpu) { buf = per_cpu_ptr(tinfo->tbuf, cpu)->buf; kfree(buf); } free_percpu(tinfo->tbuf); kfree(tinfo); } static int trace_user_fault_buffer_enable(void) { struct trace_user_buf_info *tinfo; char *buf; int cpu; guard(mutex)(&trace_user_buffer_mutex); if (trace_user_buffer) { trace_user_buffer->ref++; return 0; } tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); if (!tinfo) return -ENOMEM; tinfo->tbuf = alloc_percpu(struct trace_user_buf); if (!tinfo->tbuf) { kfree(tinfo); return -ENOMEM; } tinfo->ref = 1; /* Clear each buffer in case of error */ for_each_possible_cpu(cpu) { per_cpu_ptr(tinfo->tbuf, cpu)->buf = NULL; } for_each_possible_cpu(cpu) { buf = kmalloc_node(TRACE_MARKER_MAX_SIZE, GFP_KERNEL, cpu_to_node(cpu)); if (!buf) { trace_user_fault_buffer_free(tinfo); return -ENOMEM; } per_cpu_ptr(tinfo->tbuf, cpu)->buf = buf; } trace_user_buffer = tinfo; return 0; } static void trace_user_fault_buffer_disable(void) { struct trace_user_buf_info *tinfo; guard(mutex)(&trace_user_buffer_mutex); tinfo = trace_user_buffer; if (WARN_ON_ONCE(!tinfo)) return; if (--tinfo->ref) return; trace_user_fault_buffer_free(tinfo); trace_user_buffer = NULL; } /* Must be called with preemption disabled */ static char *trace_user_fault_read(struct trace_user_buf_info *tinfo, const char __user *ptr, size_t size, size_t *read_size) { int cpu = smp_processor_id(); char *buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf; 
unsigned int cnt; int trys = 0; int ret; if (size > TRACE_MARKER_MAX_SIZE) size = TRACE_MARKER_MAX_SIZE; *read_size = 0; /* * This acts similarly to a seqcount. The per CPU context switches are * recorded, migration is disabled and preemption is enabled. The * read of the user space memory is copied into the per CPU buffer. * Preemption is disabled again, and if the per CPU context switch count * is still the same, it means the buffer has not been corrupted. * If the count is different, it is assumed the buffer is corrupted * and reading must be tried again. */ do { /* * If, for some reason, copy_from_user() always causes a context * switch, this would then cause an infinite loop. * If this task is preempted by another user space task, it * will cause this task to try again. But just in case something * changes where the copying from user space causes another task * to run, prevent this from going into an infinite loop. * 100 tries should be plenty. */ if (WARN_ONCE(trys++ > 100, "Error: Too many tries to read user space")) return NULL; /* Read the current CPU context switch counter */ cnt = nr_context_switches_cpu(cpu); /* * Preemption is going to be enabled, but this task must * remain on this CPU. */ migrate_disable(); /* * Now preemption is being enabled and another task can come in * and use the same buffer and corrupt our data. */ preempt_enable_notrace(); ret = __copy_from_user(buffer, ptr, size); preempt_disable_notrace(); migrate_enable(); /* if it faulted, no need to test if the buffer was corrupted */ if (ret) return NULL; /* * Preemption is disabled again, now check the per CPU context * switch counter. If it doesn't match, then another user space * process may have scheduled in and corrupted our buffer. In that * case the copying must be retried. */ } while (nr_context_switches_cpu(cpu) != cnt); *read_size = size; return buffer; } static ssize_t tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { struct trace_array *tr = filp->private_data; ssize_t written = -ENODEV; unsigned long ip; size_t size; char *buf; if (tracing_disabled) return -EINVAL; if (!(tr->trace_flags & TRACE_ITER_MARKERS)) return -EINVAL; if ((ssize_t)cnt < 0) return -EINVAL; if (cnt > TRACE_MARKER_MAX_SIZE) cnt = TRACE_MARKER_MAX_SIZE; /* Must have preemption disabled while having access to the buffer */ guard(preempt_notrace)(); buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size); if (!buf) return -EFAULT; if (cnt > size) cnt = size; /* The selftests expect this function to be the IP address */ ip = _THIS_IP_; /* The global trace_marker can go to multiple instances */ if (tr == &global_trace) { guard(rcu)(); list_for_each_entry_rcu(tr, &marker_copies, marker_list) { written = write_marker_to_buffer(tr, buf, cnt, ip); if (written < 0) break; } } else { written = write_marker_to_buffer(tr, buf, cnt, ip); } return written; } static ssize_t write_raw_marker_to_buffer(struct trace_array *tr, const char *buf, size_t cnt) { struct ring_buffer_event *event; struct trace_buffer *buffer; struct raw_data_entry *entry; ssize_t written; size_t size; /* cnt includes both the entry->id and the data behind it. 
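 * A sketch of what user space is expected to write (field names taken
 * from struct raw_data_entry as used below):
 *
 *   first sizeof(entry->id) bytes             entry->id, tag chosen by the writer
 *   remaining cnt - sizeof(entry->id) bytes   entry->buf, opaque payload
 *
 * which is why the reservation below uses
 * struct_size(entry, buf, cnt - sizeof(entry->id)).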
*/ size = struct_size(entry, buf, cnt - sizeof(entry->id)); buffer = tr->array_buffer.buffer; if (size > ring_buffer_max_event_size(buffer)) return -EINVAL; event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, tracing_gen_ctx()); if (!event) /* Ring buffer disabled, return as if not open for write */ return -EBADF; entry = ring_buffer_event_data(event); unsafe_memcpy(&entry->id, buf, cnt, "id and content already reserved on ring buffer" "'buf' includes the 'id' and the data." "'entry' was allocated with cnt from 'id'."); written = cnt; __buffer_unlock_commit(buffer, event); return written; } static ssize_t tracing_mark_raw_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { struct trace_array *tr = filp->private_data; ssize_t written = -ENODEV; size_t size; char *buf; if (tracing_disabled) return -EINVAL; if (!(tr->trace_flags & TRACE_ITER_MARKERS)) return -EINVAL; /* The marker must at least have a tag id */ if (cnt < sizeof(unsigned int)) return -EINVAL; /* Must have preemption disabled while having access to the buffer */ guard(preempt_notrace)(); buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size); if (!buf) return -EFAULT; /* raw write is all or nothing */ if (cnt > size) return -EINVAL; /* The global trace_marker_raw can go to multiple instances */ if (tr == &global_trace) { guard(rcu)(); list_for_each_entry_rcu(tr, &marker_copies, marker_list) { written = write_raw_marker_to_buffer(tr, buf, cnt); if (written < 0) break; } } else { written = write_raw_marker_to_buffer(tr, buf, cnt); } return written; } static int tracing_mark_open(struct inode *inode, struct file *filp) { int ret; ret = trace_user_fault_buffer_enable(); if (ret < 0) return ret; stream_open(inode, filp); ret = tracing_open_generic_tr(inode, filp); if (ret < 0) trace_user_fault_buffer_disable(); return ret; } static int tracing_mark_release(struct inode *inode, struct file *file) { trace_user_fault_buffer_disable(); return tracing_release_generic_tr(inode, file); } static int tracing_clock_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; int i; for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) seq_printf(m, "%s%s%s%s", i ? " " : "", i == tr->clock_id ? "[" : "", trace_clocks[i].name, i == tr->clock_id ? "]" : ""); seq_putc(m, '\n'); return 0; } int tracing_set_clock(struct trace_array *tr, const char *clockstr) { int i; for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { if (strcmp(trace_clocks[i].name, clockstr) == 0) break; } if (i == ARRAY_SIZE(trace_clocks)) return -EINVAL; guard(mutex)(&trace_types_lock); tr->clock_id = i; ring_buffer_set_clock(tr->array_buffer.buffer, trace_clocks[i].func); /* * New clock may not be consistent with the previous clock. * Reset the buffer so that it doesn't have incomparable timestamps. 
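 * For instance, switching an instance from the "local" clock to the
 * "global" one via the trace_clock file would otherwise leave old
 * per-CPU-ordered timestamps interleaved with new globally ordered
 * ones, so the existing events are simply discarded.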
*/ tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE if (tr->max_buffer.buffer) ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); tracing_reset_online_cpus(&tr->max_buffer); #endif if (tr->scratch && !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) { struct trace_scratch *tscratch = tr->scratch; tscratch->clock_id = i; } return 0; } static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; char buf[64]; const char *clockstr; int ret; if (cnt >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; clockstr = strstrip(buf); ret = tracing_set_clock(tr, clockstr); if (ret) return ret; *fpos += cnt; return cnt; } static int tracing_clock_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; ret = single_open(file, tracing_clock_show, inode->i_private); if (ret < 0) trace_array_put(tr); return ret; } static int tracing_time_stamp_mode_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; guard(mutex)(&trace_types_lock); if (ring_buffer_time_stamp_abs(tr->array_buffer.buffer)) seq_puts(m, "delta [absolute]\n"); else seq_puts(m, "[delta] absolute\n"); return 0; } static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private); if (ret < 0) trace_array_put(tr); return ret; } u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe) { if (rbe == this_cpu_read(trace_buffered_event)) return ring_buffer_time_stamp(buffer); return ring_buffer_event_time_stamp(buffer, rbe); } /* * Set or disable using the per CPU trace_buffer_event when possible. 
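 * Calls are reference counted: each tracing_set_filter_buffering(tr, true)
 * increments no_filter_buffering_ref and each (tr, false) call decrements
 * it, so independent users may request it without tracking each other and
 * the buffered event is only used again once the last request is dropped.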
*/ int tracing_set_filter_buffering(struct trace_array *tr, bool set) { guard(mutex)(&trace_types_lock); if (set && tr->no_filter_buffering_ref++) return 0; if (!set) { if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) return -EINVAL; --tr->no_filter_buffering_ref; } return 0; } struct ftrace_buffer_info { struct trace_iterator iter; void *spare; unsigned int spare_cpu; unsigned int spare_size; unsigned int read; }; #ifdef CONFIG_TRACER_SNAPSHOT static int tracing_snapshot_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; struct seq_file *m; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; if (file->f_mode & FMODE_READ) { iter = __tracing_open(inode, file, true); if (IS_ERR(iter)) ret = PTR_ERR(iter); } else { /* Writes still need the seq_file to hold the private data */ ret = -ENOMEM; m = kzalloc(sizeof(*m), GFP_KERNEL); if (!m) goto out; iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { kfree(m); goto out; } ret = 0; iter->tr = tr; iter->array_buffer = &tr->max_buffer; iter->cpu_file = tracing_get_cpu(inode); m->private = iter; file->private_data = m; } out: if (ret < 0) trace_array_put(tr); return ret; } static void tracing_swap_cpu_buffer(void *tr) { update_max_tr_single((struct trace_array *)tr, current, smp_processor_id()); } static ssize_t tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct seq_file *m = filp->private_data; struct trace_iterator *iter = m->private; struct trace_array *tr = iter->tr; unsigned long val; int ret; ret = tracing_update_buffers(tr); if (ret < 0) return ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; guard(mutex)(&trace_types_lock); if (tr->current_trace->use_max_tr) return -EBUSY; local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) ret = -EBUSY; arch_spin_unlock(&tr->max_lock); local_irq_enable(); if (ret) return ret; switch (val) { case 0: if (iter->cpu_file != RING_BUFFER_ALL_CPUS) return -EINVAL; if (tr->allocated_snapshot) free_snapshot(tr); break; case 1: /* Only allow per-cpu swap if the ring buffer supports it */ #ifndef CONFIG_RING_BUFFER_ALLOW_SWAP if (iter->cpu_file != RING_BUFFER_ALL_CPUS) return -EINVAL; #endif if (tr->allocated_snapshot) ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->array_buffer, iter->cpu_file); ret = tracing_arm_snapshot_locked(tr); if (ret) return ret; /* Now, we're going to swap */ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { local_irq_disable(); update_max_tr(tr, current, smp_processor_id(), NULL); local_irq_enable(); } else { smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer, (void *)tr, 1); } tracing_disarm_snapshot(tr); break; default: if (tr->allocated_snapshot) { if (iter->cpu_file == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(&tr->max_buffer); else tracing_reset_cpu(&tr->max_buffer, iter->cpu_file); } break; } if (ret >= 0) { *ppos += cnt; ret = cnt; } return ret; } static int tracing_snapshot_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; int ret; ret = tracing_release(inode, file); if (file->f_mode & FMODE_READ) return ret; /* If write only, the seq_file is just a stub */ if (m) kfree(m->private); kfree(m); return 0; } static int tracing_buffers_open(struct inode *inode, struct file *filp); static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos); static int tracing_buffers_release(struct inode *inode, struct 
file *file); static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); static int snapshot_raw_open(struct inode *inode, struct file *filp) { struct ftrace_buffer_info *info; int ret; /* The following checks for tracefs lockdown */ ret = tracing_buffers_open(inode, filp); if (ret < 0) return ret; info = filp->private_data; if (info->iter.trace->use_max_tr) { tracing_buffers_release(inode, filp); return -EBUSY; } info->iter.snapshot = true; info->iter.array_buffer = &info->iter.tr->max_buffer; return ret; } #endif /* CONFIG_TRACER_SNAPSHOT */ static const struct file_operations tracing_thresh_fops = { .open = tracing_open_generic, .read = tracing_thresh_read, .write = tracing_thresh_write, .llseek = generic_file_llseek, }; #ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic_tr, .read = tracing_max_lat_read, .write = tracing_max_lat_write, .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; #endif static const struct file_operations set_tracer_fops = { .open = tracing_open_generic_tr, .read = tracing_set_trace_read, .write = tracing_set_trace_write, .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; static const struct file_operations tracing_pipe_fops = { .open = tracing_open_pipe, .poll = tracing_poll_pipe, .read = tracing_read_pipe, .splice_read = tracing_splice_read_pipe, .release = tracing_release_pipe, }; static const struct file_operations tracing_entries_fops = { .open = tracing_open_generic_tr, .read = tracing_entries_read, .write = tracing_entries_write, .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; static const struct file_operations tracing_buffer_meta_fops = { .open = tracing_buffer_meta_open, .read = seq_read, .llseek = seq_lseek, .release = tracing_seq_release, }; static const struct file_operations tracing_total_entries_fops = { .open = tracing_open_generic_tr, .read = tracing_total_entries_read, .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; static const struct file_operations tracing_free_buffer_fops = { .open = tracing_open_generic_tr, .write = tracing_free_buffer_write, .release = tracing_free_buffer_release, }; static const struct file_operations tracing_mark_fops = { .open = tracing_mark_open, .write = tracing_mark_write, .release = tracing_mark_release, }; static const struct file_operations tracing_mark_raw_fops = { .open = tracing_mark_open, .write = tracing_mark_raw_write, .release = tracing_mark_release, }; static const struct file_operations trace_clock_fops = { .open = tracing_clock_open, .read = seq_read, .llseek = seq_lseek, .release = tracing_single_release_tr, .write = tracing_clock_write, }; static const struct file_operations trace_time_stamp_mode_fops = { .open = tracing_time_stamp_mode_open, .read = seq_read, .llseek = seq_lseek, .release = tracing_single_release_tr, }; static const struct file_operations last_boot_fops = { .open = tracing_last_boot_open, .read = seq_read, .llseek = seq_lseek, .release = tracing_seq_release, }; #ifdef CONFIG_TRACER_SNAPSHOT static const struct file_operations snapshot_fops = { .open = tracing_snapshot_open, .read = seq_read, .write = tracing_snapshot_write, .llseek = tracing_lseek, .release = tracing_snapshot_release, }; static const struct file_operations snapshot_raw_fops = { .open = snapshot_raw_open, .read = tracing_buffers_read, .release = tracing_buffers_release, 
.splice_read = tracing_buffers_splice_read, }; #endif /* CONFIG_TRACER_SNAPSHOT */ /* * trace_min_max_write - Write a u64 value to a trace_min_max_param struct * @filp: The active open file structure * @ubuf: The userspace provided buffer to read value into * @cnt: The maximum number of bytes to read * @ppos: The current "file" position * * This function implements the write interface for a struct trace_min_max_param. * The filp->private_data must point to a trace_min_max_param structure that * defines where to write the value, the min and the max acceptable values, * and a lock to protect the write. */ static ssize_t trace_min_max_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_min_max_param *param = filp->private_data; u64 val; int err; if (!param) return -EFAULT; err = kstrtoull_from_user(ubuf, cnt, 10, &val); if (err) return err; if (param->lock) mutex_lock(param->lock); if (param->min && val < *param->min) err = -EINVAL; if (param->max && val > *param->max) err = -EINVAL; if (!err) *param->val = val; if (param->lock) mutex_unlock(param->lock); if (err) return err; return cnt; } /* * trace_min_max_read - Read a u64 value from a trace_min_max_param struct * @filp: The active open file structure * @ubuf: The userspace provided buffer to read value into * @cnt: The maximum number of bytes to read * @ppos: The current "file" position * * This function implements the read interface for a struct trace_min_max_param. * The filp->private_data must point to a trace_min_max_param struct with valid * data. */ static ssize_t trace_min_max_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_min_max_param *param = filp->private_data; char buf[U64_STR_SIZE]; int len; u64 val; if (!param) return -EFAULT; val = *param->val; if (cnt > sizeof(buf)) cnt = sizeof(buf); len = snprintf(buf, sizeof(buf), "%llu\n", val); return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); } const struct file_operations trace_min_max_fops = { .open = tracing_open_generic, .read = trace_min_max_read, .write = trace_min_max_write, }; #define TRACING_LOG_ERRS_MAX 8 #define TRACING_LOG_LOC_MAX 128 #define CMD_PREFIX " Command: " struct err_info { const char **errs; /* ptr to loc-specific array of err strings */ u8 type; /* index into errs -> specific err string */ u16 pos; /* caret position */ u64 ts; }; struct tracing_log_err { struct list_head list; struct err_info info; char loc[TRACING_LOG_LOC_MAX]; /* err location */ char *cmd; /* what caused err */ }; static DEFINE_MUTEX(tracing_err_log_lock); static struct tracing_log_err *alloc_tracing_log_err(int len) { struct tracing_log_err *err; err = kzalloc(sizeof(*err), GFP_KERNEL); if (!err) return ERR_PTR(-ENOMEM); err->cmd = kzalloc(len, GFP_KERNEL); if (!err->cmd) { kfree(err); return ERR_PTR(-ENOMEM); } return err; } static void free_tracing_log_err(struct tracing_log_err *err) { kfree(err->cmd); kfree(err); } static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr, int len) { struct tracing_log_err *err; char *cmd; if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) { err = alloc_tracing_log_err(len); if (PTR_ERR(err) != -ENOMEM) tr->n_err_log_entries++; return err; } cmd = kzalloc(len, GFP_KERNEL); if (!cmd) return ERR_PTR(-ENOMEM); err = list_first_entry(&tr->err_log, struct tracing_log_err, list); kfree(err->cmd); err->cmd = cmd; list_del(&err->list); return err; } /** * err_pos - find the position of a string within a command for error careting * @cmd: The tracing command that 
caused the error * @str: The string to position the caret at within @cmd * * Finds the position of the first occurrence of @str within @cmd. The * return value can be passed to tracing_log_err() for caret placement * within @cmd. * * Returns the index within @cmd of the first occurrence of @str or 0 * if @str was not found. */ unsigned int err_pos(char *cmd, const char *str) { char *found; if (WARN_ON(!strlen(cmd))) return 0; found = strstr(cmd, str); if (found) return found - cmd; return 0; } /** * tracing_log_err - write an error to the tracing error log * @tr: The associated trace array for the error (NULL for top level array) * @loc: A string describing where the error occurred * @cmd: The tracing command that caused the error * @errs: The array of loc-specific static error strings * @type: The index into errs[], which produces the specific static err string * @pos: The position the caret should be placed in the cmd * * Writes an error into tracing/error_log of the form: * * <loc>: error: <text> * Command: <cmd> * ^ * * tracing/error_log is a small log file containing the last * TRACING_LOG_ERRS_MAX errors (8). Memory for errors isn't allocated * unless there has been a tracing error, and the error log can be * cleared and have its memory freed by writing the empty string in * truncation mode to it i.e. echo > tracing/error_log. * * NOTE: the @errs array along with the @type param are used to * produce a static error string - this string is not copied and saved * when the error is logged - only a pointer to it is saved. See * existing callers for examples of how static strings are typically * defined for use with tracing_log_err(). */ void tracing_log_err(struct trace_array *tr, const char *loc, const char *cmd, const char **errs, u8 type, u16 pos) { struct tracing_log_err *err; int len = 0; if (!tr) tr = &global_trace; len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1; guard(mutex)(&tracing_err_log_lock); err = get_tracing_log_err(tr, len); if (PTR_ERR(err) == -ENOMEM) return; snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd); err->info.errs = errs; err->info.type = type; err->info.pos = pos; err->info.ts = local_clock(); list_add_tail(&err->list, &tr->err_log); } static void clear_tracing_err_log(struct trace_array *tr) { struct tracing_log_err *err, *next; guard(mutex)(&tracing_err_log_lock); list_for_each_entry_safe(err, next, &tr->err_log, list) { list_del(&err->list); free_tracing_log_err(err); } tr->n_err_log_entries = 0; } static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) { struct trace_array *tr = m->private; mutex_lock(&tracing_err_log_lock); return seq_list_start(&tr->err_log, *pos); } static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_array *tr = m->private; return seq_list_next(v, &tr->err_log, pos); } static void tracing_err_log_seq_stop(struct seq_file *m, void *v) { mutex_unlock(&tracing_err_log_lock); } static void tracing_err_log_show_pos(struct seq_file *m, u16 pos) { u16 i; for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++) seq_putc(m, ' '); for (i = 0; i < pos; i++) seq_putc(m, ' '); seq_puts(m, "^\n"); } static int tracing_err_log_seq_show(struct seq_file *m, void *v) { struct tracing_log_err *err = v; if (err) { const char *err_text = err->info.errs[err->info.type]; u64 sec = err->info.ts; u32 nsec; nsec = do_div(sec, NSEC_PER_SEC); seq_printf(m, "[%5llu.%06u] %s%s", sec, nsec / 1000, err->loc, err_text); seq_printf(m, "%s", 
err->cmd); tracing_err_log_show_pos(m, err->info.pos); } return 0; } static const struct seq_operations tracing_err_log_seq_ops = { .start = tracing_err_log_seq_start, .next = tracing_err_log_seq_next, .stop = tracing_err_log_seq_stop, .show = tracing_err_log_seq_show }; static int tracing_err_log_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; int ret = 0; ret = tracing_check_open_get_tr(tr); if (ret) return ret; /* If this file was opened for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) clear_tracing_err_log(tr); if (file->f_mode & FMODE_READ) { ret = seq_open(file, &tracing_err_log_seq_ops); if (!ret) { struct seq_file *m = file->private_data; m->private = tr; } else { trace_array_put(tr); } } return ret; } static ssize_t tracing_err_log_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { return count; } static int tracing_err_log_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; trace_array_put(tr); if (file->f_mode & FMODE_READ) seq_release(inode, file); return 0; } static const struct file_operations tracing_err_log_fops = { .open = tracing_err_log_open, .write = tracing_err_log_write, .read = seq_read, .llseek = tracing_lseek, .release = tracing_err_log_release, }; static int tracing_buffers_open(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; struct ftrace_buffer_info *info; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; info = kvzalloc(sizeof(*info), GFP_KERNEL); if (!info) { trace_array_put(tr); return -ENOMEM; } mutex_lock(&trace_types_lock); info->iter.tr = tr; info->iter.cpu_file = tracing_get_cpu(inode); info->iter.trace = tr->current_trace; info->iter.array_buffer = &tr->array_buffer; info->spare = NULL; /* Force reading ring buffer for first read */ info->read = (unsigned int)-1; filp->private_data = info; tr->trace_ref++; mutex_unlock(&trace_types_lock); ret = nonseekable_open(inode, filp); if (ret < 0) trace_array_put(tr); return ret; } static __poll_t tracing_buffers_poll(struct file *filp, poll_table *poll_table) { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; return trace_poll(iter, filp, poll_table); } static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; void *trace_data; int page_size; ssize_t ret = 0; ssize_t size; if (!count) return 0; #ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->tr->current_trace->use_max_tr) return -EBUSY; #endif page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); /* Make sure the spare matches the current sub buffer size */ if (info->spare) { if (page_size != info->spare_size) { ring_buffer_free_read_page(iter->array_buffer->buffer, info->spare_cpu, info->spare); info->spare = NULL; } } if (!info->spare) { info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer, iter->cpu_file); if (IS_ERR(info->spare)) { ret = PTR_ERR(info->spare); info->spare = NULL; } else { info->spare_cpu = iter->cpu_file; info->spare_size = page_size; } } if (!info->spare) return ret; /* Do we have previous read data to read? 
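	 *
	 * info->read is the offset already copied out of the spare page;
	 * anything between it and the sub-buffer size is data from the
	 * previous ring_buffer_read_page() that user space has not
	 * consumed yet.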
*/ if (info->read < page_size) goto read; again: trace_access_lock(iter->cpu_file); ret = ring_buffer_read_page(iter->array_buffer->buffer, info->spare, count, iter->cpu_file, 0); trace_access_unlock(iter->cpu_file); if (ret < 0) { if (trace_empty(iter) && !iter->closed) { if (update_last_data_if_empty(iter->tr)) return 0; if ((filp->f_flags & O_NONBLOCK)) return -EAGAIN; ret = wait_on_pipe(iter, 0); if (ret) return ret; goto again; } return 0; } info->read = 0; read: size = page_size - info->read; if (size > count) size = count; trace_data = ring_buffer_read_page_data(info->spare); ret = copy_to_user(ubuf, trace_data + info->read, size); if (ret == size) return -EFAULT; size -= ret; *ppos += size; info->read += size; return size; } static int tracing_buffers_flush(struct file *file, fl_owner_t id) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; iter->closed = true; /* Make sure the waiters see the new wait_index */ (void)atomic_fetch_inc_release(&iter->wait_index); ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); return 0; } static int tracing_buffers_release(struct inode *inode, struct file *file) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; guard(mutex)(&trace_types_lock); iter->tr->trace_ref--; __trace_array_put(iter->tr); if (info->spare) ring_buffer_free_read_page(iter->array_buffer->buffer, info->spare_cpu, info->spare); kvfree(info); return 0; } struct buffer_ref { struct trace_buffer *buffer; void *page; int cpu; refcount_t refcount; }; static void buffer_ref_release(struct buffer_ref *ref) { if (!refcount_dec_and_test(&ref->refcount)) return; ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); kfree(ref); } static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; buffer_ref_release(ref); buf->private = 0; } static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; if (refcount_read(&ref->refcount) > INT_MAX/2) return false; refcount_inc(&ref->refcount); return true; } /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { .release = buffer_pipe_buf_release, .get = buffer_pipe_buf_get, }; /* * Callback from splice_to_pipe(), if we need to release some pages * at the end of the spd in case we error'ed out in filling the pipe. 
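 *
 * The pages are wrapped in refcounted buffer_refs, so buffer_ref_release()
 * only returns a page to the ring buffer once its last user (this error
 * path or the pipe via buffer_pipe_buf_release()) lets go of it.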
*/ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) { struct buffer_ref *ref = (struct buffer_ref *)spd->partial[i].private; buffer_ref_release(ref); spd->partial[i].private = 0; } static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct page *pages_def[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { .pages = pages_def, .partial = partial_def, .nr_pages_max = PIPE_DEF_BUFFERS, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; bool woken = false; int page_size; int entries, i; ssize_t ret = 0; #ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->tr->current_trace->use_max_tr) return -EBUSY; #endif page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); if (*ppos & (page_size - 1)) return -EINVAL; if (len & (page_size - 1)) { if (len < page_size) return -EINVAL; len &= (~(page_size - 1)); } if (splice_grow_spd(pipe, &spd)) return -ENOMEM; again: trace_access_lock(iter->cpu_file); entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= page_size) { struct page *page; int r; ref = kzalloc(sizeof(*ref), GFP_KERNEL); if (!ref) { ret = -ENOMEM; break; } refcount_set(&ref->refcount, 1); ref->buffer = iter->array_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (IS_ERR(ref->page)) { ret = PTR_ERR(ref->page); ref->page = NULL; kfree(ref); break; } ref->cpu = iter->cpu_file; r = ring_buffer_read_page(ref->buffer, ref->page, len, iter->cpu_file, 1); if (r < 0) { ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); kfree(ref); break; } page = virt_to_page(ring_buffer_read_page_data(ref->page)); spd.pages[i] = page; spd.partial[i].len = page_size; spd.partial[i].offset = 0; spd.partial[i].private = (unsigned long)ref; spd.nr_pages++; *ppos += page_size; entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); } trace_access_unlock(iter->cpu_file); spd.nr_pages = i; /* did we read anything? */ if (!spd.nr_pages) { if (ret) goto out; if (woken) goto out; ret = -EAGAIN; if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) goto out; ret = wait_on_pipe(iter, iter->snapshot ? 
0 : iter->tr->buffer_percent); if (ret) goto out; /* No need to wait after waking up when tracing is off */ if (!tracer_tracing_is_on(iter->tr)) goto out; /* Iterate one more time to collect any new data then exit */ woken = true; goto again; } ret = splice_to_pipe(pipe, &spd); out: splice_shrink_spd(&spd); return ret; } static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; int err; if (cmd == TRACE_MMAP_IOCTL_GET_READER) { if (!(file->f_flags & O_NONBLOCK)) { err = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, iter->tr->buffer_percent, NULL, NULL); if (err) return err; } return ring_buffer_map_get_reader(iter->array_buffer->buffer, iter->cpu_file); } else if (cmd) { return -ENOTTY; } /* * An ioctl call with cmd 0 to the ring buffer file will wake up all * waiters */ guard(mutex)(&trace_types_lock); /* Make sure the waiters see the new wait_index */ (void)atomic_fetch_inc_release(&iter->wait_index); ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); return 0; } #ifdef CONFIG_TRACER_MAX_TRACE static int get_snapshot_map(struct trace_array *tr) { int err = 0; /* * Called with mmap_lock held. lockdep would be unhappy if we would now * take trace_types_lock. Instead use the specific * snapshot_trigger_lock. */ spin_lock(&tr->snapshot_trigger_lock); if (tr->snapshot || tr->mapped == UINT_MAX) err = -EBUSY; else tr->mapped++; spin_unlock(&tr->snapshot_trigger_lock); /* Wait for update_max_tr() to observe iter->tr->mapped */ if (tr->mapped == 1) synchronize_rcu(); return err; } static void put_snapshot_map(struct trace_array *tr) { spin_lock(&tr->snapshot_trigger_lock); if (!WARN_ON(!tr->mapped)) tr->mapped--; spin_unlock(&tr->snapshot_trigger_lock); } #else static inline int get_snapshot_map(struct trace_array *tr) { return 0; } static inline void put_snapshot_map(struct trace_array *tr) { } #endif static void tracing_buffers_mmap_close(struct vm_area_struct *vma) { struct ftrace_buffer_info *info = vma->vm_file->private_data; struct trace_iterator *iter = &info->iter; WARN_ON(ring_buffer_unmap(iter->array_buffer->buffer, iter->cpu_file)); put_snapshot_map(iter->tr); } static int tracing_buffers_may_split(struct vm_area_struct *vma, unsigned long addr) { /* * Trace buffer mappings require the complete buffer including * the meta page. Partial mappings are not supported. 
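	 *
	 * Returning -EINVAL from ->may_split() keeps the VMA in one piece:
	 * a munmap() or mprotect() of only part of the mapping would have
	 * to split it first, so it fails instead.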
*/ return -EINVAL; } static const struct vm_operations_struct tracing_buffers_vmops = { .close = tracing_buffers_mmap_close, .may_split = tracing_buffers_may_split, }; static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; int ret = 0; /* A memmap'ed buffer is not supported for user space mmap */ if (iter->tr->flags & TRACE_ARRAY_FL_MEMMAP) return -ENODEV; ret = get_snapshot_map(iter->tr); if (ret) return ret; ret = ring_buffer_map(iter->array_buffer->buffer, iter->cpu_file, vma); if (ret) put_snapshot_map(iter->tr); vma->vm_ops = &tracing_buffers_vmops; return ret; } static const struct file_operations tracing_buffers_fops = { .open = tracing_buffers_open, .read = tracing_buffers_read, .poll = tracing_buffers_poll, .release = tracing_buffers_release, .flush = tracing_buffers_flush, .splice_read = tracing_buffers_splice_read, .unlocked_ioctl = tracing_buffers_ioctl, .mmap = tracing_buffers_mmap, }; static ssize_t tracing_stats_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct inode *inode = file_inode(filp); struct trace_array *tr = inode->i_private; struct array_buffer *trace_buf = &tr->array_buffer; int cpu = tracing_get_cpu(inode); struct trace_seq *s; unsigned long cnt; unsigned long long t; unsigned long usec_rem; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "entries: %ld\n", cnt); cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "overrun: %ld\n", cnt); cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "commit overrun: %ld\n", cnt); cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "bytes: %ld\n", cnt); if (trace_clocks[tr->clock_id].in_ns) { /* local or global for trace_clock */ t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); } else { /* counter or tsc mode for trace_clock */ trace_seq_printf(s, "oldest event ts: %llu\n", ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); trace_seq_printf(s, "now ts: %llu\n", ring_buffer_time_stamp(trace_buf->buffer)); } cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "dropped events: %ld\n", cnt); cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "read events: %ld\n", cnt); count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, trace_seq_used(s)); kfree(s); return count; } static const struct file_operations tracing_stats_fops = { .open = tracing_open_generic_tr, .read = tracing_stats_read, .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; #ifdef CONFIG_DYNAMIC_FTRACE static ssize_t tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { ssize_t ret; char *buf; int r; /* 512 should be plenty to hold the amount needed */ #define DYN_INFO_BUF_SIZE 512 buf = kmalloc(DYN_INFO_BUF_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; r = scnprintf(buf, DYN_INFO_BUF_SIZE, "%ld pages:%ld groups: %ld\n" "ftrace boot update time = %llu (ns)\n" "ftrace module total update time = %llu (ns)\n", ftrace_update_tot_cnt, 
ftrace_number_of_pages, ftrace_number_of_groups, ftrace_update_time, ftrace_total_mod_time); ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); kfree(buf); return ret; } static const struct file_operations tracing_dyn_info_fops = { .open = tracing_open_generic, .read = tracing_read_dyn_info, .llseek = generic_file_llseek, }; #endif /* CONFIG_DYNAMIC_FTRACE */ #if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) static void ftrace_snapshot(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { tracing_snapshot_instance(tr); } static void ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_mapper *mapper = data; long *count = NULL; if (mapper) count = (long *)ftrace_func_mapper_find_ip(mapper, ip); if (count) { if (*count <= 0) return; (*count)--; } tracing_snapshot_instance(tr); } static int ftrace_snapshot_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_mapper *mapper = data; long *count = NULL; seq_printf(m, "%ps:", (void *)ip); seq_puts(m, "snapshot"); if (mapper) count = (long *)ftrace_func_mapper_find_ip(mapper, ip); if (count) seq_printf(m, ":count=%ld\n", *count); else seq_puts(m, ":unlimited\n"); return 0; } static int ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr, unsigned long ip, void *init_data, void **data) { struct ftrace_func_mapper *mapper = *data; if (!mapper) { mapper = allocate_ftrace_func_mapper(); if (!mapper) return -ENOMEM; *data = mapper; } return ftrace_func_mapper_add_ip(mapper, ip, init_data); } static void ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr, unsigned long ip, void *data) { struct ftrace_func_mapper *mapper = data; if (!ip) { if (!mapper) return; free_ftrace_func_mapper(mapper, NULL); return; } ftrace_func_mapper_remove_ip(mapper, ip); } static struct ftrace_probe_ops snapshot_probe_ops = { .func = ftrace_snapshot, .print = ftrace_snapshot_print, }; static struct ftrace_probe_ops snapshot_count_probe_ops = { .func = ftrace_count_snapshot, .print = ftrace_snapshot_print, .init = ftrace_snapshot_init, .free = ftrace_snapshot_free, }; static int ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; void *count = (void *)-1; char *number; int ret; if (!tr) return -ENODEV; /* hash funcs only work with set_ftrace_filter */ if (!enable) return -EINVAL; ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; if (glob[0] == '!') { ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); if (!ret) tracing_disarm_snapshot(tr); return ret; } if (!param) goto out_reg; number = strsep(&param, ":"); if (!strlen(number)) goto out_reg; /* * We use the callback data field (which is a pointer) * as our counter. */ ret = kstrtoul(number, 0, (unsigned long *)&count); if (ret) return ret; out_reg: ret = tracing_arm_snapshot(tr); if (ret < 0) return ret; ret = register_ftrace_function_probe(glob, tr, ops, count); if (ret < 0) tracing_disarm_snapshot(tr); return ret < 0 ? 
ret : 0; } static struct ftrace_func_command ftrace_snapshot_cmd = { .name = "snapshot", .func = ftrace_trace_snapshot_callback, }; static __init int register_snapshot_cmd(void) { return register_ftrace_command(&ftrace_snapshot_cmd); } #else static inline __init int register_snapshot_cmd(void) { return 0; } #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ static struct dentry *tracing_get_dentry(struct trace_array *tr) { /* Top directory uses NULL as the parent */ if (tr->flags & TRACE_ARRAY_FL_GLOBAL) return NULL; if (WARN_ON(!tr->dir)) return ERR_PTR(-ENODEV); /* All sub buffers have a descriptor */ return tr->dir; } static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) { struct dentry *d_tracer; if (tr->percpu_dir) return tr->percpu_dir; d_tracer = tracing_get_dentry(tr); if (IS_ERR(d_tracer)) return NULL; tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer); MEM_FAIL(!tr->percpu_dir, "Could not create tracefs directory 'per_cpu/%d'\n", cpu); return tr->percpu_dir; } static struct dentry * trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, void *data, long cpu, const struct file_operations *fops) { struct dentry *ret = trace_create_file(name, mode, parent, data, fops); if (ret) /* See tracing_get_cpu() */ d_inode(ret)->i_cdev = (void *)(cpu + 1); return ret; } static void tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); struct dentry *d_cpu; char cpu_dir[30]; /* 30 characters should be more than enough */ if (!d_percpu) return; snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = tracefs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { pr_warn("Could not create tracefs '%s' entry\n", cpu_dir); return; } /* per cpu trace_pipe */ trace_create_cpu_file("trace_pipe", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_pipe_fops); /* per cpu trace */ trace_create_cpu_file("trace", TRACE_MODE_WRITE, d_cpu, tr, cpu, &tracing_fops); trace_create_cpu_file("trace_pipe_raw", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_buffers_fops); trace_create_cpu_file("stats", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_stats_fops); trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_entries_fops); if (tr->range_addr_start) trace_create_cpu_file("buffer_meta", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_buffer_meta_fops); #ifdef CONFIG_TRACER_SNAPSHOT if (!tr->range_addr_start) { trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu, tr, cpu, &snapshot_fops); trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu, tr, cpu, &snapshot_raw_fops); } #endif } #ifdef CONFIG_FTRACE_SELFTEST /* Let selftest have access to static functions in this file */ #include "trace_selftest.c" #endif static ssize_t trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_option_dentry *topt = filp->private_data; char *buf; if (topt->flags->val & topt->opt->bit) buf = "1\n"; else buf = "0\n"; return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); } static ssize_t trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_option_dentry *topt = filp->private_data; unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; if (val != 0 && val != 1) return -EINVAL; if (!!(topt->flags->val & topt->opt->bit) != val) { guard(mutex)(&trace_types_lock); ret = __set_tracer_option(topt->tr, topt->flags, topt->opt, !val); if (ret) return ret; } *ppos += 
cnt; return cnt; } static int tracing_open_options(struct inode *inode, struct file *filp) { struct trace_option_dentry *topt = inode->i_private; int ret; ret = tracing_check_open_get_tr(topt->tr); if (ret) return ret; filp->private_data = inode->i_private; return 0; } static int tracing_release_options(struct inode *inode, struct file *file) { struct trace_option_dentry *topt = file->private_data; trace_array_put(topt->tr); return 0; } static const struct file_operations trace_options_fops = { .open = tracing_open_options, .read = trace_options_read, .write = trace_options_write, .llseek = generic_file_llseek, .release = tracing_release_options, }; /* * In order to pass in both the trace_array descriptor as well as the index * to the flag that the trace option file represents, the trace_array * has a character array of trace_flags_index[], which holds the index * of the bit for the flag it represents. index[0] == 0, index[1] == 1, etc. * The address of this character array is passed to the flag option file * read/write callbacks. * * In order to extract both the index and the trace_array descriptor, * get_tr_index() uses the following algorithm. * * idx = *ptr; * * As the pointer itself contains the address of the index (remember * index[1] == 1). * * Then to get the trace_array descriptor, by subtracting that index * from the ptr, we get to the start of the index itself. * * ptr - idx == &index[0] * * Then a simple container_of() from that pointer gets us to the * trace_array descriptor. */ static void get_tr_index(void *data, struct trace_array **ptr, unsigned int *pindex) { *pindex = *(unsigned char *)data; *ptr = container_of(data - *pindex, struct trace_array, trace_flags_index); } static ssize_t trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { void *tr_index = filp->private_data; struct trace_array *tr; unsigned int index; char *buf; get_tr_index(tr_index, &tr, &index); if (tr->trace_flags & (1 << index)) buf = "1\n"; else buf = "0\n"; return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); } static ssize_t trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { void *tr_index = filp->private_data; struct trace_array *tr; unsigned int index; unsigned long val; int ret; get_tr_index(tr_index, &tr, &index); ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; if (val != 0 && val != 1) return -EINVAL; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); ret = set_tracer_flag(tr, 1 << index, val); mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); if (ret < 0) return ret; *ppos += cnt; return cnt; } static const struct file_operations trace_options_core_fops = { .open = tracing_open_generic, .read = trace_options_core_read, .write = trace_options_core_write, .llseek = generic_file_llseek, }; struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { struct dentry *ret; ret = tracefs_create_file(name, mode, parent, data, fops); if (!ret) pr_warn("Could not create tracefs '%s' entry\n", name); return ret; } static struct dentry *trace_options_init_dentry(struct trace_array *tr) { struct dentry *d_tracer; if (tr->options) return tr->options; d_tracer = tracing_get_dentry(tr); if (IS_ERR(d_tracer)) return NULL; tr->options = tracefs_create_dir("options", d_tracer); if (!tr->options) { pr_warn("Could not create tracefs directory 'options'\n"); return NULL; } return tr->options; } static void 
create_trace_option_file(struct trace_array *tr, struct trace_option_dentry *topt, struct tracer_flags *flags, struct tracer_opt *opt) { struct dentry *t_options; t_options = trace_options_init_dentry(tr); if (!t_options) return; topt->flags = flags; topt->opt = opt; topt->tr = tr; topt->entry = trace_create_file(opt->name, TRACE_MODE_WRITE, t_options, topt, &trace_options_fops); } static void create_trace_option_files(struct trace_array *tr, struct tracer *tracer) { struct trace_option_dentry *topts; struct trace_options *tr_topts; struct tracer_flags *flags; struct tracer_opt *opts; int cnt; int i; if (!tracer) return; flags = tracer->flags; if (!flags || !flags->opts) return; /* * If this is an instance, only create flags for tracers * the instance may have. */ if (!trace_ok_for_array(tracer, tr)) return; for (i = 0; i < tr->nr_topts; i++) { /* Make sure there's no duplicate flags. */ if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags)) return; } opts = flags->opts; for (cnt = 0; opts[cnt].name; cnt++) ; topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL); if (!topts) return; tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1), GFP_KERNEL); if (!tr_topts) { kfree(topts); return; } tr->topts = tr_topts; tr->topts[tr->nr_topts].tracer = tracer; tr->topts[tr->nr_topts].topts = topts; tr->nr_topts++; for (cnt = 0; opts[cnt].name; cnt++) { create_trace_option_file(tr, &topts[cnt], flags, &opts[cnt]); MEM_FAIL(topts[cnt].entry == NULL, "Failed to create trace option: %s", opts[cnt].name); } } static struct dentry * create_trace_option_core_file(struct trace_array *tr, const char *option, long index) { struct dentry *t_options; t_options = trace_options_init_dentry(tr); if (!t_options) return NULL; return trace_create_file(option, TRACE_MODE_WRITE, t_options, (void *)&tr->trace_flags_index[index], &trace_options_core_fops); } static void create_trace_options_dir(struct trace_array *tr) { struct dentry *t_options; bool top_level = tr == &global_trace; int i; t_options = trace_options_init_dentry(tr); if (!t_options) return; for (i = 0; trace_options[i]; i++) { if (top_level || !((1 << i) & TOP_LEVEL_TRACE_FLAGS)) create_trace_option_core_file(tr, trace_options[i], i); } } static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; char buf[64]; int r; r = tracer_tracing_is_on(tr); r = sprintf(buf, "%d\n", r); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; struct trace_buffer *buffer = tr->array_buffer.buffer; unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; if (buffer) { guard(mutex)(&trace_types_lock); if (!!val == tracer_tracing_is_on(tr)) { val = 0; /* do nothing */ } else if (val) { tracer_tracing_on(tr); if (tr->current_trace->start) tr->current_trace->start(tr); } else { tracer_tracing_off(tr); if (tr->current_trace->stop) tr->current_trace->stop(tr); /* Wake up any waiters */ ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS); } } (*ppos)++; return cnt; } static const struct file_operations rb_simple_fops = { .open = tracing_open_generic_tr, .read = rb_simple_read, .write = rb_simple_write, .release = tracing_release_generic_tr, .llseek = default_llseek, }; static ssize_t buffer_percent_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct 
trace_array *tr = filp->private_data; char buf[64]; int r; r = tr->buffer_percent; r = sprintf(buf, "%d\n", r); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t buffer_percent_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; if (val > 100) return -EINVAL; tr->buffer_percent = val; (*ppos)++; return cnt; } static const struct file_operations buffer_percent_fops = { .open = tracing_open_generic_tr, .read = buffer_percent_read, .write = buffer_percent_write, .release = tracing_release_generic_tr, .llseek = default_llseek, }; static ssize_t buffer_subbuf_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; size_t size; char buf[64]; int order; int r; order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); size = (PAGE_SIZE << order) / 1024; r = sprintf(buf, "%zd\n", size); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t buffer_subbuf_size_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; unsigned long val; int old_order; int order; int pages; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; val *= 1024; /* value passed in is in KB */ pages = DIV_ROUND_UP(val, PAGE_SIZE); order = fls(pages - 1); /* limit between 1 and 128 system pages */ if (order < 0 || order > 7) return -EINVAL; /* Do not allow tracing while changing the order of the ring buffer */ tracing_stop_tr(tr); old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); if (old_order == order) goto out; ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, order); if (ret) goto out; #ifdef CONFIG_TRACER_MAX_TRACE if (!tr->allocated_snapshot) goto out_max; ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order); if (ret) { /* Put back the old order */ cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order); if (WARN_ON_ONCE(cnt)) { /* * AARGH! We are left with different orders! * The max buffer is our "snapshot" buffer. * When a tracer needs a snapshot (one of the * latency tracers), it swaps the max buffer * with the saved snap shot. We succeeded to * update the order of the main buffer, but failed to * update the order of the max buffer. But when we tried * to reset the main buffer to the original size, we * failed there too. This is very unlikely to * happen, but if it does, warn and kill all * tracing. 
*/ tracing_disabled = 1; } goto out; } out_max: #endif (*ppos)++; out: if (ret) cnt = ret; tracing_start_tr(tr); return cnt; } static const struct file_operations buffer_subbuf_size_fops = { .open = tracing_open_generic_tr, .read = buffer_subbuf_size_read, .write = buffer_subbuf_size_write, .release = tracing_release_generic_tr, .llseek = default_llseek, }; static struct dentry *trace_instance_dir; static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); #ifdef CONFIG_MODULES static int make_mod_delta(struct module *mod, void *data) { struct trace_module_delta *module_delta; struct trace_scratch *tscratch; struct trace_mod_entry *entry; struct trace_array *tr = data; int i; tscratch = tr->scratch; module_delta = READ_ONCE(tr->module_delta); for (i = 0; i < tscratch->nr_entries; i++) { entry = &tscratch->entries[i]; if (strcmp(mod->name, entry->mod_name)) continue; if (mod->state == MODULE_STATE_GOING) module_delta->delta[i] = 0; else module_delta->delta[i] = (unsigned long)mod->mem[MOD_TEXT].base - entry->mod_addr; break; } return 0; } #else static int make_mod_delta(struct module *mod, void *data) { return 0; } #endif static int mod_addr_comp(const void *a, const void *b, const void *data) { const struct trace_mod_entry *e1 = a; const struct trace_mod_entry *e2 = b; return e1->mod_addr > e2->mod_addr ? 1 : -1; } static void setup_trace_scratch(struct trace_array *tr, struct trace_scratch *tscratch, unsigned int size) { struct trace_module_delta *module_delta; struct trace_mod_entry *entry; int i, nr_entries; if (!tscratch) return; tr->scratch = tscratch; tr->scratch_size = size; if (tscratch->text_addr) tr->text_delta = (unsigned long)_text - tscratch->text_addr; if (struct_size(tscratch, entries, tscratch->nr_entries) > size) goto reset; /* Check if each module name is a valid string */ for (i = 0; i < tscratch->nr_entries; i++) { int n; entry = &tscratch->entries[i]; for (n = 0; n < MODULE_NAME_LEN; n++) { if (entry->mod_name[n] == '\0') break; if (!isprint(entry->mod_name[n])) goto reset; } if (n == MODULE_NAME_LEN) goto reset; } /* Sort the entries so that we can find appropriate module from address. */ nr_entries = tscratch->nr_entries; sort_r(tscratch->entries, nr_entries, sizeof(struct trace_mod_entry), mod_addr_comp, NULL, NULL); if (IS_ENABLED(CONFIG_MODULES)) { module_delta = kzalloc(struct_size(module_delta, delta, nr_entries), GFP_KERNEL); if (!module_delta) { pr_info("module_delta allocation failed. Not able to decode module address."); goto reset; } init_rcu_head(&module_delta->rcu); } else module_delta = NULL; WRITE_ONCE(tr->module_delta, module_delta); /* Scan modules to make text delta for modules. */ module_for_each_mod(make_mod_delta, tr); /* Set trace_clock as the same of the previous boot. */ if (tscratch->clock_id != tr->clock_id) { if (tscratch->clock_id >= ARRAY_SIZE(trace_clocks) || tracing_set_clock(tr, trace_clocks[tscratch->clock_id].name) < 0) { pr_info("the previous trace_clock info is not valid."); goto reset; } } return; reset: /* Invalid trace modules */ memset(tscratch, 0, size); } static int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size) { enum ring_buffer_flags rb_flags; struct trace_scratch *tscratch; unsigned int scratch_size = 0; rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? 
RB_FL_OVERWRITE : 0; buf->tr = tr; if (tr->range_addr_start && tr->range_addr_size) { /* Add scratch buffer to handle 128 modules */ buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0, tr->range_addr_start, tr->range_addr_size, struct_size(tscratch, entries, 128)); tscratch = ring_buffer_meta_scratch(buf->buffer, &scratch_size); setup_trace_scratch(tr, tscratch, scratch_size); /* * This is basically the same as a mapped buffer, * with the same restrictions. */ tr->mapped++; } else { buf->buffer = ring_buffer_alloc(size, rb_flags); } if (!buf->buffer) return -ENOMEM; buf->data = alloc_percpu(struct trace_array_cpu); if (!buf->data) { ring_buffer_free(buf->buffer); buf->buffer = NULL; return -ENOMEM; } /* Allocate the first page for all buffers */ set_buffer_entries(&tr->array_buffer, ring_buffer_size(tr->array_buffer.buffer, 0)); return 0; } static void free_trace_buffer(struct array_buffer *buf) { if (buf->buffer) { ring_buffer_free(buf->buffer); buf->buffer = NULL; free_percpu(buf->data); buf->data = NULL; } } static int allocate_trace_buffers(struct trace_array *tr, int size) { int ret; ret = allocate_trace_buffer(tr, &tr->array_buffer, size); if (ret) return ret; #ifdef CONFIG_TRACER_MAX_TRACE /* Fix mapped buffer trace arrays do not have snapshot buffers */ if (tr->range_addr_start) return 0; ret = allocate_trace_buffer(tr, &tr->max_buffer, allocate_snapshot ? size : 1); if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) { free_trace_buffer(&tr->array_buffer); return -ENOMEM; } tr->allocated_snapshot = allocate_snapshot; allocate_snapshot = false; #endif return 0; } static void free_trace_buffers(struct trace_array *tr) { if (!tr) return; free_trace_buffer(&tr->array_buffer); kfree(tr->module_delta); #ifdef CONFIG_TRACER_MAX_TRACE free_trace_buffer(&tr->max_buffer); #endif } static void init_trace_flags_index(struct trace_array *tr) { int i; /* Used by the trace options files */ for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) tr->trace_flags_index[i] = i; } static void __update_tracer_options(struct trace_array *tr) { struct tracer *t; for (t = trace_types; t; t = t->next) add_tracer_options(tr, t); } static void update_tracer_options(struct trace_array *tr) { guard(mutex)(&trace_types_lock); tracer_options_updated = true; __update_tracer_options(tr); } /* Must have trace_types_lock held */ struct trace_array *trace_array_find(const char *instance) { struct trace_array *tr, *found = NULL; list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr->name && strcmp(tr->name, instance) == 0) { found = tr; break; } } return found; } struct trace_array *trace_array_find_get(const char *instance) { struct trace_array *tr; guard(mutex)(&trace_types_lock); tr = trace_array_find(instance); if (tr) tr->ref++; return tr; } static int trace_array_create_dir(struct trace_array *tr) { int ret; tr->dir = tracefs_create_dir(tr->name, trace_instance_dir); if (!tr->dir) return -EINVAL; ret = event_trace_add_tracer(tr->dir, tr); if (ret) { tracefs_remove(tr->dir); return ret; } init_tracer_tracefs(tr, tr->dir); __update_tracer_options(tr); return ret; } static struct trace_array * trace_array_create_systems(const char *name, const char *systems, unsigned long range_addr_start, unsigned long range_addr_size) { struct trace_array *tr; int ret; ret = -ENOMEM; tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) return ERR_PTR(ret); tr->name = kstrdup(name, GFP_KERNEL); if (!tr->name) goto out_free_tr; if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) goto out_free_tr; if 
(!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) goto out_free_tr; if (systems) { tr->system_names = kstrdup_const(systems, GFP_KERNEL); if (!tr->system_names) goto out_free_tr; } /* Only for boot up memory mapped ring buffers */ tr->range_addr_start = range_addr_start; tr->range_addr_size = range_addr_size; tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; cpumask_copy(tr->tracing_cpumask, cpu_all_mask); raw_spin_lock_init(&tr->start_lock); tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; #ifdef CONFIG_TRACER_MAX_TRACE spin_lock_init(&tr->snapshot_trigger_lock); #endif tr->current_trace = &nop_trace; INIT_LIST_HEAD(&tr->systems); INIT_LIST_HEAD(&tr->events); INIT_LIST_HEAD(&tr->hist_vars); INIT_LIST_HEAD(&tr->err_log); INIT_LIST_HEAD(&tr->marker_list); #ifdef CONFIG_MODULES INIT_LIST_HEAD(&tr->mod_events); #endif if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; /* The ring buffer is defaultly expanded */ trace_set_ring_buffer_expanded(tr); if (ftrace_allocate_ftrace_ops(tr) < 0) goto out_free_tr; ftrace_init_trace_array(tr); init_trace_flags_index(tr); if (trace_instance_dir) { ret = trace_array_create_dir(tr); if (ret) goto out_free_tr; } else __trace_early_add_events(tr); list_add(&tr->list, &ftrace_trace_arrays); tr->ref++; return tr; out_free_tr: ftrace_free_ftrace_ops(tr); free_trace_buffers(tr); free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree_const(tr->system_names); kfree(tr->range_name); kfree(tr->name); kfree(tr); return ERR_PTR(ret); } static struct trace_array *trace_array_create(const char *name) { return trace_array_create_systems(name, NULL, 0, 0); } static int instance_mkdir(const char *name) { struct trace_array *tr; int ret; guard(mutex)(&event_mutex); guard(mutex)(&trace_types_lock); ret = -EEXIST; if (trace_array_find(name)) return -EEXIST; tr = trace_array_create(name); ret = PTR_ERR_OR_ZERO(tr); return ret; } #ifdef CONFIG_MMU static u64 map_pages(unsigned long start, unsigned long size) { unsigned long vmap_start, vmap_end; struct vm_struct *area; int ret; area = get_vm_area(size, VM_IOREMAP); if (!area) return 0; vmap_start = (unsigned long) area->addr; vmap_end = vmap_start + size; ret = vmap_page_range(vmap_start, vmap_end, start, pgprot_nx(PAGE_KERNEL)); if (ret < 0) { free_vm_area(area); return 0; } return (u64)vmap_start; } #else static inline u64 map_pages(unsigned long start, unsigned long size) { return 0; } #endif /** * trace_array_get_by_name - Create/Lookup a trace array, given its name. * @name: The name of the trace array to be looked up/created. * @systems: A list of systems to create event directories for (NULL for all) * * Returns pointer to trace array with given name. * NULL, if it cannot be created. * * NOTE: This function increments the reference counter associated with the * trace array returned. This makes sure it cannot be freed while in use. * Use trace_array_put() once the trace array is no longer needed. * If the trace_array is to be freed, trace_array_destroy() needs to * be called after the trace_array_put(), or simply let user space delete * it from the tracefs instances directory. But until the * trace_array_put() is called, user space can not delete it. 
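 *
 * A minimal in-kernel usage sketch (illustrative only; the instance name
 * is an example and error handling is reduced to the bare minimum):
 *
 *	struct trace_array *tr;
 *
 *	tr = trace_array_get_by_name("example_instance", NULL);
 *	if (!tr)
 *		return -ENOMEM;
 *
 *	(record into the instance, e.g. via trace_array_printk())
 *
 *	trace_array_put(tr);
 *	trace_array_destroy(tr);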
* */ struct trace_array *trace_array_get_by_name(const char *name, const char *systems) { struct trace_array *tr; guard(mutex)(&event_mutex); guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr->name && strcmp(tr->name, name) == 0) { tr->ref++; return tr; } } tr = trace_array_create_systems(name, systems, 0, 0); if (IS_ERR(tr)) tr = NULL; else tr->ref++; return tr; } EXPORT_SYMBOL_GPL(trace_array_get_by_name); static int __remove_instance(struct trace_array *tr) { int i; /* Reference counter for a newly created trace array = 1. */ if (tr->ref > 1 || (tr->current_trace && tr->trace_ref)) return -EBUSY; list_del(&tr->list); /* Disable all the flags that were enabled coming in */ for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) { if ((1 << i) & ZEROED_TRACE_FLAGS) set_tracer_flag(tr, 1 << i, 0); } if (printk_trace == tr) update_printk_trace(&global_trace); if (update_marker_trace(tr, 0)) synchronize_rcu(); tracing_set_nop(tr); clear_ftrace_function_probes(tr); event_trace_del_tracer(tr); ftrace_clear_pids(tr); ftrace_destroy_function_files(tr); tracefs_remove(tr->dir); free_percpu(tr->last_func_repeats); free_trace_buffers(tr); clear_tracing_err_log(tr); if (tr->range_name) { reserve_mem_release_by_name(tr->range_name); kfree(tr->range_name); } for (i = 0; i < tr->nr_topts; i++) { kfree(tr->topts[i].topts); } kfree(tr->topts); free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree_const(tr->system_names); kfree(tr->name); kfree(tr); return 0; } int trace_array_destroy(struct trace_array *this_tr) { struct trace_array *tr; if (!this_tr) return -EINVAL; guard(mutex)(&event_mutex); guard(mutex)(&trace_types_lock); /* Making sure trace array exists before destroying it. */ list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr == this_tr) return __remove_instance(tr); } return -ENODEV; } EXPORT_SYMBOL_GPL(trace_array_destroy); static int instance_rmdir(const char *name) { struct trace_array *tr; guard(mutex)(&event_mutex); guard(mutex)(&trace_types_lock); tr = trace_array_find(name); if (!tr) return -ENODEV; return __remove_instance(tr); } static __init void create_trace_instances(struct dentry *d_tracer) { struct trace_array *tr; trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, instance_mkdir, instance_rmdir); if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n")) return; guard(mutex)(&event_mutex); guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (!tr->name) continue; if (MEM_FAIL(trace_array_create_dir(tr) < 0, "Failed to create instance directory\n")) return; } } static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) { int cpu; trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer, tr, &show_traces_fops); trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer, tr, &set_tracer_fops); trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer, tr, &tracing_cpumask_fops); trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer, tr, &tracing_iter_fops); trace_create_file("trace", TRACE_MODE_WRITE, d_tracer, tr, &tracing_fops); trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer, tr, &tracing_pipe_fops); trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer, tr, &tracing_entries_fops); trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer, tr, &tracing_total_entries_fops); trace_create_file("free_buffer", 0200, d_tracer, tr, &tracing_free_buffer_fops); 
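	/*
	 * trace_marker and trace_marker_raw (created just below) let user
	 * space inject text and binary markers directly into this
	 * instance's ring buffer; both are write-only (0220).
	 */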
trace_create_file("trace_marker", 0220, d_tracer, tr, &tracing_mark_fops); tr->trace_marker_file = __find_event_file(tr, "ftrace", "print"); trace_create_file("trace_marker_raw", 0220, d_tracer, tr, &tracing_mark_raw_fops); trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr, &trace_clock_fops); trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer, tr, &rb_simple_fops); trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr, &trace_time_stamp_mode_fops); tr->buffer_percent = 50; trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer, tr, &buffer_percent_fops); trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer, tr, &buffer_subbuf_size_fops); create_trace_options_dir(tr); #ifdef CONFIG_TRACER_MAX_TRACE trace_create_maxlat_file(tr, d_tracer); #endif if (ftrace_create_function_files(tr, d_tracer)) MEM_FAIL(1, "Could not allocate function filter files"); if (tr->range_addr_start) { trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer, tr, &last_boot_fops); #ifdef CONFIG_TRACER_SNAPSHOT } else { trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer, tr, &snapshot_fops); #endif } trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer, tr, &tracing_err_log_fops); for_each_tracing_cpu(cpu) tracing_init_tracefs_percpu(tr, cpu); ftrace_init_tracefs(tr, d_tracer); } #ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; struct file_system_type *type; struct fs_context *fc; int ret; /* * To maintain backward compatibility for tools that mount * debugfs to get to the tracing facility, tracefs is automatically * mounted to the debugfs/tracing directory. */ type = get_fs_type("tracefs"); if (!type) return NULL; fc = fs_context_for_submount(type, mntpt); put_filesystem(type); if (IS_ERR(fc)) return ERR_CAST(fc); pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n"); ret = vfs_parse_fs_string(fc, "source", "tracefs"); if (!ret) mnt = fc_mount(fc); else mnt = ERR_PTR(ret); put_fs_context(fc); return mnt; } #endif /** * tracing_init_dentry - initialize top level trace array * * This is called when creating files or directories in the tracing * directory. It is called via fs_initcall() by any of the boot up code * and expects to return the dentry of the top level tracing directory. */ int tracing_init_dentry(void) { struct trace_array *tr = &global_trace; if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Tracing disabled due to lockdown\n"); return -EPERM; } /* The top level trace array uses NULL as parent */ if (tr->dir) return 0; if (WARN_ON(!tracefs_initialized())) return -ENODEV; #ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED /* * As there may still be users that expect the tracing * files to exist in debugfs/tracing, we must automount * the tracefs file system there, so older tools still * work with the newer kernel. 
*/ tr->dir = debugfs_create_automount("tracing", NULL, trace_automount, NULL); #endif return 0; } extern struct trace_eval_map *__start_ftrace_eval_maps[]; extern struct trace_eval_map *__stop_ftrace_eval_maps[]; static struct workqueue_struct *eval_map_wq __initdata; static struct work_struct eval_map_work __initdata; static struct work_struct tracerfs_init_work __initdata; static void __init eval_map_work_func(struct work_struct *work) { int len; len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; trace_event_update_with_eval_map(NULL, __start_ftrace_eval_maps, len); } static int __init trace_eval_init(void) { INIT_WORK(&eval_map_work, eval_map_work_func); eval_map_wq = alloc_workqueue("eval_map_wq", WQ_UNBOUND, 0); if (!eval_map_wq) { pr_err("Unable to allocate eval_map_wq\n"); /* Do work here */ eval_map_work_func(&eval_map_work); return -ENOMEM; } queue_work(eval_map_wq, &eval_map_work); return 0; } subsys_initcall(trace_eval_init); static int __init trace_eval_sync(void) { /* Make sure the eval map updates are finished */ if (eval_map_wq) destroy_workqueue(eval_map_wq); return 0; } late_initcall_sync(trace_eval_sync); #ifdef CONFIG_MODULES bool module_exists(const char *module) { /* All modules have the symbol __this_module */ static const char this_mod[] = "__this_module"; char modname[MODULE_NAME_LEN + sizeof(this_mod) + 2]; unsigned long val; int n; n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod); if (n > sizeof(modname) - 1) return false; val = module_kallsyms_lookup_name(modname); return val != 0; } static void trace_module_add_evals(struct module *mod) { /* * Modules with bad taint do not have events created, do * not bother with enums either. */ if (trace_module_has_bad_taint(mod)) return; /* Even if no trace_evals, this need to sanitize field types. 
*/ trace_event_update_with_eval_map(mod, mod->trace_evals, mod->num_trace_evals); } #ifdef CONFIG_TRACE_EVAL_MAP_FILE static void trace_module_remove_evals(struct module *mod) { union trace_eval_map_item *map; union trace_eval_map_item **last = &trace_eval_maps; if (!mod->num_trace_evals) return; guard(mutex)(&trace_eval_mutex); map = trace_eval_maps; while (map) { if (map->head.mod == mod) break; map = trace_eval_jmp_to_tail(map); last = &map->tail.next; map = map->tail.next; } if (!map) return; *last = trace_eval_jmp_to_tail(map)->tail.next; kfree(map); } #else static inline void trace_module_remove_evals(struct module *mod) { } #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ static void trace_module_record(struct module *mod, bool add) { struct trace_array *tr; unsigned long flags; list_for_each_entry(tr, &ftrace_trace_arrays, list) { flags = tr->flags & (TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT); /* Update any persistent trace array that has already been started */ if (flags == TRACE_ARRAY_FL_BOOT && add) { guard(mutex)(&scratch_mutex); save_mod(mod, tr); } else if (flags & TRACE_ARRAY_FL_LAST_BOOT) { /* Update delta if the module loaded in previous boot */ make_mod_delta(mod, tr); } } } static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; switch (val) { case MODULE_STATE_COMING: trace_module_add_evals(mod); trace_module_record(mod, true); break; case MODULE_STATE_GOING: trace_module_remove_evals(mod); trace_module_record(mod, false); break; } return NOTIFY_OK; } static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 0, }; #endif /* CONFIG_MODULES */ static __init void tracer_init_tracefs_work_func(struct work_struct *work) { event_trace_init(); init_tracer_tracefs(&global_trace, NULL); ftrace_init_tracefs_toplevel(&global_trace, NULL); trace_create_file("tracing_thresh", TRACE_MODE_WRITE, NULL, &global_trace, &tracing_thresh_fops); trace_create_file("README", TRACE_MODE_READ, NULL, NULL, &tracing_readme_fops); trace_create_file("saved_cmdlines", TRACE_MODE_READ, NULL, NULL, &tracing_saved_cmdlines_fops); trace_create_file("saved_cmdlines_size", TRACE_MODE_WRITE, NULL, NULL, &tracing_saved_cmdlines_size_fops); trace_create_file("saved_tgids", TRACE_MODE_READ, NULL, NULL, &tracing_saved_tgids_fops); trace_create_eval_file(NULL); #ifdef CONFIG_MODULES register_module_notifier(&trace_module_nb); #endif #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", TRACE_MODE_READ, NULL, NULL, &tracing_dyn_info_fops); #endif create_trace_instances(NULL); update_tracer_options(&global_trace); } static __init int tracer_init_tracefs(void) { int ret; trace_access_lock_init(); ret = tracing_init_dentry(); if (ret) return 0; if (eval_map_wq) { INIT_WORK(&tracerfs_init_work, tracer_init_tracefs_work_func); queue_work(eval_map_wq, &tracerfs_init_work); } else { tracer_init_tracefs_work_func(NULL); } rv_init_interface(); return 0; } fs_initcall(tracer_init_tracefs); static int trace_die_panic_handler(struct notifier_block *self, unsigned long ev, void *unused); static struct notifier_block trace_panic_notifier = { .notifier_call = trace_die_panic_handler, .priority = INT_MAX - 1, }; static struct notifier_block trace_die_notifier = { .notifier_call = trace_die_panic_handler, .priority = INT_MAX - 1, }; /* * The idea is to execute the following die/panic callback early, in order * to avoid showing irrelevant information in the trace (like other panic * notifier functions); we are 
the 2nd to run, after hung_task/rcu_stall * warnings get disabled (to prevent potential log flooding). */ static int trace_die_panic_handler(struct notifier_block *self, unsigned long ev, void *unused) { if (!ftrace_dump_on_oops_enabled()) return NOTIFY_DONE; /* The die notifier requires DIE_OOPS to trigger */ if (self == &trace_die_notifier && ev != DIE_OOPS) return NOTIFY_DONE; ftrace_dump(DUMP_PARAM); return NOTIFY_DONE; } /* * printk is set to max of 1024, we really don't need it that big. * Nothing should be printing 1000 characters anyway. */ #define TRACE_MAX_PRINT 1000 /* * Define here KERN_TRACE so that we have one place to modify * it if we decide to change what log level the ftrace dump * should be at. */ #define KERN_TRACE KERN_EMERG void trace_printk_seq(struct trace_seq *s) { /* Probably should print a warning here. */ if (s->seq.len >= TRACE_MAX_PRINT) s->seq.len = TRACE_MAX_PRINT; /* * More paranoid code. Although the buffer size is set to * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just * an extra layer of protection. */ if (WARN_ON_ONCE(s->seq.len >= s->seq.size)) s->seq.len = s->seq.size - 1; /* should be zero ended, but we are paranoid. */ s->buffer[s->seq.len] = 0; printk(KERN_TRACE "%s", s->buffer); trace_seq_init(s); } static void trace_init_iter(struct trace_iterator *iter, struct trace_array *tr) { iter->tr = tr; iter->trace = iter->tr->current_trace; iter->cpu_file = RING_BUFFER_ALL_CPUS; iter->array_buffer = &tr->array_buffer; if (iter->trace && iter->trace->open) iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ if (ring_buffer_overruns(iter->array_buffer->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ if (trace_clocks[iter->tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; /* Can not use kmalloc for iter.temp and iter.fmt */ iter->temp = static_temp_buf; iter->temp_size = STATIC_TEMP_BUF_SIZE; iter->fmt = static_fmt_buf; iter->fmt_size = STATIC_FMT_BUF_SIZE; } void trace_init_global_iter(struct trace_iterator *iter) { trace_init_iter(iter, &global_trace); } static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_mode) { /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; unsigned int old_userobj; unsigned long flags; int cnt = 0; /* * Always turn off tracing when we dump. * We don't need to show trace output of what happens * between multiple crashes. * * If the user does a sysrq-z, then they can re-enable * tracing with echo 1 > tracing_on. */ tracer_tracing_off(tr); local_irq_save(flags); /* Simulate the iterator */ trace_init_iter(&iter, tr); /* While dumping, do not allow the buffer to be enable */ tracer_tracing_disable(tr); old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ; /* don't look at user memory in panic mode */ tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; if (dump_mode == DUMP_ORIG) iter.cpu_file = raw_smp_processor_id(); else iter.cpu_file = RING_BUFFER_ALL_CPUS; if (tr == &global_trace) printk(KERN_TRACE "Dumping ftrace buffer:\n"); else printk(KERN_TRACE "Dumping ftrace instance %s buffer:\n", tr->name); /* Did function tracer already get disabled? */ if (ftrace_is_dead()) { printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); printk("# MAY BE MISSING FUNCTION EVENTS\n"); } /* * We need to stop all tracing on all CPUS to read * the next buffer. This is a bit expensive, but is * not done often. 
We fill all what we can read, * and then release the locks again. */ while (!trace_empty(&iter)) { if (!cnt) printk(KERN_TRACE "---------------------------------\n"); cnt++; trace_iterator_reset(&iter); iter.iter_flags |= TRACE_FILE_LAT_FMT; if (trace_find_next_entry_inc(&iter) != NULL) { int ret; ret = print_trace_line(&iter); if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(&iter); trace_printk_seq(&iter.seq); } touch_nmi_watchdog(); } if (!cnt) printk(KERN_TRACE " (ftrace buffer empty)\n"); else printk(KERN_TRACE "---------------------------------\n"); tr->trace_flags |= old_userobj; tracer_tracing_enable(tr); local_irq_restore(flags); } static void ftrace_dump_by_param(void) { bool first_param = true; char dump_param[MAX_TRACER_SIZE]; char *buf, *token, *inst_name; struct trace_array *tr; strscpy(dump_param, ftrace_dump_on_oops, MAX_TRACER_SIZE); buf = dump_param; while ((token = strsep(&buf, ",")) != NULL) { if (first_param) { first_param = false; if (!strcmp("0", token)) continue; else if (!strcmp("1", token)) { ftrace_dump_one(&global_trace, DUMP_ALL); continue; } else if (!strcmp("2", token) || !strcmp("orig_cpu", token)) { ftrace_dump_one(&global_trace, DUMP_ORIG); continue; } } inst_name = strsep(&token, "="); tr = trace_array_find(inst_name); if (!tr) { printk(KERN_TRACE "Instance %s not found\n", inst_name); continue; } if (token && (!strcmp("2", token) || !strcmp("orig_cpu", token))) ftrace_dump_one(tr, DUMP_ORIG); else ftrace_dump_one(tr, DUMP_ALL); } } void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { static atomic_t dump_running; /* Only allow one dump user at a time. */ if (atomic_inc_return(&dump_running) != 1) { atomic_dec(&dump_running); return; } switch (oops_dump_mode) { case DUMP_ALL: ftrace_dump_one(&global_trace, DUMP_ALL); break; case DUMP_ORIG: ftrace_dump_one(&global_trace, DUMP_ORIG); break; case DUMP_PARAM: ftrace_dump_by_param(); break; case DUMP_NONE: break; default: printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); ftrace_dump_one(&global_trace, DUMP_ALL); } atomic_dec(&dump_running); } EXPORT_SYMBOL_GPL(ftrace_dump); #define WRITE_BUFSIZE 4096 ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(const char *)) { char *kbuf __free(kfree) = NULL; char *buf, *tmp; int ret = 0; size_t done = 0; size_t size; kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); if (!kbuf) return -ENOMEM; while (done < count) { size = count - done; if (size >= WRITE_BUFSIZE) size = WRITE_BUFSIZE - 1; if (copy_from_user(kbuf, buffer + done, size)) return -EFAULT; kbuf[size] = '\0'; buf = kbuf; do { tmp = strchr(buf, '\n'); if (tmp) { *tmp = '\0'; size = tmp - buf + 1; } else { size = strlen(buf); if (done + size < count) { if (buf != kbuf) break; /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ pr_warn("Line length is too long: Should be less than %d\n", WRITE_BUFSIZE - 2); return -EINVAL; } } done += size; /* Remove comments */ tmp = strchr(buf, '#'); if (tmp) *tmp = '\0'; ret = createfn(buf); if (ret) return ret; buf += size; } while (done < count); } return done; } #ifdef CONFIG_TRACER_MAX_TRACE __init static bool tr_needs_alloc_snapshot(const char *name) { char *test; int len = strlen(name); bool ret; if (!boot_snapshot_index) return false; if (strncmp(name, boot_snapshot_info, len) == 0 && boot_snapshot_info[len] == '\t') return true; test = kmalloc(strlen(name) + 3, GFP_KERNEL); if (!test) return false; sprintf(test, "\t%s\t", name); ret = strstr(boot_snapshot_info, test) == NULL; 
kfree(test); return ret; } __init static void do_allocate_snapshot(const char *name) { if (!tr_needs_alloc_snapshot(name)) return; /* * When allocate_snapshot is set, the next call to * allocate_trace_buffers() (called by trace_array_get_by_name()) * will allocate the snapshot buffer. That will alse clear * this flag. */ allocate_snapshot = true; } #else static inline void do_allocate_snapshot(const char *name) { } #endif __init static void enable_instances(void) { struct trace_array *tr; bool memmap_area = false; char *curr_str; char *name; char *str; char *tok; /* A tab is always appended */ boot_instance_info[boot_instance_index - 1] = '\0'; str = boot_instance_info; while ((curr_str = strsep(&str, "\t"))) { phys_addr_t start = 0; phys_addr_t size = 0; unsigned long addr = 0; bool traceprintk = false; bool traceoff = false; char *flag_delim; char *addr_delim; char *rname __free(kfree) = NULL; tok = strsep(&curr_str, ","); flag_delim = strchr(tok, '^'); addr_delim = strchr(tok, '@'); if (addr_delim) *addr_delim++ = '\0'; if (flag_delim) *flag_delim++ = '\0'; name = tok; if (flag_delim) { char *flag; while ((flag = strsep(&flag_delim, "^"))) { if (strcmp(flag, "traceoff") == 0) { traceoff = true; } else if ((strcmp(flag, "printk") == 0) || (strcmp(flag, "traceprintk") == 0) || (strcmp(flag, "trace_printk") == 0)) { traceprintk = true; } else { pr_info("Tracing: Invalid instance flag '%s' for %s\n", flag, name); } } } tok = addr_delim; if (tok && isdigit(*tok)) { start = memparse(tok, &tok); if (!start) { pr_warn("Tracing: Invalid boot instance address for %s\n", name); continue; } if (*tok != ':') { pr_warn("Tracing: No size specified for instance %s\n", name); continue; } tok++; size = memparse(tok, &tok); if (!size) { pr_warn("Tracing: Invalid boot instance size for %s\n", name); continue; } memmap_area = true; } else if (tok) { if (!reserve_mem_find_by_name(tok, &start, &size)) { start = 0; pr_warn("Failed to map boot instance %s to %s\n", name, tok); continue; } rname = kstrdup(tok, GFP_KERNEL); } if (start) { /* Start and size must be page aligned */ if (start & ~PAGE_MASK) { pr_warn("Tracing: mapping start addr %pa is not page aligned\n", &start); continue; } if (size & ~PAGE_MASK) { pr_warn("Tracing: mapping size %pa is not page aligned\n", &size); continue; } if (memmap_area) addr = map_pages(start, size); else addr = (unsigned long)phys_to_virt(start); if (addr) { pr_info("Tracing: mapped boot instance %s at physical memory %pa of size 0x%lx\n", name, &start, (unsigned long)size); } else { pr_warn("Tracing: Failed to map boot instance %s\n", name); continue; } } else { /* Only non mapped buffers have snapshot buffers */ if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) do_allocate_snapshot(name); } tr = trace_array_create_systems(name, NULL, addr, size); if (IS_ERR(tr)) { pr_warn("Tracing: Failed to create instance buffer %s\n", curr_str); continue; } if (traceoff) tracer_tracing_off(tr); if (traceprintk) update_printk_trace(tr); /* * memmap'd buffers can not be freed. 
*/ if (memmap_area) { tr->flags |= TRACE_ARRAY_FL_MEMMAP; tr->ref++; } if (start) { tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT; tr->range_name = no_free_ptr(rname); } while ((tok = strsep(&curr_str, ","))) { early_enable_events(tr, tok, true); } } } __init static int tracer_alloc_buffers(void) { int ring_buf_size; int ret = -ENOMEM; if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Tracing disabled due to lockdown\n"); return -EPERM; } /* * Make sure we don't accidentally add more trace options * than we have bits for. */ BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE); if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) return -ENOMEM; if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; /* Only allocate trace_printk buffers if a trace_printk exists */ if (&__stop___trace_bprintk_fmt != &__start___trace_bprintk_fmt) /* Must be called before global_trace.buffer is allocated */ trace_printk_init_buffers(); /* To save memory, keep the ring buffer size to its minimum */ if (global_trace.ring_buffer_expanded) ring_buf_size = trace_buf_size; else ring_buf_size = 1; cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask); raw_spin_lock_init(&global_trace.start_lock); /* * The prepare callbacks allocates some memory for the ring buffer. We * don't free the buffer if the CPU goes down. If we were to free * the buffer, then the user would lose any trace that was in the * buffer. The memory will be removed once the "instance" is removed. */ ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE, "trace/RB:prepare", trace_rb_cpu_prepare, NULL); if (ret < 0) goto out_free_cpumask; /* Used for event triggers */ ret = -ENOMEM; temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); if (!temp_buffer) goto out_rm_hp_state; if (trace_create_savedcmd() < 0) goto out_free_temp_buffer; if (!zalloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL)) goto out_free_savedcmd; /* TODO: make the number of buffers hot pluggable with CPUS */ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n"); goto out_free_pipe_cpumask; } if (global_trace.buffer_disabled) tracing_off(); if (trace_boot_clock) { ret = tracing_set_clock(&global_trace, trace_boot_clock); if (ret < 0) pr_warn("Trace clock %s not defined, going back to default\n", trace_boot_clock); } /* * register_tracer() might reference current_trace, so it * needs to be set before we register anything. This is * just a bootstrap of current_trace anyway. 
*/ global_trace.current_trace = &nop_trace; global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; #ifdef CONFIG_TRACER_MAX_TRACE spin_lock_init(&global_trace.snapshot_trigger_lock); #endif ftrace_init_global_array_ops(&global_trace); #ifdef CONFIG_MODULES INIT_LIST_HEAD(&global_trace.mod_events); #endif init_trace_flags_index(&global_trace); register_tracer(&nop_trace); /* Function tracing may start here (via kernel command line) */ init_function_trace(); /* All seems OK, enable tracing */ tracing_disabled = 0; atomic_notifier_chain_register(&panic_notifier_list, &trace_panic_notifier); register_die_notifier(&trace_die_notifier); global_trace.flags = TRACE_ARRAY_FL_GLOBAL; INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); INIT_LIST_HEAD(&global_trace.hist_vars); INIT_LIST_HEAD(&global_trace.err_log); list_add(&global_trace.marker_list, &marker_copies); list_add(&global_trace.list, &ftrace_trace_arrays); apply_trace_boot_options(); register_snapshot_cmd(); return 0; out_free_pipe_cpumask: free_cpumask_var(global_trace.pipe_cpumask); out_free_savedcmd: trace_free_saved_cmdlines_buffer(); out_free_temp_buffer: ring_buffer_free(temp_buffer); out_rm_hp_state: cpuhp_remove_multi_state(CPUHP_TRACE_RB_PREPARE); out_free_cpumask: free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); return ret; } #ifdef CONFIG_FUNCTION_TRACER /* Used to set module cached ftrace filtering at boot up */ __init struct trace_array *trace_get_global_array(void) { return &global_trace; } #endif void __init ftrace_boot_snapshot(void) { #ifdef CONFIG_TRACER_MAX_TRACE struct trace_array *tr; if (!snapshot_at_boot) return; list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (!tr->allocated_snapshot) continue; tracing_snapshot_instance(tr); trace_array_puts(tr, "** Boot snapshot taken **\n"); } #endif } void __init early_trace_init(void) { if (tracepoint_printk) { tracepoint_print_iter = kzalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); if (MEM_FAIL(!tracepoint_print_iter, "Failed to allocate trace iterator\n")) tracepoint_printk = 0; else static_key_enable(&tracepoint_printk_key.key); } tracer_alloc_buffers(); init_events(); } void __init trace_init(void) { trace_event_init(); if (boot_instance_index) enable_instances(); } __init static void clear_boot_tracer(void) { /* * The default tracer at boot buffer is an init section. * This function is called in lateinit. If we did not * find the boot tracer, then clear it out, to prevent * later registration from accessing the buffer that is * about to be freed. 
*/ if (!default_bootup_tracer) return; printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n", default_bootup_tracer); default_bootup_tracer = NULL; } #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK __init static void tracing_set_default_clock(void) { /* sched_clock_stable() is determined in late_initcall */ if (!trace_boot_clock && !sched_clock_stable()) { if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Can not set tracing clock due to lockdown\n"); return; } printk(KERN_WARNING "Unstable clock detected, switching default tracing clock to \"global\"\n" "If you want to keep using the local clock, then add:\n" " \"trace_clock=local\"\n" "on the kernel command line\n"); tracing_set_clock(&global_trace, "global"); } } #else static inline void tracing_set_default_clock(void) { } #endif __init static int late_trace_init(void) { if (tracepoint_printk && tracepoint_printk_stop_on_boot) { static_key_disable(&tracepoint_printk_key.key); tracepoint_printk = 0; } if (traceoff_after_boot) tracing_off(); tracing_set_default_clock(); clear_boot_tracer(); return 0; } late_initcall_sync(late_trace_init);
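/*
 * A minimal sketch of how a loadable module might drive the instance API
 * exported above (trace_array_get_by_name() and trace_array_destroy()).
 * The instance name "example" and the example_* symbols are illustrative
 * only, and trace_array_put() is assumed to be the reference-dropping
 * counterpart declared together with these helpers in <linux/trace.h>.
 */
#include <linux/module.h>
#include <linux/trace.h>

static struct trace_array *example_tr;

static int __init example_instance_init(void)
{
	/*
	 * Illustrative only: look up or create the "example" instance.
	 * NULL means no restriction on which event systems are created.
	 */
	example_tr = trace_array_get_by_name("example", NULL);
	if (!example_tr)
		return -ENOMEM;

	return 0;
}

static void __exit example_instance_exit(void)
{
	/*
	 * __remove_instance() refuses to delete an instance whose reference
	 * count is still elevated, so drop the reference taken by
	 * trace_array_get_by_name() before asking for destruction.
	 */
	trace_array_put(example_tr);
	trace_array_destroy(example_tr);
}

module_init(example_instance_init);
module_exit(example_instance_exit);
MODULE_LICENSE("GPL");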
// SPDX-License-Identifier: GPL-2.0-only // Copyright (C) 2022 Linutronix GmbH, John Ogness // Copyright (C) 2022 Intel, Thomas Gleixner #include <linux/atomic.h> #include <linux/bug.h> #include <linux/console.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/init.h> #include <linux/irqflags.h> #include <linux/kdb.h> #include <linux/kthread.h> #include <linux/minmax.h> #include <linux/panic.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <linux/slab.h> #include <linux/smp.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include "internal.h" #include "printk_ringbuffer.h" /* * Printk console printing implementation for consoles which do not depend * on the legacy style console_lock mechanism. * * The state of the console is maintained in the "nbcon_state" atomic * variable. * * The console is locked when: * * - The 'prio' field contains the priority of the context that owns the * console. Only higher priority contexts are allowed to take over the * lock. A value of 0 (NBCON_PRIO_NONE) means the console is not locked. * * - The 'cpu' field denotes on which CPU the console is locked. It is used * to prevent busy waiting on the same CPU. Also it informs the lock owner * that it has lost the lock in a more complex scenario when the lock was * taken over by a higher priority context, released, and taken on another * CPU with the same priority as the interrupted owner. * * The acquire mechanism uses a few more fields: * * - The 'req_prio' field is used by the handover approach to make the * current owner aware that there is a context with a higher priority * waiting for the friendly handover. * * - The 'unsafe' field allows to take over the console in a safe way in the * middle of emitting a message.
The field is set only when accessing some * shared resources or when the console device is manipulated. It can be * cleared, for example, after emitting one character when the console * device is in a consistent state. * * - The 'unsafe_takeover' field is set when a hostile takeover took the * console in an unsafe state. The console will stay in the unsafe state * until re-initialized. * * The acquire mechanism uses three approaches: * * 1) Direct acquire when the console is not owned or is owned by a lower * priority context and is in a safe state. * * 2) Friendly handover mechanism uses a request/grant handshake. It is used * when the current owner has lower priority and the console is in an * unsafe state. * * The requesting context: * * a) Sets its priority into the 'req_prio' field. * * b) Waits (with a timeout) for the owning context to unlock the * console. * * c) Takes the lock and clears the 'req_prio' field. * * The owning context: * * a) Observes the 'req_prio' field set on exit from the unsafe * console state. * * b) Gives up console ownership by clearing the 'prio' field. * * 3) Unsafe hostile takeover allows to take over the lock even when the * console is an unsafe state. It is used only in panic() by the final * attempt to flush consoles in a try and hope mode. * * Note that separate record buffers are used in panic(). As a result, * the messages can be read and formatted without any risk even after * using the hostile takeover in unsafe state. * * The release function simply clears the 'prio' field. * * All operations on @console::nbcon_state are atomic cmpxchg based to * handle concurrency. * * The acquire/release functions implement only minimal policies: * * - Preference for higher priority contexts. * - Protection of the panic CPU. * * All other policy decisions must be made at the call sites: * * - What is marked as an unsafe section. * - Whether to spin-wait if there is already an owner and the console is * in an unsafe state. * - Whether to attempt an unsafe hostile takeover. * * The design allows to implement the well known: * * acquire() * output_one_printk_record() * release() * * The output of one printk record might be interrupted with a higher priority * context. The new owner is supposed to reprint the entire interrupted record * from scratch. */ /* Counter of active nbcon emergency contexts. */ static atomic_t nbcon_cpu_emergency_cnt = ATOMIC_INIT(0); /** * nbcon_state_set - Helper function to set the console state * @con: Console to update * @new: The new state to write * * Only to be used when the console is not yet or no longer visible in the * system. Otherwise use nbcon_state_try_cmpxchg(). */ static inline void nbcon_state_set(struct console *con, struct nbcon_state *new) { atomic_set(&ACCESS_PRIVATE(con, nbcon_state), new->atom); } /** * nbcon_state_read - Helper function to read the console state * @con: Console to read * @state: The state to store the result */ static inline void nbcon_state_read(struct console *con, struct nbcon_state *state) { state->atom = atomic_read(&ACCESS_PRIVATE(con, nbcon_state)); } /** * nbcon_state_try_cmpxchg() - Helper function for atomic_try_cmpxchg() on console state * @con: Console to update * @cur: Old/expected state * @new: New state * * Return: True on success. False on fail and @cur is updated. 
*/ static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_state *cur, struct nbcon_state *new) { return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom); } /** * nbcon_seq_read - Read the current console sequence * @con: Console to read the sequence of * * Return: Sequence number of the next record to print on @con. */ u64 nbcon_seq_read(struct console *con) { unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq)); return __ulseq_to_u64seq(prb, nbcon_seq); } /** * nbcon_seq_force - Force console sequence to a specific value * @con: Console to work on * @seq: Sequence number value to set * * Only to be used during init (before registration) or in extreme situations * (such as panic with CONSOLE_REPLAY_ALL). */ void nbcon_seq_force(struct console *con, u64 seq) { /* * If the specified record no longer exists, the oldest available record * is chosen. This is especially important on 32bit systems because only * the lower 32 bits of the sequence number are stored. The upper 32 bits * are derived from the sequence numbers available in the ringbuffer. */ u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb)); atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq)); } /** * nbcon_seq_try_update - Try to update the console sequence number * @ctxt: Pointer to an acquire context that contains * all information about the acquire mode * @new_seq: The new sequence number to set * * @ctxt->seq is updated to the new value of @con::nbcon_seq (expanded to * the 64bit value). This could be a different value than @new_seq if * nbcon_seq_force() was used or the current context no longer owns the * console. In the later case, it will stop printing anyway. */ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) { unsigned long nbcon_seq = __u64seq_to_ulseq(ctxt->seq); struct console *con = ctxt->console; if (atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq, __u64seq_to_ulseq(new_seq))) { ctxt->seq = new_seq; } else { ctxt->seq = nbcon_seq_read(con); } } /** * nbcon_context_try_acquire_direct - Try to acquire directly * @ctxt: The context of the caller * @cur: The current console state * @is_reacquire: This acquire is a reacquire * * Acquire the console when it is released. Also acquire the console when * the current owner has a lower priority and the console is in a safe state. * * Return: 0 on success. Otherwise, an error code on failure. Also @cur * is updated to the latest state when failed to modify it. * * Errors: * * -EPERM: A panic is in progress and this is neither the panic * CPU nor is this a reacquire. Or the current owner or * waiter has the same or higher priority. No acquire * method can be successful in these cases. * * -EBUSY: The current owner has a lower priority but the console * in an unsafe state. The caller should try using * the handover acquire method. */ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, struct nbcon_state *cur, bool is_reacquire) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state new; do { /* * Panic does not imply that the console is owned. However, * since all non-panic CPUs are stopped during panic(), it * is safer to have them avoid gaining console ownership. * * One exception is when kdb has locked for printing on this CPU. 
* * Second exception is a reacquire (and an unsafe takeover * has not previously occurred) then it is allowed to attempt * a direct acquire in panic. This gives console drivers an * opportunity to perform any necessary cleanup if they were * interrupted by the panic CPU while printing. */ if (panic_on_other_cpu() && !kdb_printf_on_this_cpu() && (!is_reacquire || cur->unsafe_takeover)) { return -EPERM; } if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio) return -EPERM; if (cur->unsafe) return -EBUSY; /* * The console should never be safe for a direct acquire * if an unsafe hostile takeover has ever happened. */ WARN_ON_ONCE(cur->unsafe_takeover); new.atom = cur->atom; new.prio = ctxt->prio; new.req_prio = NBCON_PRIO_NONE; new.unsafe = cur->unsafe_takeover; new.cpu = cpu; } while (!nbcon_state_try_cmpxchg(con, cur, &new)); return 0; } static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio) { /* * The request context is well defined by the @req_prio because: * * - Only a context with a priority higher than the owner can become * a waiter. * - Only a context with a priority higher than the waiter can * directly take over the request. * - There are only three priorities. * - Only one CPU is allowed to request PANIC priority. * - Lower priorities are ignored during panic() until reboot. * * As a result, the following scenario is *not* possible: * * 1. This context is currently a waiter. * 2. Another context with a higher priority than this context * directly takes ownership. * 3. The higher priority context releases the ownership. * 4. Another lower priority context takes the ownership. * 5. Another context with the same priority as this context * creates a request and starts waiting. * * Event #1 implies this context is EMERGENCY. * Event #2 implies the new context is PANIC. * Event #3 occurs when panic() has flushed the console. * Event #4 occurs when a non-panic CPU reacquires. * Event #5 is not possible due to the panic_on_other_cpu() check * in nbcon_context_try_acquire_handover(). */ return (cur->req_prio == expected_prio); } /** * nbcon_context_try_acquire_requested - Try to acquire after having * requested a handover * @ctxt: The context of the caller * @cur: The current console state * * This is a helper function for nbcon_context_try_acquire_handover(). * It is called when the console is in an unsafe state. The current * owner will release the console on exit from the unsafe region. * * Return: 0 on success and @cur is updated to the new console state. * Otherwise an error code on failure. * * Errors: * * -EPERM: A panic is in progress and this is not the panic CPU * or this context is no longer the waiter. * * -EBUSY: The console is still locked. The caller should * continue waiting. * * Note: The caller must still remove the request when an error has occurred * except when this context is no longer the waiter. */ static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt, struct nbcon_state *cur) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state new; /* Note that the caller must still remove the request! */ if (panic_on_other_cpu()) return -EPERM; /* * Note that the waiter will also change if there was an unsafe * hostile takeover. */ if (!nbcon_waiter_matches(cur, ctxt->prio)) return -EPERM; /* If still locked, caller should continue waiting. */ if (cur->prio != NBCON_PRIO_NONE) return -EBUSY; /* * The previous owner should have never released ownership * in an unsafe region. 
*/ WARN_ON_ONCE(cur->unsafe); new.atom = cur->atom; new.prio = ctxt->prio; new.req_prio = NBCON_PRIO_NONE; new.unsafe = cur->unsafe_takeover; new.cpu = cpu; if (!nbcon_state_try_cmpxchg(con, cur, &new)) { /* * The acquire could fail only when it has been taken * over by a higher priority context. */ WARN_ON_ONCE(nbcon_waiter_matches(cur, ctxt->prio)); return -EPERM; } /* Handover success. This context now owns the console. */ return 0; } /** * nbcon_context_try_acquire_handover - Try to acquire via handover * @ctxt: The context of the caller * @cur: The current console state * * The function must be called only when the context has higher priority * than the current owner and the console is in an unsafe state. * It is the case when nbcon_context_try_acquire_direct() returns -EBUSY. * * The function sets "req_prio" field to make the current owner aware of * the request. Then it waits until the current owner releases the console, * or an even higher context takes over the request, or timeout expires. * * The current owner checks the "req_prio" field on exit from the unsafe * region and releases the console. It does not touch the "req_prio" field * so that the console stays reserved for the waiter. * * Return: 0 on success. Otherwise, an error code on failure. Also @cur * is updated to the latest state when failed to modify it. * * Errors: * * -EPERM: A panic is in progress and this is not the panic CPU. * Or a higher priority context has taken over the * console or the handover request. * * -EBUSY: The current owner is on the same CPU so that the hand * shake could not work. Or the current owner is not * willing to wait (zero timeout). Or the console does * not enter the safe state before timeout passed. The * caller might still use the unsafe hostile takeover * when allowed. * * -EAGAIN: @cur has changed when creating the handover request. * The caller should retry with direct acquire. */ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt, struct nbcon_state *cur) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state new; int timeout; int request_err = -EBUSY; /* * Check that the handover is called when the direct acquire failed * with -EBUSY. */ WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio); WARN_ON_ONCE(!cur->unsafe); /* * Panic does not imply that the console is owned. However, it * is critical that non-panic CPUs during panic are unable to * wait for a handover in order to satisfy the assumptions of * nbcon_waiter_matches(). In particular, the assumption that * lower priorities are ignored during panic. */ if (panic_on_other_cpu()) return -EPERM; /* Handover is not possible on the same CPU. */ if (cur->cpu == cpu) return -EBUSY; /* * Console stays unsafe after an unsafe takeover until re-initialized. * Waiting is not going to help in this case. */ if (cur->unsafe_takeover) return -EBUSY; /* Is the caller willing to wait? */ if (ctxt->spinwait_max_us == 0) return -EBUSY; /* * Setup a request for the handover. The caller should try to acquire * the console directly when the current state has been modified. */ new.atom = cur->atom; new.req_prio = ctxt->prio; if (!nbcon_state_try_cmpxchg(con, cur, &new)) return -EAGAIN; cur->atom = new.atom; /* Wait until there is no owner and then acquire the console. */ for (timeout = ctxt->spinwait_max_us; timeout >= 0; timeout--) { /* On successful acquire, this request is cleared. 
*/ request_err = nbcon_context_try_acquire_requested(ctxt, cur); if (!request_err) return 0; /* * If the acquire should be aborted, it must be ensured * that the request is removed before returning to caller. */ if (request_err == -EPERM) break; udelay(1); /* Re-read the state because some time has passed. */ nbcon_state_read(con, cur); } /* Timed out or aborted. Carefully remove handover request. */ do { /* * No need to remove request if there is a new waiter. This * can only happen if a higher priority context has taken over * the console or the handover request. */ if (!nbcon_waiter_matches(cur, ctxt->prio)) return -EPERM; /* Unset request for handover. */ new.atom = cur->atom; new.req_prio = NBCON_PRIO_NONE; if (nbcon_state_try_cmpxchg(con, cur, &new)) { /* * Request successfully unset. Report failure of * acquiring via handover. */ cur->atom = new.atom; return request_err; } /* * Unable to remove request. Try to acquire in case * the owner has released the lock. */ } while (nbcon_context_try_acquire_requested(ctxt, cur)); /* Lucky timing. The acquire succeeded while removing the request. */ return 0; } /** * nbcon_context_try_acquire_hostile - Acquire via unsafe hostile takeover * @ctxt: The context of the caller * @cur: The current console state * * Acquire the console even in the unsafe state. * * It can be permitted by setting the 'allow_unsafe_takeover' field only * by the final attempt to flush messages in panic(). * * Return: 0 on success. -EPERM when not allowed by the context. */ static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt, struct nbcon_state *cur) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state new; if (!ctxt->allow_unsafe_takeover) return -EPERM; /* Ensure caller is allowed to perform unsafe hostile takeovers. */ if (WARN_ON_ONCE(ctxt->prio != NBCON_PRIO_PANIC)) return -EPERM; /* * Check that try_acquire_direct() and try_acquire_handover() returned * -EBUSY in the right situation. */ WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio); WARN_ON_ONCE(cur->unsafe != true); do { new.atom = cur->atom; new.cpu = cpu; new.prio = ctxt->prio; new.unsafe |= cur->unsafe_takeover; new.unsafe_takeover |= cur->unsafe; } while (!nbcon_state_try_cmpxchg(con, cur, &new)); return 0; } static struct printk_buffers panic_nbcon_pbufs; /** * nbcon_context_try_acquire - Try to acquire nbcon console * @ctxt: The context of the caller * @is_reacquire: This acquire is a reacquire * * Context: Under @ctxt->con->device_lock() or local_irq_save(). * Return: True if the console was acquired. False otherwise. * * If the caller allowed an unsafe hostile takeover, on success the * caller should check the current console state to see if it is * in an unsafe state. Otherwise, on success the caller may assume * the console is not in an unsafe state. */ static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire) { struct console *con = ctxt->console; struct nbcon_state cur; int err; nbcon_state_read(con, &cur); try_again: err = nbcon_context_try_acquire_direct(ctxt, &cur, is_reacquire); if (err != -EBUSY) goto out; err = nbcon_context_try_acquire_handover(ctxt, &cur); if (err == -EAGAIN) goto try_again; if (err != -EBUSY) goto out; err = nbcon_context_try_acquire_hostile(ctxt, &cur); out: if (err) return false; /* Acquire succeeded. */ /* Assign the appropriate buffer for this context. 
*/ if (panic_on_this_cpu()) ctxt->pbufs = &panic_nbcon_pbufs; else ctxt->pbufs = con->pbufs; /* Set the record sequence for this context to print. */ ctxt->seq = nbcon_seq_read(ctxt->console); return true; } static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu, int expected_prio) { /* * A similar function, nbcon_waiter_matches(), only deals with * EMERGENCY and PANIC priorities. However, this function must also * deal with the NORMAL priority, which requires additional checks * and constraints. * * For the case where preemption and interrupts are disabled, it is * enough to also verify that the owning CPU has not changed. * * For the case where preemption or interrupts are enabled, an * external synchronization method *must* be used. In particular, * the driver-specific locking mechanism used in device_lock() * (including disabling migration) should be used. It prevents * scenarios such as: * * 1. [Task A] owns a context with NBCON_PRIO_NORMAL on [CPU X] and * is scheduled out. * 2. Another context takes over the lock with NBCON_PRIO_EMERGENCY * and releases it. * 3. [Task B] acquires a context with NBCON_PRIO_NORMAL on [CPU X] * and is scheduled out. * 4. [Task A] gets running on [CPU X] and sees that the console is * still owned by a task on [CPU X] with NBON_PRIO_NORMAL. Thus * [Task A] thinks it is the owner when it is not. */ if (cur->prio != expected_prio) return false; if (cur->cpu != expected_cpu) return false; return true; } /** * nbcon_context_release - Release the console * @ctxt: The nbcon context from nbcon_context_try_acquire() */ static void nbcon_context_release(struct nbcon_context *ctxt) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state cur; struct nbcon_state new; nbcon_state_read(con, &cur); do { if (!nbcon_owner_matches(&cur, cpu, ctxt->prio)) break; new.atom = cur.atom; new.prio = NBCON_PRIO_NONE; /* * If @unsafe_takeover is set, it is kept set so that * the state remains permanently unsafe. */ new.unsafe |= cur.unsafe_takeover; } while (!nbcon_state_try_cmpxchg(con, &cur, &new)); ctxt->pbufs = NULL; } /** * nbcon_context_can_proceed - Check whether ownership can proceed * @ctxt: The nbcon context from nbcon_context_try_acquire() * @cur: The current console state * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * Must be invoked when entering the unsafe state to make sure that it still * owns the lock. Also must be invoked when exiting the unsafe context * to eventually free the lock for a higher priority context which asked * for the friendly handover. * * It can be called inside an unsafe section when the console is just * temporary in safe state instead of exiting and entering the unsafe * state. * * Also it can be called in the safe context before doing an expensive * safe operation. It does not make sense to do the operation when * a higher priority context took the lock. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. */ static bool nbcon_context_can_proceed(struct nbcon_context *ctxt, struct nbcon_state *cur) { unsigned int cpu = smp_processor_id(); /* Make sure this context still owns the console. 
*/ if (!nbcon_owner_matches(cur, cpu, ctxt->prio)) return false; /* The console owner can proceed if there is no waiter. */ if (cur->req_prio == NBCON_PRIO_NONE) return true; /* * A console owner within an unsafe region is always allowed to * proceed, even if there are waiters. It can perform a handover * when exiting the unsafe region. Otherwise the waiter will * need to perform an unsafe hostile takeover. */ if (cur->unsafe) return true; /* Waiters always have higher priorities than owners. */ WARN_ON_ONCE(cur->req_prio <= cur->prio); /* * Having a safe point for take over and eventually a few * duplicated characters or a full line is way better than a * hostile takeover. Post processing can take care of the garbage. * Release and hand over. */ nbcon_context_release(ctxt); /* * It is not clear whether the waiter really took over ownership. The * outermost callsite must make the final decision whether console * ownership is needed for it to proceed. If yes, it must reacquire * ownership (possibly hostile) before carefully proceeding. * * The calling context no longer owns the console so go back all the * way instead of trying to implement reacquire heuristics in tons of * places. */ return false; } /** * nbcon_can_proceed - Check whether ownership can proceed * @wctxt: The write context that was handed to the write function * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * It is used in nbcon_enter_unsafe() to make sure that it still owns the * lock. Also it is used in nbcon_exit_unsafe() to eventually free the lock * for a higher priority context which asked for the friendly handover. * * It can be called inside an unsafe section when the console is just * temporary in safe state instead of exiting and entering the unsafe state. * * Also it can be called in the safe context before doing an expensive safe * operation. It does not make sense to do the operation when a higher * priority context took the lock. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. */ bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; struct nbcon_state cur; nbcon_state_read(con, &cur); return nbcon_context_can_proceed(ctxt, &cur); } EXPORT_SYMBOL_GPL(nbcon_can_proceed); #define nbcon_context_enter_unsafe(c) __nbcon_context_update_unsafe(c, true) #define nbcon_context_exit_unsafe(c) __nbcon_context_update_unsafe(c, false) /** * __nbcon_context_update_unsafe - Update the unsafe bit in @con->nbcon_state * @ctxt: The nbcon context from nbcon_context_try_acquire() * @unsafe: The new value for the unsafe bit * * Return: True if the unsafe state was updated and this context still * owns the console. Otherwise false if ownership was handed * over or taken. * * This function allows console owners to modify the unsafe status of the * console. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. * * Internal helper to avoid duplicated code. 
*/ static bool __nbcon_context_update_unsafe(struct nbcon_context *ctxt, bool unsafe) { struct console *con = ctxt->console; struct nbcon_state cur; struct nbcon_state new; nbcon_state_read(con, &cur); do { /* * The unsafe bit must not be cleared if an * unsafe hostile takeover has occurred. */ if (!unsafe && cur.unsafe_takeover) goto out; if (!nbcon_context_can_proceed(ctxt, &cur)) return false; new.atom = cur.atom; new.unsafe = unsafe; } while (!nbcon_state_try_cmpxchg(con, &cur, &new)); cur.atom = new.atom; out: return nbcon_context_can_proceed(ctxt, &cur); } void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, char *buf, unsigned int len) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; struct nbcon_state cur; wctxt->outbuf = buf; wctxt->len = len; nbcon_state_read(con, &cur); wctxt->unsafe_takeover = cur.unsafe_takeover; } /** * nbcon_enter_unsafe - Enter an unsafe region in the driver * @wctxt: The write context that was handed to the write function * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. */ bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); bool is_owner; is_owner = nbcon_context_enter_unsafe(ctxt); if (!is_owner) nbcon_write_context_set_buf(wctxt, NULL, 0); return is_owner; } EXPORT_SYMBOL_GPL(nbcon_enter_unsafe); /** * nbcon_exit_unsafe - Exit an unsafe region in the driver * @wctxt: The write context that was handed to the write function * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. */ bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); bool ret; ret = nbcon_context_exit_unsafe(ctxt); if (!ret) nbcon_write_context_set_buf(wctxt, NULL, 0); return ret; } EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); /** * nbcon_reacquire_nobuf - Reacquire a console after losing ownership * while printing * @wctxt: The write context that was handed to the write callback * * Since ownership can be lost at any time due to handover or takeover, a * printing context _must_ be prepared to back out immediately and * carefully. However, there are scenarios where the printing context must * reacquire ownership in order to finalize or revert hardware changes. * * This function allows a printing context to reacquire ownership using the * same priority as its previous ownership. * * Note that after a successful reacquire the printing context will have no * output buffer because that has been lost. This function cannot be used to * resume printing. 
*/ void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); while (!nbcon_context_try_acquire(ctxt, true)) cpu_relax(); nbcon_write_context_set_buf(wctxt, NULL, 0); } EXPORT_SYMBOL_GPL(nbcon_reacquire_nobuf); /** * nbcon_emit_next_record - Emit a record in the acquired context * @wctxt: The write context that will be handed to the write function * @use_atomic: True if the write_atomic() callback is to be used * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. If the caller * wants to do more it must reacquire the console first. * * When true is returned, @wctxt->ctxt.backlog indicates whether there are * still records pending in the ringbuffer. */ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_atomic) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED; struct printk_message pmsg = { .pbufs = ctxt->pbufs, }; unsigned long con_dropped; struct nbcon_state cur; unsigned long dropped; unsigned long ulseq; /* * This function should never be called for consoles that have not * implemented the necessary callback for writing: i.e. legacy * consoles and, when atomic, nbcon consoles with no write_atomic(). * Handle it as if ownership was lost and try to continue. * * Note that for nbcon consoles the write_thread() callback is * mandatory and was already checked in nbcon_alloc(). */ if (WARN_ON_ONCE((use_atomic && !con->write_atomic) || !(console_srcu_read_flags(con) & CON_NBCON))) { nbcon_context_release(ctxt); return false; } /* * The printk buffers are filled within an unsafe section. This * prevents NBCON_PRIO_NORMAL and NBCON_PRIO_EMERGENCY from * clobbering each other. */ if (!nbcon_context_enter_unsafe(ctxt)) return false; ctxt->backlog = printk_get_next_message(&pmsg, ctxt->seq, is_extended, true); if (!ctxt->backlog) return nbcon_context_exit_unsafe(ctxt); /* * @con->dropped is not protected in case of an unsafe hostile * takeover. In that situation the update can be racy so * annotate it accordingly. */ con_dropped = data_race(READ_ONCE(con->dropped)); dropped = con_dropped + pmsg.dropped; if (dropped && !is_extended) console_prepend_dropped(&pmsg, dropped); /* * If the previous owner was assigned the same record, this context * has taken over ownership and is replaying the record. Prepend a * message to let the user know the record is replayed. */ ulseq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_prev_seq)); if (__ulseq_to_u64seq(prb, ulseq) == pmsg.seq) { console_prepend_replay(&pmsg); } else { /* * Ensure this context is still the owner before trying to * update @nbcon_prev_seq. Otherwise the value in @ulseq may * not be from the previous owner and instead be some later * value from the context that took over ownership. */ nbcon_state_read(con, &cur); if (!nbcon_context_can_proceed(ctxt, &cur)) return false; atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_prev_seq), &ulseq, __u64seq_to_ulseq(pmsg.seq)); } if (!nbcon_context_exit_unsafe(ctxt)) return false; /* For skipped records just update seq/dropped in @con.
*/ if (pmsg.outbuf_len == 0) goto update_con; /* Initialize the write context for driver callbacks. */ nbcon_write_context_set_buf(wctxt, &pmsg.pbufs->outbuf[0], pmsg.outbuf_len); if (use_atomic) con->write_atomic(con, wctxt); else con->write_thread(con, wctxt); if (!wctxt->outbuf) { /* * Ownership was lost and reacquired by the driver. Handle it * as if ownership was lost. */ nbcon_context_release(ctxt); return false; } /* * Ownership may have been lost but _not_ reacquired by the driver. * This case is detected and handled when entering unsafe to update * dropped/seq values. */ /* * Since any dropped message was successfully output, reset the * dropped count for the console. */ dropped = 0; update_con: /* * The dropped count and the sequence number are updated within an * unsafe section. This limits update races to the panic context and * allows the panic context to win. */ if (!nbcon_context_enter_unsafe(ctxt)) return false; if (dropped != con_dropped) { /* Counterpart to the READ_ONCE() above. */ WRITE_ONCE(con->dropped, dropped); } nbcon_seq_try_update(ctxt, pmsg.seq + 1); return nbcon_context_exit_unsafe(ctxt); } /* * nbcon_emit_one - Print one record for an nbcon console using the * specified callback * @wctxt: An initialized write context struct to use for this context * @use_atomic: True if the write_atomic() callback is to be used * * Return: True, when a record has been printed and there are still * pending records. The caller might want to continue flushing. * * False, when there is no pending record, or when the console * context cannot be acquired, or the ownership has been lost. * The caller should give up. Either the job is done, cannot be * done, or will be handled by the owning context. * * This is an internal helper to handle the locking of the console before * calling nbcon_emit_next_record(). */ static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; unsigned long flags; bool ret = false; if (!use_atomic) { con->device_lock(con, &flags); /* * Ensure this stays on the CPU to make handover and * takeover possible. */ cant_migrate(); } if (!nbcon_context_try_acquire(ctxt, false)) goto out; /* * nbcon_emit_next_record() returns false when the console was * handed over or taken over. In both cases the context is no * longer valid. * * The higher priority printing context takes over responsibility * to print the pending records. */ if (!nbcon_emit_next_record(wctxt, use_atomic)) goto out; nbcon_context_release(ctxt); ret = ctxt->backlog; out: if (!use_atomic) con->device_unlock(con, flags); return ret; } /** * nbcon_kthread_should_wakeup - Check whether a printer thread should wakeup * @con: Console to operate on * @ctxt: The nbcon context from nbcon_context_try_acquire() * * Return: True if the thread should shutdown or if the console is * allowed to print and a record is available. False otherwise. * * After the thread wakes up, it must first check if it should shutdown before * attempting any printing. */ static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_context *ctxt) { bool ret = false; short flags; int cookie; if (kthread_should_stop()) return true; /* * Block the kthread when the system is in an emergency or panic mode. * It increases the chance that these contexts would be able to show * the messages directly. 
And it reduces the risk of interrupted writes * where the context with a higher priority takes over the nbcon console * ownership in the middle of a message. */ if (unlikely(atomic_read(&nbcon_cpu_emergency_cnt)) || unlikely(panic_in_progress())) return false; cookie = console_srcu_read_lock(); flags = console_srcu_read_flags(con); if (console_is_usable(con, flags, false)) { /* Bring the sequence in @ctxt up to date */ ctxt->seq = nbcon_seq_read(con); ret = prb_read_valid(prb, ctxt->seq, NULL); } console_srcu_read_unlock(cookie); return ret; } /** * nbcon_kthread_func - The printer thread function * @__console: Console to operate on * * Return: 0 */ static int nbcon_kthread_func(void *__console) { struct console *con = __console; struct nbcon_write_context wctxt = { .ctxt.console = con, .ctxt.prio = NBCON_PRIO_NORMAL, }; struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); short con_flags; bool backlog; int cookie; wait_for_event: /* * Guarantee this task is visible on the rcuwait before * checking the wake condition. * * The full memory barrier within set_current_state() of * ___rcuwait_wait_event() pairs with the full memory * barrier within rcuwait_has_sleeper(). * * This pairs with rcuwait_has_sleeper:A and nbcon_kthread_wake:A. */ rcuwait_wait_event(&con->rcuwait, nbcon_kthread_should_wakeup(con, ctxt), TASK_INTERRUPTIBLE); /* LMM(nbcon_kthread_func:A) */ do { if (kthread_should_stop()) return 0; /* * Block the kthread when the system is in an emergency or panic * mode. See nbcon_kthread_should_wakeup() for more details. */ if (unlikely(atomic_read(&nbcon_cpu_emergency_cnt)) || unlikely(panic_in_progress())) goto wait_for_event; backlog = false; /* * Keep the srcu read lock around the entire operation so that * synchronize_srcu() can guarantee that the kthread stopped * or suspended printing. */ cookie = console_srcu_read_lock(); con_flags = console_srcu_read_flags(con); if (console_is_usable(con, con_flags, false)) backlog = nbcon_emit_one(&wctxt, false); console_srcu_read_unlock(cookie); cond_resched(); } while (backlog); goto wait_for_event; } /** * nbcon_irq_work - irq work to wake console printer thread * @irq_work: The irq work to operate on */ static void nbcon_irq_work(struct irq_work *irq_work) { struct console *con = container_of(irq_work, struct console, irq_work); nbcon_kthread_wake(con); } static inline bool rcuwait_has_sleeper(struct rcuwait *w) { /* * Guarantee any new records can be seen by tasks preparing to wait * before this context checks if the rcuwait is empty. * * This full memory barrier pairs with the full memory barrier within * set_current_state() of ___rcuwait_wait_event(), which is called * after prepare_to_rcuwait() adds the waiter but before it has * checked the wait condition. * * This pairs with nbcon_kthread_func:A. */ smp_mb(); /* LMM(rcuwait_has_sleeper:A) */ return rcuwait_active(w); } /** * nbcon_kthreads_wake - Wake up printing threads using irq_work */ void nbcon_kthreads_wake(void) { struct console *con; int cookie; if (!printk_kthreads_running) return; /* * It is not allowed to call this function when console irq_work * is blocked. */ if (WARN_ON_ONCE(console_irqwork_blocked)) return; cookie = console_srcu_read_lock(); for_each_console_srcu(con) { if (!(console_srcu_read_flags(con) & CON_NBCON)) continue; /* * Only schedule irq_work if the printing thread is * actively waiting. If not waiting, the thread will * notice by itself that it has work to do. 
*/ if (rcuwait_has_sleeper(&con->rcuwait)) irq_work_queue(&con->irq_work); } console_srcu_read_unlock(cookie); } /* * nbcon_kthread_stop - Stop a console printer thread * @con: Console to operate on */ void nbcon_kthread_stop(struct console *con) { lockdep_assert_console_list_lock_held(); if (!con->kthread) return; kthread_stop(con->kthread); con->kthread = NULL; } /** * nbcon_kthread_create - Create a console printer thread * @con: Console to operate on * * Return: True if the kthread was started or already exists. * Otherwise false and @con must not be registered. * * This function is called when it is expected that nbcon consoles are * flushed using the kthread. The messages printed with NBCON_PRIO_NORMAL * will no longer be flushed by the legacy loop. This is why failure must * be fatal for console registration. * * If @con was already registered and this function fails, @con must be * unregistered before the global state variable @printk_kthreads_running * can be set. */ bool nbcon_kthread_create(struct console *con) { struct task_struct *kt; lockdep_assert_console_list_lock_held(); if (con->kthread) return true; kt = kthread_run(nbcon_kthread_func, con, "pr/%s%d", con->name, con->index); if (WARN_ON(IS_ERR(kt))) { con_printk(KERN_ERR, con, "failed to start printing thread\n"); return false; } con->kthread = kt; /* * It is important that console printing threads are scheduled * shortly after a printk call and with generous runtime budgets. */ sched_set_normal(con->kthread, -20); return true; } /* Track the nbcon emergency nesting per CPU. */ static DEFINE_PER_CPU(unsigned int, nbcon_pcpu_emergency_nesting); static unsigned int early_nbcon_pcpu_emergency_nesting __initdata; /** * nbcon_get_cpu_emergency_nesting - Get the per CPU emergency nesting pointer * * Context: For reading, any context. For writing, any context which could * not be migrated to another CPU. * Return: Either a pointer to the per CPU emergency nesting counter of * the current CPU or to the init data during early boot. * * The function is safe for reading per-CPU variables in any context because * preemption is disabled if the current CPU is in the emergency state. See * also nbcon_cpu_emergency_enter(). */ static __ref unsigned int *nbcon_get_cpu_emergency_nesting(void) { /* * The value of __printk_percpu_data_ready gets set in normal * context and before SMP initialization. As a result it could * never change while inside an nbcon emergency section. */ if (!printk_percpu_data_ready()) return &early_nbcon_pcpu_emergency_nesting; return raw_cpu_ptr(&nbcon_pcpu_emergency_nesting); } /** * nbcon_get_default_prio - The appropriate nbcon priority to use for nbcon * printing on the current CPU * * Context: Any context. * Return: The nbcon_prio to use for acquiring an nbcon console in this * context for printing. * * The function is safe for reading per-CPU data in any context because * preemption is disabled if the current CPU is in the emergency or panic * state. */ enum nbcon_prio nbcon_get_default_prio(void) { unsigned int *cpu_emergency_nesting; if (panic_on_this_cpu()) return NBCON_PRIO_PANIC; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); if (*cpu_emergency_nesting) return NBCON_PRIO_EMERGENCY; return NBCON_PRIO_NORMAL; } /* * Track if it is allowed to perform unsafe hostile takeovers of console * ownership. When true, console drivers might perform unsafe actions while * printing. It is externally available via nbcon_allow_unsafe_takeover().
*/ static bool panic_nbcon_allow_unsafe_takeover; /** * nbcon_allow_unsafe_takeover - Check if unsafe console takeovers are allowed * * Return: True, when it is permitted to perform unsafe console printing * * This is also used by console_is_usable() to determine if it is allowed to * call write_atomic() callbacks flagged as unsafe (CON_NBCON_ATOMIC_UNSAFE). */ bool nbcon_allow_unsafe_takeover(void) { return panic_on_this_cpu() && panic_nbcon_allow_unsafe_takeover; } /** * nbcon_legacy_emit_next_record - Print one record for an nbcon console * in legacy contexts * @con: The console to print on * @handover: Will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding * both the console_lock and the SRCU read lock. Otherwise it * is set to false. * @cookie: The cookie from the SRCU read lock. * @use_atomic: Set true when called in an atomic or unknown context. * It affects which nbcon callback will be used: write_atomic() * or write_thread(). * * When false, the write_thread() callback is used and would be * called in a preemptible context unless disabled by the * device_lock. The legacy handover is not allowed in this mode. * * Context: Any context except NMI. * Return: True, when a record has been printed and there are still * pending records. The caller might want to continue flushing. * * False, when there is no pending record, or when the console * context cannot be acquired, or the ownership has been lost. * The caller should give up. Either the job is done, cannot be * done, or will be handled by the owning context. * * This function is meant to be called by console_flush_all() to print records * on nbcon consoles from legacy context (printing via console unlocking). * Essentially it is the nbcon version of console_emit_next_record(). */ bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, int cookie, bool use_atomic) { struct nbcon_write_context wctxt = { }; struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); unsigned long flags; bool progress; ctxt->console = con; ctxt->prio = nbcon_get_default_prio(); if (use_atomic) { /* * In an atomic or unknown context, use the same procedure as * in console_emit_next_record(). It allows handover. */ printk_safe_enter_irqsave(flags); console_lock_spinning_enable(); stop_critical_timings(); } progress = nbcon_emit_one(&wctxt, use_atomic); if (use_atomic) { start_critical_timings(); *handover = console_lock_spinning_disable_and_check(cookie); printk_safe_exit_irqrestore(flags); } else { /* Non-atomic does not perform legacy spinning handovers. */ *handover = false; } return progress; } /** * __nbcon_atomic_flush_pending_con - Flush specified nbcon console using its * write_atomic() callback * @con: The nbcon console to flush * @stop_seq: Flush up until this record * * Return: 0 if @con was flushed up to @stop_seq. Otherwise, error code on * failure. * * Errors: * * -EPERM: Unable to acquire console ownership. * * -EAGAIN: Another context took over ownership while printing. * * -ENOENT: A record before @stop_seq is not available. * * If flushing up to @stop_seq was not successful, it only makes sense for the * caller to try again when -EAGAIN was returned. When -EPERM is returned, * this context is not allowed to acquire the console. When -ENOENT is * returned, it cannot be expected that the unfinalized record will become * available.
*/ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq) { struct nbcon_write_context wctxt = { }; struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); int err = 0; ctxt->console = con; ctxt->spinwait_max_us = 2000; ctxt->prio = nbcon_get_default_prio(); ctxt->allow_unsafe_takeover = nbcon_allow_unsafe_takeover(); while (nbcon_seq_read(con) < stop_seq) { if (!nbcon_context_try_acquire(ctxt, false)) return -EPERM; /* * nbcon_emit_next_record() returns false when the console was * handed over or taken over. In both cases the context is no * longer valid. */ if (!nbcon_emit_next_record(&wctxt, true)) return -EAGAIN; nbcon_context_release(ctxt); if (!ctxt->backlog) { /* Are there reserved but not yet finalized records? */ if (nbcon_seq_read(con) < stop_seq) err = -ENOENT; break; } } return err; } /** * nbcon_atomic_flush_pending_con - Flush specified nbcon console using its * write_atomic() callback * @con: The nbcon console to flush * @stop_seq: Flush up until this record * * This will stop flushing before @stop_seq if another context has ownership. * That context is then responsible for the flushing. Likewise, if new records * are added while this context was flushing and there is no other context * to handle the printing, this context must also flush those records. */ static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq) { struct console_flush_type ft; unsigned long flags; int err; again: /* * Atomic flushing does not use console driver synchronization (i.e. * it does not hold the port lock for uart consoles). Therefore IRQs * must be disabled to avoid being interrupted and then calling into * a driver that will deadlock trying to acquire console ownership. */ local_irq_save(flags); err = __nbcon_atomic_flush_pending_con(con, stop_seq); local_irq_restore(flags); /* * If there was a new owner (-EPERM, -EAGAIN), that context is * responsible for completing. * * Do not wait for records not yet finalized (-ENOENT) to avoid a * possible deadlock. They will either get flushed by the writer or * eventually skipped on panic CPU. */ if (err) return; /* * If flushing was successful but more records are available, this * context must flush those remaining records if the printer thread * is not available to do it. */ printk_get_console_flush_type(&ft); if (!ft.nbcon_offload && prb_read_valid(prb, nbcon_seq_read(con), NULL)) { stop_seq = prb_next_reserve_seq(prb); goto again; } } /** * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their * write_atomic() callback * @stop_seq: Flush up until this record */ static void __nbcon_atomic_flush_pending(u64 stop_seq) { struct console *con; int cookie; cookie = console_srcu_read_lock(); for_each_console_srcu(con) { short flags = console_srcu_read_flags(con); if (!(flags & CON_NBCON)) continue; if (!console_is_usable(con, flags, true)) continue; if (nbcon_seq_read(con) >= stop_seq) continue; nbcon_atomic_flush_pending_con(con, stop_seq); } console_srcu_read_unlock(cookie); } /** * nbcon_atomic_flush_pending - Flush all nbcon consoles using their * write_atomic() callback * * Flush the backlog up through the currently newest record. Any new * records added while flushing will not be flushed if there is another * context available to handle the flushing. This is to avoid one CPU * printing unbounded because other CPUs continue to add records.
*/ void nbcon_atomic_flush_pending(void) { __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb)); } /** * nbcon_atomic_flush_unsafe - Flush all nbcon consoles using their * write_atomic() callback and allowing unsafe hostile takeovers * * Flush the backlog up through the currently newest record. Unsafe hostile * takeovers will be performed, if necessary. */ void nbcon_atomic_flush_unsafe(void) { panic_nbcon_allow_unsafe_takeover = true; __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb)); panic_nbcon_allow_unsafe_takeover = false; } /** * nbcon_cpu_emergency_enter - Enter an emergency section where printk() * messages for that CPU are flushed directly * * Context: Any context. Disables preemption. * * When within an emergency section, printk() calls will attempt to flush any * pending messages in the ringbuffer. */ void nbcon_cpu_emergency_enter(void) { unsigned int *cpu_emergency_nesting; preempt_disable(); atomic_inc(&nbcon_cpu_emergency_cnt); cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); (*cpu_emergency_nesting)++; } /** * nbcon_cpu_emergency_exit - Exit an emergency section * * Context: Within an emergency section. Enables preemption. */ void nbcon_cpu_emergency_exit(void) { unsigned int *cpu_emergency_nesting; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0)) (*cpu_emergency_nesting)--; /* * Wake up kthreads because there might be some pending messages * added by other CPUs with normal priority since the last flush * in the emergency context. */ if (!WARN_ON_ONCE(atomic_read(&nbcon_cpu_emergency_cnt) == 0)) { if (atomic_dec_return(&nbcon_cpu_emergency_cnt) == 0) { struct console_flush_type ft; printk_get_console_flush_type(&ft); if (ft.nbcon_offload) nbcon_kthreads_wake(); } } preempt_enable(); } /** * nbcon_alloc - Allocate and init the nbcon console specific data * @con: Console to initialize * * Return: True if the console was fully allocated and initialized. * Otherwise @con must not be registered. * * When allocation and init was successful, the console must be properly * freed using nbcon_free() once it is no longer needed. */ bool nbcon_alloc(struct console *con) { struct nbcon_state state = { }; /* Synchronize the kthread start. */ lockdep_assert_console_list_lock_held(); /* The write_thread() callback is mandatory. */ if (WARN_ON(!con->write_thread)) return false; rcuwait_init(&con->rcuwait); init_irq_work(&con->irq_work, nbcon_irq_work); atomic_long_set(&ACCESS_PRIVATE(con, nbcon_prev_seq), -1UL); nbcon_state_set(con, &state); /* * Initialize @nbcon_seq to the highest possible sequence number so * that practically speaking it will have nothing to print until a * desired initial sequence number has been set via nbcon_seq_force(). */ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), ULSEQ_MAX(prb)); if (con->flags & CON_BOOT) { /* * Boot console printing is synchronized with legacy console * printing, so boot consoles can share the same global printk * buffers. */ con->pbufs = &printk_shared_pbufs; } else { con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL); if (!con->pbufs) { con_printk(KERN_ERR, con, "failed to allocate printing buffer\n"); return false; } if (printk_kthreads_ready && !have_boot_console) { if (!nbcon_kthread_create(con)) { kfree(con->pbufs); con->pbufs = NULL; return false; } /* Might be the first kthread. 
*/ printk_kthreads_running = true; } } return true; } /** * nbcon_free - Free and cleanup the nbcon console specific data * @con: Console to free/cleanup nbcon data * * Important: @have_nbcon_console must be updated before calling * this function. In particular, it can be set only when there * is still another nbcon console registered. */ void nbcon_free(struct console *con) { struct nbcon_state state = { }; /* Synchronize the kthread stop. */ lockdep_assert_console_list_lock_held(); if (printk_kthreads_running) { nbcon_kthread_stop(con); /* Might be the last nbcon console. * * Do not rely on printk_kthreads_check_locked(). It is not * called in some code paths, see nbcon_free() callers. */ if (!have_nbcon_console) printk_kthreads_running = false; } nbcon_state_set(con, &state); /* Boot consoles share global printk buffers. */ if (!(con->flags & CON_BOOT)) kfree(con->pbufs); con->pbufs = NULL; } /** * nbcon_device_try_acquire - Try to acquire nbcon console and enter unsafe * section * @con: The nbcon console to acquire * * Context: Under the locking mechanism implemented in * @con->device_lock() including disabling migration. * Return: True if the console was acquired. False otherwise. * * Console drivers will usually use their own internal synchronization * mechanism to synchronize between console printing and non-printing * activities (such as setting baud rates). However, nbcon console drivers * supporting atomic consoles may also want to mark unsafe sections when * performing non-printing activities in order to synchronize against their * write_atomic() callback. * * This function acquires the nbcon console using priority NBCON_PRIO_NORMAL * and marks it unsafe for handover/takeover. */ bool nbcon_device_try_acquire(struct console *con) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt); cant_migrate(); memset(ctxt, 0, sizeof(*ctxt)); ctxt->console = con; ctxt->prio = NBCON_PRIO_NORMAL; if (!nbcon_context_try_acquire(ctxt, false)) return false; if (!nbcon_context_enter_unsafe(ctxt)) return false; return true; } EXPORT_SYMBOL_GPL(nbcon_device_try_acquire); /** * nbcon_device_release - Exit unsafe section and release the nbcon console * @con: The nbcon console acquired in nbcon_device_try_acquire() */ void nbcon_device_release(struct console *con) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt); struct console_flush_type ft; int cookie; if (!nbcon_context_exit_unsafe(ctxt)) return; nbcon_context_release(ctxt); /* * This context must flush any new records added while the console * was locked if the printer thread is not available to do it. The * console_srcu_read_lock must be taken to ensure the console is * usable throughout flushing. */ cookie = console_srcu_read_lock(); printk_get_console_flush_type(&ft); if (console_is_usable(con, console_srcu_read_flags(con), true) && !ft.nbcon_offload && prb_read_valid(prb, nbcon_seq_read(con), NULL)) { /* * If nbcon_atomic flushing is not available, fallback to * using the legacy loop.
*/ if (ft.nbcon_atomic) { __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb)); } else if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } else if (ft.legacy_offload) { defer_console_output(); } } console_srcu_read_unlock(cookie); } EXPORT_SYMBOL_GPL(nbcon_device_release); /** * nbcon_kdb_try_acquire - Try to acquire nbcon console and enter unsafe * section * @con: The nbcon console to acquire * @wctxt: The nbcon write context to be used on success * * Context: Under console_srcu_read_lock() for emitting a single kdb message * using the given con->write_atomic() callback. Can be called * only when the console is usable at the moment. * * Return: True if the console was acquired. False otherwise. * * kdb emits messages on consoles registered for printk() without * storing them into the ring buffer. It has to acquire console * ownership so that it can call the con->write_atomic() callback in a safe way. * * This function acquires the nbcon console using priority NBCON_PRIO_EMERGENCY * and marks it unsafe for handover/takeover. */ bool nbcon_kdb_try_acquire(struct console *con, struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); memset(ctxt, 0, sizeof(*ctxt)); ctxt->console = con; ctxt->prio = NBCON_PRIO_EMERGENCY; if (!nbcon_context_try_acquire(ctxt, false)) return false; if (!nbcon_context_enter_unsafe(ctxt)) return false; return true; } /** * nbcon_kdb_release - Exit unsafe section and release the nbcon console * * @wctxt: The nbcon write context initialized by a successful * nbcon_kdb_try_acquire() */ void nbcon_kdb_release(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); if (!nbcon_context_exit_unsafe(ctxt)) return; nbcon_context_release(ctxt); /* * Flush any new printk() messages added when the console was blocked. * Only the console used by the given write context was blocked. * The console was locked only when the write_atomic() callback * was usable. */ __nbcon_atomic_flush_pending_con(ctxt->console, prb_next_reserve_seq(prb)); }
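/*
 * Editor's addition: a minimal, hypothetical sketch of how a console
 * driver's write_atomic() callback is expected to use the unsafe-region
 * API documented above (nbcon_enter_unsafe(), nbcon_exit_unsafe()). The
 * driver-side names (my_uart_write_atomic, my_uart_tx_char) are
 * illustrative assumptions only and are not part of this file. Wrapped
 * in #if 0 so it does not affect the build.
 */
#if 0
static void my_uart_write_atomic(struct console *con,
				 struct nbcon_write_context *wctxt)
{
	unsigned int i;

	/* About to touch hardware: mark the region unsafe for takeovers. */
	if (!nbcon_enter_unsafe(wctxt))
		return;		/* Ownership lost; back out immediately. */

	/* Emit the prepared record from the write context buffer. */
	for (i = 0; i < wctxt->len; i++)
		my_uart_tx_char(con, wctxt->outbuf[i]);

	/*
	 * Leaving the unsafe region may hand the console over to a
	 * higher priority waiter. The return value tells the caller
	 * whether this context still owns the console.
	 */
	nbcon_exit_unsafe(wctxt);
}
#endif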
// SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <net/switchdev.h> #include "br_private.h" #include "br_private_tunnel.h" static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid); static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct net_bridge_vlan *vle = ptr; u16 vid = *(u16 *)arg->key; return vle->vid != vid; } static const struct rhashtable_params br_vlan_rht_params = { .head_offset = offsetof(struct net_bridge_vlan, vnode), .key_offset = offsetof(struct net_bridge_vlan, vid), .key_len = sizeof(u16), .nelem_hint = 3, .max_size = VLAN_N_VID, .obj_cmpfn = br_vlan_cmp, .automatic_shrinking = true, }; static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid) { return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params); } static void __vlan_add_pvid(struct net_bridge_vlan_group *vg, const struct net_bridge_vlan *v) { if (vg->pvid == v->vid) return; smp_wmb(); br_vlan_set_pvid_state(vg, v->state); vg->pvid = v->vid; } static void __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid) { if (vg->pvid != vid) return; smp_wmb(); vg->pvid = 0; } /* Update the BRIDGE_VLAN_INFO_PVID and BRIDGE_VLAN_INFO_UNTAGGED flags of @v. * If @commit is false, return just whether the BRIDGE_VLAN_INFO_PVID and * BRIDGE_VLAN_INFO_UNTAGGED bits of @flags would produce any change onto @v. */ static bool __vlan_flags_update(struct net_bridge_vlan *v, u16 flags, bool commit) { struct net_bridge_vlan_group *vg; bool change; if (br_vlan_is_master(v)) vg = br_vlan_group(v->br); else vg = nbp_vlan_group(v->port); /* check if anything would be changed on commit */ change = !!(flags & BRIDGE_VLAN_INFO_PVID) == !!(vg->pvid != v->vid) || ((flags ^ v->flags) & BRIDGE_VLAN_INFO_UNTAGGED); if (!commit) goto out; if (flags & BRIDGE_VLAN_INFO_PVID) __vlan_add_pvid(vg, v); else __vlan_delete_pvid(vg, v->vid); if (flags & BRIDGE_VLAN_INFO_UNTAGGED) v->flags |= BRIDGE_VLAN_INFO_UNTAGGED; else v->flags &= ~BRIDGE_VLAN_INFO_UNTAGGED; out: return change; } static bool __vlan_flags_would_change(struct net_bridge_vlan *v, u16 flags) { return __vlan_flags_update(v, flags, false); } static void __vlan_flags_commit(struct net_bridge_vlan *v, u16 flags) { __vlan_flags_update(v, flags, true); } static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, struct net_bridge_vlan *v, u16 flags, struct netlink_ext_ack *extack) { int err; /* Try switchdev op first. In case it is not supported, fallback to * 8021q add.
*/ err = br_switchdev_port_vlan_add(dev, v->vid, flags, false, extack); if (err == -EOPNOTSUPP) return vlan_vid_add(dev, br->vlan_proto, v->vid); v->priv_flags |= BR_VLFLAG_ADDED_BY_SWITCHDEV; return err; } static void __vlan_add_list(struct net_bridge_vlan *v) { struct net_bridge_vlan_group *vg; struct list_head *headp, *hpos; struct net_bridge_vlan *vent; if (br_vlan_is_master(v)) vg = br_vlan_group(v->br); else vg = nbp_vlan_group(v->port); headp = &vg->vlan_list; list_for_each_prev(hpos, headp) { vent = list_entry(hpos, struct net_bridge_vlan, vlist); if (v->vid >= vent->vid) break; } list_add_rcu(&v->vlist, hpos); } static void __vlan_del_list(struct net_bridge_vlan *v) { list_del_rcu(&v->vlist); } static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, const struct net_bridge_vlan *v) { int err; /* Try switchdev op first. In case it is not supported, fallback to * 8021q del. */ err = br_switchdev_port_vlan_del(dev, v->vid); if (!(v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)) vlan_vid_del(dev, br->vlan_proto, v->vid); return err == -EOPNOTSUPP ? 0 : err; } /* Returns a master vlan, if it didn't exist it gets created. In all cases * a reference is taken to the master vlan before returning. */ static struct net_bridge_vlan * br_vlan_get_master(struct net_bridge *br, u16 vid, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *masterv; vg = br_vlan_group(br); masterv = br_vlan_find(vg, vid); if (!masterv) { bool changed; /* missing global ctx, create it now */ if (br_vlan_add(br, vid, 0, &changed, extack)) return NULL; masterv = br_vlan_find(vg, vid); if (WARN_ON(!masterv)) return NULL; refcount_set(&masterv->refcnt, 1); return masterv; } refcount_inc(&masterv->refcnt); return masterv; } static void br_master_vlan_rcu_free(struct rcu_head *rcu) { struct net_bridge_vlan *v; v = container_of(rcu, struct net_bridge_vlan, rcu); WARN_ON(!br_vlan_is_master(v)); free_percpu(v->stats); v->stats = NULL; kfree(v); } static void br_vlan_put_master(struct net_bridge_vlan *masterv) { struct net_bridge_vlan_group *vg; if (!br_vlan_is_master(masterv)) return; vg = br_vlan_group(masterv->br); if (refcount_dec_and_test(&masterv->refcnt)) { rhashtable_remove_fast(&vg->vlan_hash, &masterv->vnode, br_vlan_rht_params); __vlan_del_list(masterv); br_multicast_toggle_one_vlan(masterv, false); br_multicast_ctx_deinit(&masterv->br_mcast_ctx); call_rcu(&masterv->rcu, br_master_vlan_rcu_free); } } static void nbp_vlan_rcu_free(struct rcu_head *rcu) { struct net_bridge_vlan *v; v = container_of(rcu, struct net_bridge_vlan, rcu); WARN_ON(br_vlan_is_master(v)); /* if we had per-port stats configured then free them here */ if (v->priv_flags & BR_VLFLAG_PER_PORT_STATS) free_percpu(v->stats); v->stats = NULL; kfree(v); } static void br_vlan_init_state(struct net_bridge_vlan *v) { struct net_bridge *br; if (br_vlan_is_master(v)) br = v->br; else br = v->port->br; if (br_opt_get(br, BROPT_MST_ENABLED)) { br_mst_vlan_init_state(v); return; } v->state = BR_STATE_FORWARDING; v->msti = 0; } /* This is the shared VLAN add function which works for both ports and bridge * devices. There are four possible calls to this function in terms of the * vlan entry type: * 1. vlan is being added on a port (no master flags, global entry exists) * 2. vlan is being added on a bridge (both master and brentry flags) * 3. 
vlan is being added on a port, but a global entry didn't exist which * is being created right now (master flag set, brentry flag unset), the * global entry is used for global per-vlan features, but not for filtering * 4. same as 3 but with both master and brentry flags set so the entry * will be used for filtering in both the port and the bridge */ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, struct netlink_ext_ack *extack) { struct net_bridge_vlan *masterv = NULL; struct net_bridge_port *p = NULL; struct net_bridge_vlan_group *vg; struct net_device *dev; struct net_bridge *br; int err; if (br_vlan_is_master(v)) { br = v->br; dev = br->dev; vg = br_vlan_group(br); } else { p = v->port; br = p->br; dev = p->dev; vg = nbp_vlan_group(p); } if (p) { /* Add VLAN to the device filter if it is supported. * This ensures tagged traffic enters the bridge when * promiscuous mode is disabled by br_manage_promisc(). */ err = __vlan_vid_add(dev, br, v, flags, extack); if (err) goto out; /* need to work on the master vlan too */ if (flags & BRIDGE_VLAN_INFO_MASTER) { bool changed; err = br_vlan_add(br, v->vid, flags | BRIDGE_VLAN_INFO_BRENTRY, &changed, extack); if (err) goto out_filt; if (changed) br_vlan_notify(br, NULL, v->vid, 0, RTM_NEWVLAN); } masterv = br_vlan_get_master(br, v->vid, extack); if (!masterv) { err = -ENOMEM; goto out_filt; } v->brvlan = masterv; if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) { v->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!v->stats) { err = -ENOMEM; goto out_filt; } v->priv_flags |= BR_VLFLAG_PER_PORT_STATS; } else { v->stats = masterv->stats; } br_multicast_port_ctx_init(p, v, &v->port_mcast_ctx); } else { if (br_vlan_should_use(v)) { err = br_switchdev_port_vlan_add(dev, v->vid, flags, false, extack); if (err && err != -EOPNOTSUPP) goto out; } br_multicast_ctx_init(br, v, &v->br_mcast_ctx); v->priv_flags |= BR_VLFLAG_GLOBAL_MCAST_ENABLED; } /* Add the dev mac and count the vlan only if it's usable */ if (br_vlan_should_use(v)) { if (!br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0)) { err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); if (err) { br_err(br, "failed insert local address into bridge forwarding table\n"); goto out_filt; } } vg->num_vlans++; } /* set the state before publishing */ br_vlan_init_state(v); err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); if (err) goto out_fdb_insert; __vlan_add_list(v); __vlan_flags_commit(v, flags); br_multicast_toggle_one_vlan(v, true); if (p) nbp_vlan_set_vlan_dev_state(p, v->vid); out: return err; out_fdb_insert: if (br_vlan_should_use(v)) { br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid); vg->num_vlans--; } out_filt: if (p) { __vlan_vid_del(dev, br, v); if (masterv) { if (v->stats && masterv->stats != v->stats) free_percpu(v->stats); v->stats = NULL; br_vlan_put_master(masterv); v->brvlan = NULL; } } else { br_switchdev_port_vlan_del(dev, v->vid); } goto out; } static int __vlan_del(struct net_bridge_vlan *v) { struct net_bridge_vlan *masterv = v; struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; int err = 0; if (br_vlan_is_master(v)) { vg = br_vlan_group(v->br); } else { p = v->port; vg = nbp_vlan_group(v->port); masterv = v->brvlan; } __vlan_delete_pvid(vg, v->vid); if (p) { err = __vlan_vid_del(p->dev, p->br, v); if (err) goto out; } else { err = br_switchdev_port_vlan_del(v->br->dev, v->vid); if (err && err != -EOPNOTSUPP) goto out; err = 0; } if (br_vlan_should_use(v)) { v->flags &= ~BRIDGE_VLAN_INFO_BRENTRY; vg->num_vlans--; } 
if (masterv != v) { vlan_tunnel_info_del(vg, v); rhashtable_remove_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); __vlan_del_list(v); nbp_vlan_set_vlan_dev_state(p, v->vid); br_multicast_toggle_one_vlan(v, false); br_multicast_port_ctx_deinit(&v->port_mcast_ctx); call_rcu(&v->rcu, nbp_vlan_rcu_free); } br_vlan_put_master(masterv); out: return err; } static void __vlan_group_free(struct net_bridge_vlan_group *vg) { WARN_ON(!list_empty(&vg->vlan_list)); rhashtable_destroy(&vg->vlan_hash); vlan_tunnel_deinit(vg); kfree(vg); } static void __vlan_flush(const struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *vlan, *tmp; u16 v_start = 0, v_end = 0; int err; __vlan_delete_pvid(vg, vg->pvid); list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) { /* take care of disjoint ranges */ if (!v_start) { v_start = vlan->vid; } else if (vlan->vid - v_end != 1) { /* found range end, notify and start next one */ br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); v_start = vlan->vid; } v_end = vlan->vid; err = __vlan_del(vlan); if (err) { br_err(br, "port %u(%s) failed to delete vlan %d: %pe\n", (unsigned int) p->port_no, p->dev->name, vlan->vid, ERR_PTR(err)); } } /* notify about the last/whole vlan range */ if (v_start) br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); } struct sk_buff *br_handle_vlan(struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_vlan_group *vg, struct sk_buff *skb) { struct pcpu_sw_netstats *stats; struct net_bridge_vlan *v; u16 vid; /* If this packet was not filtered at input, let it pass */ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) goto out; /* At this point, we know that the frame was filtered and contains * a valid vlan id. If the vlan id has untagged flag set, * send untagged; otherwise, send tagged. */ br_vlan_get_tag(skb, &vid); v = br_vlan_find(vg, vid); /* Vlan entry must be configured at this point. The * only exception is the bridge is set in promisc mode and the * packet is destined for the bridge device. In this case * pass the packet as is. */ if (!v || !br_vlan_should_use(v)) { if ((br->dev->flags & IFF_PROMISC) && skb->dev == br->dev) { goto out; } else { kfree_skb(skb); return NULL; } } if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { stats = this_cpu_ptr(v->stats); u64_stats_update_begin(&stats->syncp); u64_stats_add(&stats->tx_bytes, skb->len); u64_stats_inc(&stats->tx_packets); u64_stats_update_end(&stats->syncp); } /* If the skb will be sent using forwarding offload, the assumption is * that the switchdev will inject the packet into hardware together * with the bridge VLAN, so that it can be forwarded according to that * VLAN. The switchdev should deal with popping the VLAN header in * hardware on each egress port as appropriate. So only strip the VLAN * header if forwarding offload is not being used. 
*/ if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED && !br_switchdev_frame_uses_tx_fwd_offload(skb)) __vlan_hwaccel_clear_tag(skb); if (p && (p->flags & BR_VLAN_TUNNEL) && br_handle_egress_vlan_tunnel(skb, v)) { kfree_skb(skb); return NULL; } out: return skb; } /* Called under RCU */ static bool __allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid, u8 *state, struct net_bridge_vlan **vlan) { struct pcpu_sw_netstats *stats; struct net_bridge_vlan *v; bool tagged; BR_INPUT_SKB_CB(skb)->vlan_filtered = true; /* If vlan tx offload is disabled on bridge device and frame was * sent from vlan device on the bridge device, it does not have * HW accelerated vlan tag. */ if (unlikely(!skb_vlan_tag_present(skb) && skb->protocol == br->vlan_proto)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) return false; } if (!br_vlan_get_tag(skb, vid)) { /* Tagged frame */ if (skb->vlan_proto != br->vlan_proto) { /* Protocol-mismatch, empty out vlan_tci for new tag */ skb_push(skb, ETH_HLEN); skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, skb_vlan_tag_get(skb)); if (unlikely(!skb)) return false; skb_pull(skb, ETH_HLEN); skb_reset_mac_len(skb); *vid = 0; tagged = false; } else { tagged = true; } } else { /* Untagged frame */ tagged = false; } if (!*vid) { u16 pvid = br_get_pvid(vg); /* Frame had a tag with VID 0 or did not have a tag. * See if pvid is set on this port. That tells us which * vlan untagged or priority-tagged traffic belongs to. */ if (!pvid) goto drop; /* PVID is set on this port. Any untagged or priority-tagged * ingress frame is considered to belong to this vlan. */ *vid = pvid; if (likely(!tagged)) /* Untagged Frame. */ __vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid); else /* Priority-tagged Frame. * At this point, we know that skb->vlan_tci VID * field was 0. * We update only VID field and preserve PCP field. */ skb->vlan_tci |= pvid; /* if snooping and stats are disabled we can avoid the lookup */ if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { if (*state == BR_STATE_FORWARDING) { *state = br_vlan_get_pvid_state(vg); if (!br_vlan_state_allowed(*state, true)) goto drop; } return true; } } v = br_vlan_find(vg, *vid); if (!v || !br_vlan_should_use(v)) goto drop; if (*state == BR_STATE_FORWARDING) { *state = br_vlan_get_state(v); if (!br_vlan_state_allowed(*state, true)) goto drop; } if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { stats = this_cpu_ptr(v->stats); u64_stats_update_begin(&stats->syncp); u64_stats_add(&stats->rx_bytes, skb->len); u64_stats_inc(&stats->rx_packets); u64_stats_update_end(&stats->syncp); } *vlan = v; return true; drop: kfree_skb(skb); return false; } bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid, u8 *state, struct net_bridge_vlan **vlan) { /* If VLAN filtering is disabled on the bridge, all packets are * permitted. */ *vlan = NULL; if (!br_opt_get(br, BROPT_VLAN_ENABLED)) { BR_INPUT_SKB_CB(skb)->vlan_filtered = false; return true; } return __allowed_ingress(br, vg, skb, vid, state, vlan); } /* Called under RCU. 
*/ bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb) { const struct net_bridge_vlan *v; u16 vid; /* If this packet was not filtered at input, let it pass */ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) return true; br_vlan_get_tag(skb, &vid); v = br_vlan_find(vg, vid); if (v && br_vlan_should_use(v) && br_vlan_state_allowed(br_vlan_get_state(v), false)) return true; return false; } /* Called under RCU */ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) { struct net_bridge_vlan_group *vg; struct net_bridge *br = p->br; struct net_bridge_vlan *v; /* If filtering was disabled at input, let it pass. */ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) return true; vg = nbp_vlan_group_rcu(p); if (!vg || !vg->num_vlans) return false; if (!br_vlan_get_tag(skb, vid) && skb->vlan_proto != br->vlan_proto) *vid = 0; if (!*vid) { *vid = br_get_pvid(vg); if (!*vid || !br_vlan_state_allowed(br_vlan_get_pvid_state(vg), true)) return false; return true; } v = br_vlan_find(vg, *vid); if (v && br_vlan_state_allowed(br_vlan_get_state(v), true)) return true; return false; } static int br_vlan_add_existing(struct net_bridge *br, struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan, u16 flags, bool *changed, struct netlink_ext_ack *extack) { bool becomes_brentry = false; bool would_change = false; int err; if (!br_vlan_is_brentry(vlan)) { /* Trying to change flags of non-existent bridge vlan */ if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) return -EINVAL; becomes_brentry = true; } else { would_change = __vlan_flags_would_change(vlan, flags); } /* Master VLANs that aren't brentries weren't notified before, * time to notify them now. */ if (becomes_brentry || would_change) { err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags, would_change, extack); if (err && err != -EOPNOTSUPP) return err; } if (becomes_brentry) { /* It was only kept for port vlans, now make it real */ err = br_fdb_add_local(br, NULL, br->dev->dev_addr, vlan->vid); if (err) { br_err(br, "failed to insert local address into bridge forwarding table\n"); goto err_fdb_insert; } refcount_inc(&vlan->refcnt); vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY; vg->num_vlans++; *changed = true; br_multicast_toggle_one_vlan(vlan, true); } __vlan_flags_commit(vlan, flags); if (would_change) *changed = true; return 0; err_fdb_insert: br_switchdev_port_vlan_del(br->dev, vlan->vid); return err; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. * changed must be true only if the vlan was created or updated */ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; int ret; ASSERT_RTNL(); *changed = false; vg = br_vlan_group(br); vlan = br_vlan_find(vg, vid); if (vlan) return br_vlan_add_existing(br, vg, vlan, flags, changed, extack); vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); if (!vlan) return -ENOMEM; vlan->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!vlan->stats) { kfree(vlan); return -ENOMEM; } vlan->vid = vid; vlan->flags = flags | BRIDGE_VLAN_INFO_MASTER; vlan->flags &= ~BRIDGE_VLAN_INFO_PVID; vlan->br = br; if (flags & BRIDGE_VLAN_INFO_BRENTRY) refcount_set(&vlan->refcnt, 1); ret = __vlan_add(vlan, flags, extack); if (ret) { free_percpu(vlan->stats); kfree(vlan); } else { *changed = true; } return ret; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. 
*/ int br_vlan_delete(struct net_bridge *br, u16 vid) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; ASSERT_RTNL(); vg = br_vlan_group(br); v = br_vlan_find(vg, vid); if (!v || !br_vlan_is_brentry(v)) return -ENOENT; br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid); br_fdb_delete_by_port(br, NULL, vid, 0); vlan_tunnel_info_del(vg, v); return __vlan_del(v); } void br_vlan_flush(struct net_bridge *br) { struct net_bridge_vlan_group *vg; ASSERT_RTNL(); vg = br_vlan_group(br); __vlan_flush(br, NULL, vg); RCU_INIT_POINTER(br->vlgrp, NULL); synchronize_net(); __vlan_group_free(vg); } struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid) { if (!vg) return NULL; return br_vlan_lookup(&vg->vlan_hash, vid); } /* Must be protected by RTNL. */ static void recalculate_group_addr(struct net_bridge *br) { if (br_opt_get(br, BROPT_GROUP_ADDR_SET)) return; spin_lock_bh(&br->lock); if (!br_opt_get(br, BROPT_VLAN_ENABLED) || br->vlan_proto == htons(ETH_P_8021Q)) { /* Bridge Group Address */ br->group_addr[5] = 0x00; } else { /* vlan_enabled && ETH_P_8021AD */ /* Provider Bridge Group Address */ br->group_addr[5] = 0x08; } spin_unlock_bh(&br->lock); } /* Must be protected by RTNL. */ void br_recalculate_fwd_mask(struct net_bridge *br) { if (!br_opt_get(br, BROPT_VLAN_ENABLED) || br->vlan_proto == htons(ETH_P_8021Q)) br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT; else /* vlan_enabled && ETH_P_8021AD */ br->group_fwd_mask_required = BR_GROUPFWD_8021AD & ~(1u << br->group_addr[5]); } int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = br->dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, .u.vlan_filtering = val, }; int err; if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val) return 0; br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val); err = switchdev_port_attr_set(br->dev, &attr, extack); if (err && err != -EOPNOTSUPP) { br_opt_toggle(br, BROPT_VLAN_ENABLED, !val); return err; } br_manage_promisc(br); recalculate_group_addr(br); br_recalculate_fwd_mask(br); if (!val && br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { br_info(br, "vlan filtering disabled, automatically disabling multicast vlan snooping\n"); br_multicast_toggle_vlan_snooping(br, false, NULL); } return 0; } bool br_vlan_enabled(const struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); return br_opt_get(br, BROPT_VLAN_ENABLED); } EXPORT_SYMBOL_GPL(br_vlan_enabled); int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto) { struct net_bridge *br = netdev_priv(dev); *p_proto = ntohs(br->vlan_proto); return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_proto); int __br_vlan_set_proto(struct net_bridge *br, __be16 proto, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = br->dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, .u.vlan_protocol = ntohs(proto), }; int err = 0; struct net_bridge_port *p; struct net_bridge_vlan *vlan; struct net_bridge_vlan_group *vg; __be16 oldproto = br->vlan_proto; if (br->vlan_proto == proto) return 0; err = switchdev_port_attr_set(br->dev, &attr, extack); if (err && err != -EOPNOTSUPP) return err; /* Add VLANs for the new proto to the device filter. 
*/ list_for_each_entry(p, &br->port_list, list) { vg = nbp_vlan_group(p); list_for_each_entry(vlan, &vg->vlan_list, vlist) { if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) continue; err = vlan_vid_add(p->dev, proto, vlan->vid); if (err) goto err_filt; } } br->vlan_proto = proto; recalculate_group_addr(br); br_recalculate_fwd_mask(br); /* Delete VLANs for the old proto from the device filter. */ list_for_each_entry(p, &br->port_list, list) { vg = nbp_vlan_group(p); list_for_each_entry(vlan, &vg->vlan_list, vlist) { if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) continue; vlan_vid_del(p->dev, oldproto, vlan->vid); } } return 0; err_filt: attr.u.vlan_protocol = ntohs(oldproto); switchdev_port_attr_set(br->dev, &attr, NULL); list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) { if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) continue; vlan_vid_del(p->dev, proto, vlan->vid); } list_for_each_entry_continue_reverse(p, &br->port_list, list) { vg = nbp_vlan_group(p); list_for_each_entry(vlan, &vg->vlan_list, vlist) { if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) continue; vlan_vid_del(p->dev, proto, vlan->vid); } } return err; } int br_vlan_set_proto(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { if (!eth_type_vlan(htons(val))) return -EPROTONOSUPPORT; return __br_vlan_set_proto(br, htons(val), extack); } int br_vlan_set_stats(struct net_bridge *br, unsigned long val) { switch (val) { case 0: case 1: br_opt_toggle(br, BROPT_VLAN_STATS_ENABLED, !!val); break; default: return -EINVAL; } return 0; } int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val) { struct net_bridge_port *p; /* allow to change the option if there are no port vlans configured */ list_for_each_entry(p, &br->port_list, list) { struct net_bridge_vlan_group *vg = nbp_vlan_group(p); if (vg->num_vlans) return -EBUSY; } switch (val) { case 0: case 1: br_opt_toggle(br, BROPT_VLAN_STATS_PER_PORT, !!val); break; default: return -EINVAL; } return 0; } static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid) { struct net_bridge_vlan *v; if (vid != vg->pvid) return false; v = br_vlan_lookup(&vg->vlan_hash, vid); if (v && br_vlan_should_use(v) && (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) return true; return false; } static void br_vlan_disable_default_pvid(struct net_bridge *br) { struct net_bridge_port *p; u16 pvid = br->default_pvid; /* Disable default_pvid on all ports where it is still * configured. */ if (vlan_default_pvid(br_vlan_group(br), pvid)) { if (!br_vlan_delete(br, pvid)) br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); } list_for_each_entry(p, &br->port_list, list) { if (vlan_default_pvid(nbp_vlan_group(p), pvid) && !nbp_vlan_delete(p, pvid)) br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); } br->default_pvid = 0; } int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, struct netlink_ext_ack *extack) { const struct net_bridge_vlan *pvent; struct net_bridge_vlan_group *vg; struct net_bridge_port *p; unsigned long *changed; bool vlchange; u16 old_pvid; int err = 0; if (!pvid) { br_vlan_disable_default_pvid(br); return 0; } changed = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL); if (!changed) return -ENOMEM; old_pvid = br->default_pvid; /* Update default_pvid config only if we do not conflict with * user configuration. 
*/ vg = br_vlan_group(br); pvent = br_vlan_find(vg, pvid); if ((!old_pvid || vlan_default_pvid(vg, old_pvid)) && (!pvent || !br_vlan_should_use(pvent))) { err = br_vlan_add(br, pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, &vlchange, extack); if (err) goto out; if (br_vlan_delete(br, old_pvid)) br_vlan_notify(br, NULL, old_pvid, 0, RTM_DELVLAN); br_vlan_notify(br, NULL, pvid, 0, RTM_NEWVLAN); __set_bit(0, changed); } list_for_each_entry(p, &br->port_list, list) { /* Update default_pvid config only if we do not conflict with * user configuration. */ vg = nbp_vlan_group(p); if ((old_pvid && !vlan_default_pvid(vg, old_pvid)) || br_vlan_find(vg, pvid)) continue; err = nbp_vlan_add(p, pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, &vlchange, extack); if (err) goto err_port; if (nbp_vlan_delete(p, old_pvid)) br_vlan_notify(br, p, old_pvid, 0, RTM_DELVLAN); br_vlan_notify(p->br, p, pvid, 0, RTM_NEWVLAN); __set_bit(p->port_no, changed); } br->default_pvid = pvid; out: bitmap_free(changed); return err; err_port: list_for_each_entry_continue_reverse(p, &br->port_list, list) { if (!test_bit(p->port_no, changed)) continue; if (old_pvid) { nbp_vlan_add(p, old_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, &vlchange, NULL); br_vlan_notify(p->br, p, old_pvid, 0, RTM_NEWVLAN); } nbp_vlan_delete(p, pvid); br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); } if (test_bit(0, changed)) { if (old_pvid) { br_vlan_add(br, old_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, &vlchange, NULL); br_vlan_notify(br, NULL, old_pvid, 0, RTM_NEWVLAN); } br_vlan_delete(br, pvid); br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); } goto out; } int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { u16 pvid = val; int err = 0; if (val >= VLAN_VID_MASK) return -EINVAL; if (pvid == br->default_pvid) goto out; /* Only allow default pvid change when filtering is disabled */ if (br_opt_get(br, BROPT_VLAN_ENABLED)) { pr_info_once("Please disable vlan filtering to change default_pvid\n"); err = -EPERM; goto out; } err = __br_vlan_set_default_pvid(br, pvid, extack); out: return err; } int br_vlan_init(struct net_bridge *br) { struct net_bridge_vlan_group *vg; int ret = -ENOMEM; vg = kzalloc(sizeof(*vg), GFP_KERNEL); if (!vg) goto out; ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); if (ret) goto err_rhtbl; ret = vlan_tunnel_init(vg); if (ret) goto err_tunnel_init; INIT_LIST_HEAD(&vg->vlan_list); br->vlan_proto = htons(ETH_P_8021Q); br->default_pvid = 1; rcu_assign_pointer(br->vlgrp, vg); out: return ret; err_tunnel_init: rhashtable_destroy(&vg->vlan_hash); err_rhtbl: kfree(vg); goto out; } int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = p->br->dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, .u.vlan_filtering = br_opt_get(p->br, BROPT_VLAN_ENABLED), }; struct net_bridge_vlan_group *vg; int ret = -ENOMEM; vg = kzalloc(sizeof(struct net_bridge_vlan_group), GFP_KERNEL); if (!vg) goto out; ret = switchdev_port_attr_set(p->dev, &attr, extack); if (ret && ret != -EOPNOTSUPP) goto err_vlan_enabled; ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); if (ret) goto err_rhtbl; ret = vlan_tunnel_init(vg); if (ret) goto err_tunnel_init; INIT_LIST_HEAD(&vg->vlan_list); rcu_assign_pointer(p->vlgrp, vg); if (p->br->default_pvid) { bool changed; ret = nbp_vlan_add(p, 
p->br->default_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, &changed, extack); if (ret) goto err_vlan_add; br_vlan_notify(p->br, p, p->br->default_pvid, 0, RTM_NEWVLAN); } out: return ret; err_vlan_add: RCU_INIT_POINTER(p->vlgrp, NULL); synchronize_rcu(); vlan_tunnel_deinit(vg); err_tunnel_init: rhashtable_destroy(&vg->vlan_hash); err_rhtbl: err_vlan_enabled: kfree(vg); goto out; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. * changed must be true only if the vlan was created or updated */ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, bool *changed, struct netlink_ext_ack *extack) { struct net_bridge_vlan *vlan; int ret; ASSERT_RTNL(); *changed = false; vlan = br_vlan_find(nbp_vlan_group(port), vid); if (vlan) { bool would_change = __vlan_flags_would_change(vlan, flags); if (would_change) { /* Pass the flags to the hardware bridge */ ret = br_switchdev_port_vlan_add(port->dev, vid, flags, true, extack); if (ret && ret != -EOPNOTSUPP) return ret; } __vlan_flags_commit(vlan, flags); *changed = would_change; return 0; } vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); if (!vlan) return -ENOMEM; vlan->vid = vid; vlan->port = port; ret = __vlan_add(vlan, flags, extack); if (ret) kfree(vlan); else *changed = true; return ret; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. */ int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) { struct net_bridge_vlan *v; ASSERT_RTNL(); v = br_vlan_find(nbp_vlan_group(port), vid); if (!v) return -ENOENT; br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid); br_fdb_delete_by_port(port->br, port, vid, 0); return __vlan_del(v); } void nbp_vlan_flush(struct net_bridge_port *port) { struct net_bridge_vlan_group *vg; ASSERT_RTNL(); vg = nbp_vlan_group(port); __vlan_flush(port->br, port, vg); RCU_INIT_POINTER(port->vlgrp, NULL); synchronize_net(); __vlan_group_free(vg); } void br_vlan_get_stats(const struct net_bridge_vlan *v, struct pcpu_sw_netstats *stats) { int i; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(i) { u64 rxpackets, rxbytes, txpackets, txbytes; struct pcpu_sw_netstats *cpu_stats; unsigned int start; cpu_stats = per_cpu_ptr(v->stats, i); do { start = u64_stats_fetch_begin(&cpu_stats->syncp); rxpackets = u64_stats_read(&cpu_stats->rx_packets); rxbytes = u64_stats_read(&cpu_stats->rx_bytes); txbytes = u64_stats_read(&cpu_stats->tx_bytes); txpackets = u64_stats_read(&cpu_stats->tx_packets); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->rx_packets, rxpackets); u64_stats_add(&stats->rx_bytes, rxbytes); u64_stats_add(&stats->tx_bytes, txbytes); u64_stats_add(&stats->tx_packets, txpackets); } } int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) { struct net_bridge_vlan_group *vg; struct net_bridge_port *p; ASSERT_RTNL(); p = br_port_get_check_rtnl(dev); if (p) vg = nbp_vlan_group(p); else if (netif_is_bridge_master(dev)) vg = br_vlan_group(netdev_priv(dev)); else return -EINVAL; *p_pvid = br_get_pvid(vg); return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_pvid); int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid) { struct net_bridge_vlan_group *vg; struct net_bridge_port *p; p = br_port_get_check_rcu(dev); if (p) vg = nbp_vlan_group_rcu(p); else if (netif_is_bridge_master(dev)) vg = br_vlan_group_rcu(netdev_priv(dev)); else return -EINVAL; *p_pvid = br_get_pvid(vg); return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu); void br_vlan_fill_forward_path_pvid(struct 
net_bridge *br, struct net_device_path_ctx *ctx, struct net_device_path *path) { struct net_bridge_vlan_group *vg; int idx = ctx->num_vlans - 1; u16 vid; path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP; if (!br_opt_get(br, BROPT_VLAN_ENABLED)) return; vg = br_vlan_group_rcu(br); if (idx >= 0 && ctx->vlan[idx].proto == br->vlan_proto) { vid = ctx->vlan[idx].id; } else { path->bridge.vlan_mode = DEV_PATH_BR_VLAN_TAG; vid = br_get_pvid(vg); } path->bridge.vlan_id = vid; path->bridge.vlan_proto = br->vlan_proto; } int br_vlan_fill_forward_path_mode(struct net_bridge *br, struct net_bridge_port *dst, struct net_device_path *path) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; if (!br_opt_get(br, BROPT_VLAN_ENABLED)) return 0; vg = nbp_vlan_group_rcu(dst); v = br_vlan_find(vg, path->bridge.vlan_id); if (!v || !br_vlan_should_use(v)) return -EINVAL; if (!(v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) return 0; if (path->bridge.vlan_mode == DEV_PATH_BR_VLAN_TAG) path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP; else if (v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG_HW; else path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG; return 0; } int br_vlan_get_info(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct net_bridge_port *p; ASSERT_RTNL(); p = br_port_get_check_rtnl(dev); if (p) vg = nbp_vlan_group(p); else if (netif_is_bridge_master(dev)) vg = br_vlan_group(netdev_priv(dev)); else return -EINVAL; v = br_vlan_find(vg, vid); if (!v) return -ENOENT; p_vinfo->vid = vid; p_vinfo->flags = v->flags; if (vid == br_get_pvid(vg)) p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID; return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_info); int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct net_bridge_port *p; p = br_port_get_check_rcu(dev); if (p) vg = nbp_vlan_group_rcu(p); else if (netif_is_bridge_master(dev)) vg = br_vlan_group_rcu(netdev_priv(dev)); else return -EINVAL; v = br_vlan_find(vg, vid); if (!v) return -ENOENT; p_vinfo->vid = vid; p_vinfo->flags = v->flags; if (vid == br_get_pvid(vg)) p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID; return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_info_rcu); static int br_vlan_is_bind_vlan_dev(const struct net_device *dev) { return is_vlan_dev(dev) && !!(vlan_dev_priv(dev)->flags & VLAN_FLAG_BRIDGE_BINDING); } static int br_vlan_is_bind_vlan_dev_fn(struct net_device *dev, __always_unused struct netdev_nested_priv *priv) { return br_vlan_is_bind_vlan_dev(dev); } static bool br_vlan_has_upper_bind_vlan_dev(struct net_device *dev) { int found; rcu_read_lock(); found = netdev_walk_all_upper_dev_rcu(dev, br_vlan_is_bind_vlan_dev_fn, NULL); rcu_read_unlock(); return !!found; } struct br_vlan_bind_walk_data { u16 vid; struct net_device *result; }; static int br_vlan_match_bind_vlan_dev_fn(struct net_device *dev, struct netdev_nested_priv *priv) { struct br_vlan_bind_walk_data *data = priv->data; int found = 0; if (br_vlan_is_bind_vlan_dev(dev) && vlan_dev_priv(dev)->vlan_id == data->vid) { data->result = dev; found = 1; } return found; } static struct net_device * br_vlan_get_upper_bind_vlan_dev(struct net_device *dev, u16 vid) { struct br_vlan_bind_walk_data data = { .vid = vid, }; struct netdev_nested_priv priv = { .data = (void *)&data, }; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(dev, br_vlan_match_bind_vlan_dev_fn, &priv); 
rcu_read_unlock(); return data.result; } static bool br_vlan_is_dev_up(const struct net_device *dev) { return !!(dev->flags & IFF_UP) && netif_oper_up(dev); } static void br_vlan_set_vlan_dev_state(const struct net_bridge *br, struct net_device *vlan_dev) { u16 vid = vlan_dev_priv(vlan_dev)->vlan_id; struct net_bridge_vlan_group *vg; struct net_bridge_port *p; bool has_carrier = false; if (!netif_carrier_ok(br->dev)) { netif_carrier_off(vlan_dev); return; } list_for_each_entry(p, &br->port_list, list) { vg = nbp_vlan_group(p); if (br_vlan_find(vg, vid) && br_vlan_is_dev_up(p->dev)) { has_carrier = true; break; } } if (has_carrier) netif_carrier_on(vlan_dev); else netif_carrier_off(vlan_dev); } static void br_vlan_set_all_vlan_dev_state(struct net_bridge_port *p) { struct net_bridge_vlan_group *vg = nbp_vlan_group(p); struct net_bridge_vlan *vlan; struct net_device *vlan_dev; list_for_each_entry(vlan, &vg->vlan_list, vlist) { vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, vlan->vid); if (vlan_dev) { if (br_vlan_is_dev_up(p->dev)) { if (netif_carrier_ok(p->br->dev)) netif_carrier_on(vlan_dev); } else { br_vlan_set_vlan_dev_state(p->br, vlan_dev); } } } } static void br_vlan_toggle_bridge_binding(struct net_device *br_dev, bool enable) { struct net_bridge *br = netdev_priv(br_dev); if (enable) br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, true); else br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, br_vlan_has_upper_bind_vlan_dev(br_dev)); } static void br_vlan_upper_change(struct net_device *dev, struct net_device *upper_dev, bool linking) { struct net_bridge *br = netdev_priv(dev); if (!br_vlan_is_bind_vlan_dev(upper_dev)) return; br_vlan_toggle_bridge_binding(dev, linking); if (linking) br_vlan_set_vlan_dev_state(br, upper_dev); } struct br_vlan_link_state_walk_data { struct net_bridge *br; }; static int br_vlan_link_state_change_fn(struct net_device *vlan_dev, struct netdev_nested_priv *priv) { struct br_vlan_link_state_walk_data *data = priv->data; if (br_vlan_is_bind_vlan_dev(vlan_dev)) br_vlan_set_vlan_dev_state(data->br, vlan_dev); return 0; } static void br_vlan_link_state_change(struct net_device *dev, struct net_bridge *br) { struct br_vlan_link_state_walk_data data = { .br = br }; struct netdev_nested_priv priv = { .data = (void *)&data, }; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(dev, br_vlan_link_state_change_fn, &priv); rcu_read_unlock(); } /* Must be protected by RTNL. */ static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid) { struct net_device *vlan_dev; if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING)) return; vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, vid); if (vlan_dev) br_vlan_set_vlan_dev_state(p->br, vlan_dev); } /* Must be protected by RTNL. 
*/ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) { struct netdev_notifier_changeupper_info *info; struct net_bridge *br = netdev_priv(dev); int vlcmd = 0, ret = 0; bool changed = false; switch (event) { case NETDEV_REGISTER: ret = br_vlan_add(br, br->default_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, &changed, NULL); vlcmd = RTM_NEWVLAN; break; case NETDEV_UNREGISTER: changed = !br_vlan_delete(br, br->default_pvid); vlcmd = RTM_DELVLAN; break; case NETDEV_CHANGEUPPER: info = ptr; br_vlan_upper_change(dev, info->upper_dev, info->linking); break; case NETDEV_CHANGE: case NETDEV_UP: if (!br_opt_get(br, BROPT_VLAN_BRIDGE_BINDING)) break; br_vlan_link_state_change(dev, br); break; } if (changed) br_vlan_notify(br, NULL, br->default_pvid, 0, vlcmd); return ret; } void br_vlan_vlan_upper_event(struct net_device *br_dev, struct net_device *vlan_dev, unsigned long event) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlan_dev); struct net_bridge *br = netdev_priv(br_dev); bool bridge_binding; switch (event) { case NETDEV_CHANGE: case NETDEV_UP: break; default: return; } bridge_binding = vlan->flags & VLAN_FLAG_BRIDGE_BINDING; br_vlan_toggle_bridge_binding(br_dev, bridge_binding); if (bridge_binding) br_vlan_set_vlan_dev_state(br, vlan_dev); else if (!bridge_binding && netif_carrier_ok(br_dev)) netif_carrier_on(vlan_dev); } /* Must be protected by RTNL. */ void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) { if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING)) return; switch (event) { case NETDEV_CHANGE: case NETDEV_DOWN: case NETDEV_UP: br_vlan_set_all_vlan_dev_state(p); break; } } static bool br_vlan_stats_fill(struct sk_buff *skb, const struct net_bridge_vlan *v) { struct pcpu_sw_netstats stats; struct nlattr *nest; nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_STATS); if (!nest) return false; br_vlan_get_stats(v, &stats); if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_BYTES, u64_stats_read(&stats.rx_bytes), BRIDGE_VLANDB_STATS_PAD) || nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_PACKETS, u64_stats_read(&stats.rx_packets), BRIDGE_VLANDB_STATS_PAD) || nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_BYTES, u64_stats_read(&stats.tx_bytes), BRIDGE_VLANDB_STATS_PAD) || nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_PACKETS, u64_stats_read(&stats.tx_packets), BRIDGE_VLANDB_STATS_PAD)) goto out_err; nla_nest_end(skb, nest); return true; out_err: nla_nest_cancel(skb, nest); return false; } /* v_opts is used to dump the options which must be equal in the whole range */ static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range, const struct net_bridge_vlan *v_opts, const struct net_bridge_port *p, u16 flags, bool dump_stats) { struct bridge_vlan_info info; struct nlattr *nest; nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY); if (!nest) return false; memset(&info, 0, sizeof(info)); info.vid = vid; if (flags & BRIDGE_VLAN_INFO_UNTAGGED) info.flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (flags & BRIDGE_VLAN_INFO_PVID) info.flags |= BRIDGE_VLAN_INFO_PVID; if (nla_put(skb, BRIDGE_VLANDB_ENTRY_INFO, sizeof(info), &info)) goto out_err; if (vid_range && vid < vid_range && !(flags & BRIDGE_VLAN_INFO_PVID) && nla_put_u16(skb, BRIDGE_VLANDB_ENTRY_RANGE, vid_range)) goto out_err; if (v_opts) { if (!br_vlan_opts_fill(skb, v_opts, p)) goto out_err; if (dump_stats && !br_vlan_stats_fill(skb, v_opts)) goto out_err; } nla_nest_end(skb, nest); return true; out_err: nla_nest_cancel(skb, nest); return false; } static 
size_t rtnl_vlan_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct br_vlan_msg)) + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY */ + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_ENTRY_RANGE */ + nla_total_size(sizeof(struct bridge_vlan_info)) /* BRIDGE_VLANDB_ENTRY_INFO */ + br_vlan_opts_nl_size(); /* bridge vlan options */ } void br_vlan_notify(const struct net_bridge *br, const struct net_bridge_port *p, u16 vid, u16 vid_range, int cmd) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v = NULL; struct br_vlan_msg *bvm; struct nlmsghdr *nlh; struct sk_buff *skb; int err = -ENOBUFS; struct net *net; u16 flags = 0; int ifindex; /* right now notifications are done only with rtnl held */ ASSERT_RTNL(); if (p) { ifindex = p->dev->ifindex; vg = nbp_vlan_group(p); net = dev_net(p->dev); } else { ifindex = br->dev->ifindex; vg = br_vlan_group(br); net = dev_net(br->dev); } skb = nlmsg_new(rtnl_vlan_nlmsg_size(), GFP_KERNEL); if (!skb) goto out_err; err = -EMSGSIZE; nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*bvm), 0); if (!nlh) goto out_err; bvm = nlmsg_data(nlh); memset(bvm, 0, sizeof(*bvm)); bvm->family = AF_BRIDGE; bvm->ifindex = ifindex; switch (cmd) { case RTM_NEWVLAN: /* need to find the vlan due to flags/options */ v = br_vlan_find(vg, vid); if (!v || !br_vlan_should_use(v)) goto out_kfree; flags = v->flags; if (br_get_pvid(vg) == v->vid) flags |= BRIDGE_VLAN_INFO_PVID; break; case RTM_DELVLAN: break; default: goto out_kfree; } if (!br_vlan_fill_vids(skb, vid, vid_range, v, p, flags, false)) goto out_err; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL); return; out_err: rtnl_set_sk_err(net, RTNLGRP_BRVLAN, err); out_kfree: kfree_skb(skb); } /* check if v_curr can enter a range ending in range_end */ bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end) { return v_curr->vid - range_end->vid == 1 && range_end->flags == v_curr->flags && br_vlan_opts_eq_range(v_curr, range_end); } static int br_vlan_dump_dev(const struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb, u32 dump_flags) { struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL; bool dump_global = !!(dump_flags & BRIDGE_VLANDB_DUMPF_GLOBAL); bool dump_stats = !!(dump_flags & BRIDGE_VLANDB_DUMPF_STATS); struct net_bridge_vlan_group *vg; int idx = 0, s_idx = cb->args[1]; struct nlmsghdr *nlh = NULL; struct net_bridge_port *p; struct br_vlan_msg *bvm; struct net_bridge *br; int err = 0; u16 pvid; if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) return -EINVAL; if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group_rcu(br); p = NULL; } else { /* global options are dumped only for bridge devices */ if (dump_global) return 0; p = br_port_get_rcu(dev); if (WARN_ON(!p)) return -EINVAL; vg = nbp_vlan_group_rcu(p); br = p->br; } if (!vg) return 0; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWVLAN, sizeof(*bvm), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; bvm = nlmsg_data(nlh); memset(bvm, 0, sizeof(*bvm)); bvm->family = PF_BRIDGE; bvm->ifindex = dev->ifindex; pvid = br_get_pvid(vg); /* idx must stay at range's beginning until it is filled in */ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { if (!dump_global && !br_vlan_should_use(v)) continue; if (idx < s_idx) { idx++; continue; } if (!range_start) { range_start = v; range_end = v; continue; } if (dump_global) { if (br_vlan_global_opts_can_enter_range(v, range_end)) goto update_end; if 
(!br_vlan_global_opts_fill(skb, range_start->vid, range_end->vid, range_start)) { err = -EMSGSIZE; break; } /* advance number of filled vlans */ idx += range_end->vid - range_start->vid + 1; range_start = v; } else if (dump_stats || v->vid == pvid || !br_vlan_can_enter_range(v, range_end)) { u16 vlan_flags = br_vlan_flags(range_start, pvid); if (!br_vlan_fill_vids(skb, range_start->vid, range_end->vid, range_start, p, vlan_flags, dump_stats)) { err = -EMSGSIZE; break; } /* advance number of filled vlans */ idx += range_end->vid - range_start->vid + 1; range_start = v; } update_end: range_end = v; } /* err will be 0 and range_start will be set in 3 cases here: * - first vlan (range_start == range_end) * - last vlan (range_start == range_end, not in range) * - last vlan range (range_start != range_end, in range) */ if (!err && range_start) { if (dump_global && !br_vlan_global_opts_fill(skb, range_start->vid, range_end->vid, range_start)) err = -EMSGSIZE; else if (!dump_global && !br_vlan_fill_vids(skb, range_start->vid, range_end->vid, range_start, p, br_vlan_flags(range_start, pvid), dump_stats)) err = -EMSGSIZE; } cb->args[1] = err ? idx : 0; nlmsg_end(skb, nlh); return err; } static const struct nla_policy br_vlan_db_dump_pol[BRIDGE_VLANDB_DUMP_MAX + 1] = { [BRIDGE_VLANDB_DUMP_FLAGS] = { .type = NLA_U32 }, }; static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *dtb[BRIDGE_VLANDB_DUMP_MAX + 1]; int idx = 0, err = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); struct br_vlan_msg *bvm; struct net_device *dev; u32 dump_flags = 0; err = nlmsg_parse(cb->nlh, sizeof(*bvm), dtb, BRIDGE_VLANDB_DUMP_MAX, br_vlan_db_dump_pol, cb->extack); if (err < 0) return err; bvm = nlmsg_data(cb->nlh); if (dtb[BRIDGE_VLANDB_DUMP_FLAGS]) dump_flags = nla_get_u32(dtb[BRIDGE_VLANDB_DUMP_FLAGS]); rcu_read_lock(); if (bvm->ifindex) { dev = dev_get_by_index_rcu(net, bvm->ifindex); if (!dev) { err = -ENODEV; goto out_err; } err = br_vlan_dump_dev(dev, skb, cb, dump_flags); /* if the dump completed without an error we return 0 here */ if (err != -EMSGSIZE) goto out_err; } else { for_each_netdev_rcu(net, dev) { if (idx < s_idx) goto skip; err = br_vlan_dump_dev(dev, skb, cb, dump_flags); if (err == -EMSGSIZE) break; skip: idx++; } } cb->args[0] = idx; rcu_read_unlock(); return skb->len; out_err: rcu_read_unlock(); return err; } static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = { [BRIDGE_VLANDB_ENTRY_INFO] = NLA_POLICY_EXACT_LEN(sizeof(struct bridge_vlan_info)), [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 }, [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 }, [BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED }, [BRIDGE_VLANDB_ENTRY_MCAST_ROUTER] = { .type = NLA_U8 }, [BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS] = { .type = NLA_REJECT }, [BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS] = { .type = NLA_U32 }, [BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1), }; static int br_vlan_rtm_process_one(struct net_device *dev, const struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { struct bridge_vlan_info *vinfo, vrange_end, *vinfo_last = NULL; struct nlattr *tb[BRIDGE_VLANDB_ENTRY_MAX + 1]; bool changed = false, skip_processing = false; struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; int err = 0, cmdmap = 0; struct net_bridge *br; if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group(br); } else { p = br_port_get_rtnl(dev); if (WARN_ON(!p)) return -ENODEV; br = p->br; vg = 
nbp_vlan_group(p); } if (WARN_ON(!vg)) return -ENODEV; err = nla_parse_nested(tb, BRIDGE_VLANDB_ENTRY_MAX, attr, br_vlan_db_policy, extack); if (err) return err; if (!tb[BRIDGE_VLANDB_ENTRY_INFO]) { NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry info"); return -EINVAL; } memset(&vrange_end, 0, sizeof(vrange_end)); vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]); if (vinfo->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN | BRIDGE_VLAN_INFO_RANGE_END)) { NL_SET_ERR_MSG_MOD(extack, "Old-style vlan ranges are not allowed when using RTM vlan calls"); return -EINVAL; } if (!br_vlan_valid_id(vinfo->vid, extack)) return -EINVAL; if (tb[BRIDGE_VLANDB_ENTRY_RANGE]) { vrange_end.vid = nla_get_u16(tb[BRIDGE_VLANDB_ENTRY_RANGE]); /* validate user-provided flags without RANGE_BEGIN */ vrange_end.flags = BRIDGE_VLAN_INFO_RANGE_END | vinfo->flags; vinfo->flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN; /* vinfo_last is the range start, vinfo the range end */ vinfo_last = vinfo; vinfo = &vrange_end; if (!br_vlan_valid_id(vinfo->vid, extack) || !br_vlan_valid_range(vinfo, vinfo_last, extack)) return -EINVAL; } switch (cmd) { case RTM_NEWVLAN: cmdmap = RTM_SETLINK; skip_processing = !!(vinfo->flags & BRIDGE_VLAN_INFO_ONLY_OPTS); break; case RTM_DELVLAN: cmdmap = RTM_DELLINK; break; } if (!skip_processing) { struct bridge_vlan_info *tmp_last = vinfo_last; /* br_process_vlan_info may overwrite vinfo_last */ err = br_process_vlan_info(br, p, cmdmap, vinfo, &tmp_last, &changed, extack); /* notify first if anything changed */ if (changed) br_ifinfo_notify(cmdmap, br, p); if (err) return err; } /* deal with options */ if (cmd == RTM_NEWVLAN) { struct net_bridge_vlan *range_start, *range_end; if (vinfo_last) { range_start = br_vlan_find(vg, vinfo_last->vid); range_end = br_vlan_find(vg, vinfo->vid); } else { range_start = br_vlan_find(vg, vinfo->vid); range_end = range_start; } err = br_vlan_process_options(br, p, range_start, range_end, tb, extack); } return err; } static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct br_vlan_msg *bvm; struct net_device *dev; struct nlattr *attr; int err, vlans = 0; int rem; /* this should validate the header and check for remaining bytes */ err = nlmsg_parse(nlh, sizeof(*bvm), NULL, BRIDGE_VLANDB_MAX, NULL, extack); if (err < 0) return err; bvm = nlmsg_data(nlh); dev = __dev_get_by_index(net, bvm->ifindex); if (!dev) return -ENODEV; if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) { NL_SET_ERR_MSG_MOD(extack, "The device is not a valid bridge or bridge port"); return -EINVAL; } nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) { switch (nla_type(attr)) { case BRIDGE_VLANDB_ENTRY: err = br_vlan_rtm_process_one(dev, attr, nlh->nlmsg_type, extack); break; case BRIDGE_VLANDB_GLOBAL_OPTIONS: err = br_vlan_rtm_process_global_options(dev, attr, nlh->nlmsg_type, extack); break; default: continue; } vlans++; if (err) break; } if (!vlans) { NL_SET_ERR_MSG_MOD(extack, "No vlans found to process"); err = -EINVAL; } return err; } static const struct rtnl_msg_handler br_vlan_rtnl_msg_handlers[] = { {THIS_MODULE, PF_BRIDGE, RTM_NEWVLAN, br_vlan_rtm_process, NULL, 0}, {THIS_MODULE, PF_BRIDGE, RTM_DELVLAN, br_vlan_rtm_process, NULL, 0}, {THIS_MODULE, PF_BRIDGE, RTM_GETVLAN, NULL, br_vlan_rtm_dump, 0}, }; int br_vlan_rtnl_init(void) { return rtnl_register_many(br_vlan_rtnl_msg_handlers); } void br_vlan_rtnl_uninit(void) { rtnl_unregister_many(br_vlan_rtnl_msg_handlers); }
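The helpers exported from br_vlan.c above (br_vlan_enabled(), br_vlan_get_proto(), br_vlan_get_pvid(), br_vlan_get_info()) are the intended entry points for other kernel code that needs bridge VLAN state. Below is a minimal illustrative sketch, not taken from the tree, of how a hypothetical caller might combine them for a bridge master device; the function name collect_bridge_vlan_state() is an assumption made for the example.

/*
 * Illustrative only: query bridge VLAN state for a bridge master device
 * using the helpers exported by br_vlan.c above. br_vlan_get_pvid() and
 * br_vlan_get_info() also accept bridge port devices. Caller must hold
 * RTNL; the *_rcu variants exist for RCU read-side contexts.
 */
static void collect_bridge_vlan_state(const struct net_device *br_dev)
{
	struct bridge_vlan_info vinfo;
	u16 proto, pvid;

	ASSERT_RTNL();

	if (!br_vlan_enabled(br_dev))
		return;				/* VLAN filtering disabled */

	if (br_vlan_get_proto(br_dev, &proto))
		return;
	/* proto is ETH_P_8021Q or ETH_P_8021AD, host byte order */

	if (br_vlan_get_pvid(br_dev, &pvid) || !pvid)
		return;				/* no PVID configured */

	/* flags may include e.g. BRIDGE_VLAN_INFO_UNTAGGED or _PVID */
	if (!br_vlan_get_info(br_dev, pvid, &vinfo))
		pr_debug("pvid %u flags 0x%x proto 0x%x\n",
			 pvid, vinfo.flags, proto);
}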
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _PFXLEN_H
#define _PFXLEN_H

#include <asm/byteorder.h>
#include <linux/netfilter.h>
#include <net/tcp.h>

/* Prefixlen maps, by Jan Engelhardt */
extern const union nf_inet_addr ip_set_netmask_map[];
extern const union nf_inet_addr ip_set_hostmask_map[];

static inline __be32
ip_set_netmask(u8 pfxlen)
{
	return ip_set_netmask_map[pfxlen].ip;
}

static inline const __be32 *
ip_set_netmask6(u8 pfxlen)
{
	return &ip_set_netmask_map[pfxlen].ip6[0];
}

static inline u32
ip_set_hostmask(u8 pfxlen)
{
	return (__force u32) ip_set_hostmask_map[pfxlen].ip;
}

static inline const __be32 *
ip_set_hostmask6(u8 pfxlen)
{
	return &ip_set_hostmask_map[pfxlen].ip6[0];
}

extern u32 ip_set_range_to_cidr(u32 from, u32 to, u8 *cidr);

#define ip_set_mask_from_to(from, to, cidr)	\
do {						\
	from &= ip_set_hostmask(cidr);		\
	to = from | ~ip_set_hostmask(cidr);	\
} while (0)

static inline void
ip6_netmask(union nf_inet_addr *ip, u8 prefix)
{
	ip->ip6[0] &= ip_set_netmask6(prefix)[0];
	ip->ip6[1] &= ip_set_netmask6(prefix)[1];
	ip->ip6[2] &= ip_set_netmask6(prefix)[2];
	ip->ip6[3] &= ip_set_netmask6(prefix)[3];
}

#endif /*_PFXLEN_H */
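The ip_set_mask_from_to() macro above expands one address plus a prefix length into the first and last addresses of the covered network. A minimal sketch of the effect follows; it assumes ip_set_hostmask() returns the prefix mask in host byte order (e.g. 0xffffff00 for /24), which is the convention the macro relies on, and the concrete addresses are purely illustrative.

/*
 * Illustrative only: expand 192.168.1.5/24 into its covered range with
 * ip_set_mask_from_to(). Assumes ip_set_hostmask(24) == 0xffffff00.
 */
static inline void pfxlen_example(void)
{
	u32 from = 0xc0a80105;	/* 192.168.1.5, host byte order */
	u32 to = from;

	ip_set_mask_from_to(from, to, 24);
	/* from == 0xc0a80100 (192.168.1.0), to == 0xc0a801ff (192.168.1.255) */
}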
34 34 25 12 34 33 17270 17251 25 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 // SPDX-License-Identifier: GPL-2.0 /* * SafeSetID Linux Security Module * * Author: Micah Morton <mortonm@chromium.org> * * Copyright (C) 2018 The Chromium OS Authors. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, as * published by the Free Software Foundation. * */ #define pr_fmt(fmt) "SafeSetID: " fmt #include <linux/lsm_hooks.h> #include <linux/module.h> #include <linux/ptrace.h> #include <linux/sched/task_stack.h> #include <linux/security.h> #include <uapi/linux/lsm.h> #include "lsm.h" /* Flag indicating whether initialization completed */ int safesetid_initialized __initdata; struct setid_ruleset __rcu *safesetid_setuid_rules; struct setid_ruleset __rcu *safesetid_setgid_rules; /* Compute a decision for a transition from @src to @dst under @policy. */ enum sid_policy_type _setid_policy_lookup(struct setid_ruleset *policy, kid_t src, kid_t dst) { struct setid_rule *rule; enum sid_policy_type result = SIDPOL_DEFAULT; if (policy->type == UID) { hash_for_each_possible(policy->rules, rule, next, __kuid_val(src.uid)) { if (!uid_eq(rule->src_id.uid, src.uid)) continue; if (uid_eq(rule->dst_id.uid, dst.uid)) return SIDPOL_ALLOWED; result = SIDPOL_CONSTRAINED; } } else if (policy->type == GID) { hash_for_each_possible(policy->rules, rule, next, __kgid_val(src.gid)) { if (!gid_eq(rule->src_id.gid, src.gid)) continue; if (gid_eq(rule->dst_id.gid, dst.gid)){ return SIDPOL_ALLOWED; } result = SIDPOL_CONSTRAINED; } } else { /* Should not reach here, report the ID as contrainsted */ result = SIDPOL_CONSTRAINED; } return result; } /* * Compute a decision for a transition from @src to @dst under the active * policy. */ static enum sid_policy_type setid_policy_lookup(kid_t src, kid_t dst, enum setid_type new_type) { enum sid_policy_type result = SIDPOL_DEFAULT; struct setid_ruleset *pol; rcu_read_lock(); if (new_type == UID) pol = rcu_dereference(safesetid_setuid_rules); else if (new_type == GID) pol = rcu_dereference(safesetid_setgid_rules); else { /* Should not reach here */ result = SIDPOL_CONSTRAINED; rcu_read_unlock(); return result; } if (pol) { pol->type = new_type; result = _setid_policy_lookup(pol, src, dst); } rcu_read_unlock(); return result; } static int safesetid_security_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { /* We're only interested in CAP_SETUID and CAP_SETGID. 
*/ if (cap != CAP_SETUID && cap != CAP_SETGID) return 0; /* * If CAP_SET{U/G}ID is currently used for a setid or setgroups syscall, we * want to let it go through here; the real security check happens later, in * the task_fix_set{u/g}id or task_fix_setgroups hooks. */ if ((opts & CAP_OPT_INSETID) != 0) return 0; switch (cap) { case CAP_SETUID: /* * If no policy applies to this task, allow the use of CAP_SETUID for * other purposes. */ if (setid_policy_lookup((kid_t){.uid = cred->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT) return 0; /* * Reject use of CAP_SETUID for functionality other than calling * set*uid() (e.g. setting up userns uid mappings). */ pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions\n", __kuid_val(cred->uid)); return -EPERM; case CAP_SETGID: /* * If no policy applies to this task, allow the use of CAP_SETGID for * other purposes. */ if (setid_policy_lookup((kid_t){.gid = cred->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; /* * Reject use of CAP_SETUID for functionality other than calling * set*gid() (e.g. setting up userns gid mappings). */ pr_warn("Operation requires CAP_SETGID, which is not available to GID %u for operations besides approved set*gid transitions\n", __kgid_val(cred->gid)); return -EPERM; default: /* Error, the only capabilities were checking for is CAP_SETUID/GID */ return 0; } return 0; } /* * Check whether a caller with old credentials @old is allowed to switch to * credentials that contain @new_id. */ static bool id_permitted_for_cred(const struct cred *old, kid_t new_id, enum setid_type new_type) { bool permitted; /* If our old creds already had this ID in it, it's fine. */ if (new_type == UID) { if (uid_eq(new_id.uid, old->uid) || uid_eq(new_id.uid, old->euid) || uid_eq(new_id.uid, old->suid)) return true; } else if (new_type == GID){ if (gid_eq(new_id.gid, old->gid) || gid_eq(new_id.gid, old->egid) || gid_eq(new_id.gid, old->sgid)) return true; } else /* Error, new_type is an invalid type */ return false; /* * Transitions to new UIDs require a check against the policy of the old * RUID. */ permitted = setid_policy_lookup((kid_t){.uid = old->uid}, new_id, new_type) != SIDPOL_CONSTRAINED; if (!permitted) { if (new_type == UID) { pr_warn("UID transition ((%d,%d,%d) -> %d) blocked\n", __kuid_val(old->uid), __kuid_val(old->euid), __kuid_val(old->suid), __kuid_val(new_id.uid)); } else if (new_type == GID) { pr_warn("GID transition ((%d,%d,%d) -> %d) blocked\n", __kgid_val(old->gid), __kgid_val(old->egid), __kgid_val(old->sgid), __kgid_val(new_id.gid)); } else /* Error, new_type is an invalid type */ return false; } return permitted; } /* * Check whether there is either an exception for user under old cred struct to * set*uid to user under new cred struct, or the UID transition is allowed (by * Linux set*uid rules) even without CAP_SETUID. */ static int safesetid_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { /* Do nothing if there are no setuid restrictions for our old RUID. 
*/ if (setid_policy_lookup((kid_t){.uid = old->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT) return 0; if (id_permitted_for_cred(old, (kid_t){.uid = new->uid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->euid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->suid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->fsuid}, UID)) return 0; /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } static int safesetid_task_fix_setgid(struct cred *new, const struct cred *old, int flags) { /* Do nothing if there are no setgid restrictions for our old RGID. */ if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; if (id_permitted_for_cred(old, (kid_t){.gid = new->gid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->egid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->sgid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->fsgid}, GID)) return 0; /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } static int safesetid_task_fix_setgroups(struct cred *new, const struct cred *old) { int i; /* Do nothing if there are no setgid restrictions for our old RGID. */ if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; get_group_info(new->group_info); for (i = 0; i < new->group_info->ngroups; i++) { if (!id_permitted_for_cred(old, (kid_t){.gid = new->group_info->gid[i]}, GID)) { put_group_info(new->group_info); /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } } put_group_info(new->group_info); return 0; } static const struct lsm_id safesetid_lsmid = { .name = "safesetid", .id = LSM_ID_SAFESETID, }; static struct security_hook_list safesetid_security_hooks[] = { LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid), LSM_HOOK_INIT(task_fix_setgid, safesetid_task_fix_setgid), LSM_HOOK_INIT(task_fix_setgroups, safesetid_task_fix_setgroups), LSM_HOOK_INIT(capable, safesetid_security_capable) }; static int __init safesetid_security_init(void) { security_add_hooks(safesetid_security_hooks, ARRAY_SIZE(safesetid_security_hooks), &safesetid_lsmid); /* Report that SafeSetID successfully initialized */ safesetid_initialized = 1; return 0; } DEFINE_LSM(safesetid_security_init) = { .id = &safesetid_lsmid, .init = safesetid_security_init, .initcall_fs = safesetid_init_securityfs, };
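Seen from userspace, the hooks above mean that once any rule exists for a process's real UID, that process can only reach the explicitly allowed target IDs: any other set*uid() attempt gets -EACCES from task_fix_setuid() and the task is killed with SIGKILL. A minimal userspace sketch of that behaviour follows, assuming a single allowlist rule permitting UID 1000 to transition to UID 2000 has already been loaded through the securityfs interface set up by safesetid_init_securityfs() (policy loading itself is not shown here).

/* Illustrative only: expected behaviour under a "1000 -> 2000" rule. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/*
	 * Running with real UID 1000 and CAP_SETUID, with SafeSetID
	 * enforcing a policy that only contains the rule 1000 -> 2000.
	 */
	if (setuid(3000) < 0) {
		/*
		 * UID 3000 is not on the allowlist (SIDPOL_CONSTRAINED):
		 * safesetid_task_fix_setuid() returns -EACCES and sends
		 * SIGKILL, so in practice this branch is never reached;
		 * the process is killed instead.
		 */
		perror("setuid(3000)");
		return 1;
	}

	/* An allowed transition such as setuid(2000) proceeds normally. */
	return 0;
}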
32 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* * proc.c - procfs support for Protocol family CAN core module * * Copyright (c) 2002-2007 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/if_arp.h> #include <linux/can/can-ml.h> #include <linux/can/core.h> #include "af_can.h" /* * proc filenames for the PF_CAN core */ #define CAN_PROC_STATS "stats" #define CAN_PROC_RESET_STATS "reset_stats" #define CAN_PROC_RCVLIST_ALL "rcvlist_all" #define CAN_PROC_RCVLIST_FIL "rcvlist_fil" #define CAN_PROC_RCVLIST_INV "rcvlist_inv" #define CAN_PROC_RCVLIST_SFF "rcvlist_sff" #define CAN_PROC_RCVLIST_EFF "rcvlist_eff" #define CAN_PROC_RCVLIST_ERR "rcvlist_err" static int user_reset; static const char rx_list_name[][8] = { [RX_ERR] = "rx_err", [RX_ALL] = "rx_all", [RX_FIL] = "rx_fil", [RX_INV] = "rx_inv", }; /* * af_can statistics stuff */ static void can_init_stats(struct net *net) { struct can_pkg_stats *pkg_stats = net->can.pkg_stats; struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; /* * This memset function is called from a timer context (when * can_stattimer is active which is the default) OR in a process * context (reading the proc_fs when can_stattimer is disabled). */ memset(pkg_stats, 0, sizeof(struct can_pkg_stats)); pkg_stats->jiffies_init = jiffies; rcv_lists_stats->stats_reset++; if (user_reset) { user_reset = 0; rcv_lists_stats->user_reset++; } } static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif, unsigned long count) { if (oldjif == newjif) return 0; /* see can_stat_update() - this should NEVER happen! */ if (count > (ULONG_MAX / HZ)) { printk(KERN_ERR "can: calc_rate: count exceeded! 
%ld\n", count); return 99999999; } return (count * HZ) / (newjif - oldjif); } void can_stat_update(struct timer_list *t) { struct net *net = timer_container_of(net, t, can.stattimer); struct can_pkg_stats *pkg_stats = net->can.pkg_stats; unsigned long j = jiffies; /* snapshot */ long rx_frames = atomic_long_read(&pkg_stats->rx_frames); long tx_frames = atomic_long_read(&pkg_stats->tx_frames); long matches = atomic_long_read(&pkg_stats->matches); long rx_frames_delta = atomic_long_read(&pkg_stats->rx_frames_delta); long tx_frames_delta = atomic_long_read(&pkg_stats->tx_frames_delta); long matches_delta = atomic_long_read(&pkg_stats->matches_delta); /* restart counting in timer context on user request */ if (user_reset) can_init_stats(net); /* restart counting on jiffies overflow */ if (j < pkg_stats->jiffies_init) can_init_stats(net); /* prevent overflow in calc_rate() */ if (rx_frames > (LONG_MAX / HZ)) can_init_stats(net); /* prevent overflow in calc_rate() */ if (tx_frames > (LONG_MAX / HZ)) can_init_stats(net); /* matches overflow - very improbable */ if (matches > (LONG_MAX / 100)) can_init_stats(net); /* calc total values */ if (rx_frames) pkg_stats->total_rx_match_ratio = (matches * 100) / rx_frames; pkg_stats->total_tx_rate = calc_rate(pkg_stats->jiffies_init, j, tx_frames); pkg_stats->total_rx_rate = calc_rate(pkg_stats->jiffies_init, j, rx_frames); /* calc current values */ if (rx_frames_delta) pkg_stats->current_rx_match_ratio = (matches_delta * 100) / rx_frames_delta; pkg_stats->current_tx_rate = calc_rate(0, HZ, tx_frames_delta); pkg_stats->current_rx_rate = calc_rate(0, HZ, rx_frames_delta); /* check / update maximum values */ if (pkg_stats->max_tx_rate < pkg_stats->current_tx_rate) pkg_stats->max_tx_rate = pkg_stats->current_tx_rate; if (pkg_stats->max_rx_rate < pkg_stats->current_rx_rate) pkg_stats->max_rx_rate = pkg_stats->current_rx_rate; if (pkg_stats->max_rx_match_ratio < pkg_stats->current_rx_match_ratio) pkg_stats->max_rx_match_ratio = pkg_stats->current_rx_match_ratio; /* clear values for 'current rate' calculation */ atomic_long_set(&pkg_stats->tx_frames_delta, 0); atomic_long_set(&pkg_stats->rx_frames_delta, 0); atomic_long_set(&pkg_stats->matches_delta, 0); /* restart timer (one second) */ mod_timer(&net->can.stattimer, round_jiffies(jiffies + HZ)); } /* * proc read functions */ static void can_print_rcvlist(struct seq_file *m, struct hlist_head *rx_list, struct net_device *dev) { struct receiver *r; hlist_for_each_entry_rcu(r, rx_list, list) { char *fmt = (r->can_id & CAN_EFF_FLAG)? " %-5s %08x %08x %pK %pK %8ld %s\n" : " %-5s %03x %08x %pK %pK %8ld %s\n"; seq_printf(m, fmt, DNAME(dev), r->can_id, r->mask, r->func, r->data, r->matches, r->ident); } } static void can_print_recv_banner(struct seq_file *m) { /* * can1. 00000000 00000000 00000000 * ....... 
0 tp20 */ if (IS_ENABLED(CONFIG_64BIT)) seq_puts(m, " device can_id can_mask function userdata matches ident\n"); else seq_puts(m, " device can_id can_mask function userdata matches ident\n"); } static int can_stats_proc_show(struct seq_file *m, void *v) { struct net *net = m->private; struct can_pkg_stats *pkg_stats = net->can.pkg_stats; struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; seq_putc(m, '\n'); seq_printf(m, " %8ld transmitted frames (TXF)\n", atomic_long_read(&pkg_stats->tx_frames)); seq_printf(m, " %8ld received frames (RXF)\n", atomic_long_read(&pkg_stats->rx_frames)); seq_printf(m, " %8ld matched frames (RXMF)\n", atomic_long_read(&pkg_stats->matches)); seq_putc(m, '\n'); if (net->can.stattimer.function == can_stat_update) { seq_printf(m, " %8ld %% total match ratio (RXMR)\n", pkg_stats->total_rx_match_ratio); seq_printf(m, " %8ld frames/s total tx rate (TXR)\n", pkg_stats->total_tx_rate); seq_printf(m, " %8ld frames/s total rx rate (RXR)\n", pkg_stats->total_rx_rate); seq_putc(m, '\n'); seq_printf(m, " %8ld %% current match ratio (CRXMR)\n", pkg_stats->current_rx_match_ratio); seq_printf(m, " %8ld frames/s current tx rate (CTXR)\n", pkg_stats->current_tx_rate); seq_printf(m, " %8ld frames/s current rx rate (CRXR)\n", pkg_stats->current_rx_rate); seq_putc(m, '\n'); seq_printf(m, " %8ld %% max match ratio (MRXMR)\n", pkg_stats->max_rx_match_ratio); seq_printf(m, " %8ld frames/s max tx rate (MTXR)\n", pkg_stats->max_tx_rate); seq_printf(m, " %8ld frames/s max rx rate (MRXR)\n", pkg_stats->max_rx_rate); seq_putc(m, '\n'); } seq_printf(m, " %8ld current receive list entries (CRCV)\n", rcv_lists_stats->rcv_entries); seq_printf(m, " %8ld maximum receive list entries (MRCV)\n", rcv_lists_stats->rcv_entries_max); if (rcv_lists_stats->stats_reset) seq_printf(m, "\n %8ld statistic resets (STR)\n", rcv_lists_stats->stats_reset); if (rcv_lists_stats->user_reset) seq_printf(m, " %8ld user statistic resets (USTR)\n", rcv_lists_stats->user_reset); seq_putc(m, '\n'); return 0; } static int can_reset_stats_proc_show(struct seq_file *m, void *v) { struct net *net = m->private; struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; struct can_pkg_stats *pkg_stats = net->can.pkg_stats; user_reset = 1; if (net->can.stattimer.function == can_stat_update) { seq_printf(m, "Scheduled statistic reset #%ld.\n", rcv_lists_stats->stats_reset + 1); } else { if (pkg_stats->jiffies_init != jiffies) can_init_stats(net); seq_printf(m, "Performed statistic reset #%ld.\n", rcv_lists_stats->stats_reset); } return 0; } static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx, struct net_device *dev, struct can_dev_rcv_lists *dev_rcv_lists) { if (!hlist_empty(&dev_rcv_lists->rx[idx])) { can_print_recv_banner(m); can_print_rcvlist(m, &dev_rcv_lists->rx[idx], dev); } else seq_printf(m, " (%s: no entry)\n", DNAME(dev)); } static int can_rcvlist_proc_show(struct seq_file *m, void *v) { /* double cast to prevent GCC warning */ int idx = (int)(long)pde_data(m->file->f_inode); struct net_device *dev; struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = m->private; seq_printf(m, "\nreceive list '%s':\n", rx_list_name[idx]); rcu_read_lock(); /* receive list for 'all' CAN devices (dev == NULL) */ dev_rcv_lists = net->can.rx_alldev_list; can_rcvlist_proc_show_one(m, idx, NULL, dev_rcv_lists); /* receive list for registered CAN devices */ for_each_netdev_rcu(net, dev) { struct can_ml_priv *can_ml = can_get_ml_priv(dev); if (can_ml) can_rcvlist_proc_show_one(m, 
idx, dev, &can_ml->dev_rcv_lists); } rcu_read_unlock(); seq_putc(m, '\n'); return 0; } static inline void can_rcvlist_proc_show_array(struct seq_file *m, struct net_device *dev, struct hlist_head *rcv_array, unsigned int rcv_array_sz) { unsigned int i; int all_empty = 1; /* check whether at least one list is non-empty */ for (i = 0; i < rcv_array_sz; i++) if (!hlist_empty(&rcv_array[i])) { all_empty = 0; break; } if (!all_empty) { can_print_recv_banner(m); for (i = 0; i < rcv_array_sz; i++) { if (!hlist_empty(&rcv_array[i])) can_print_rcvlist(m, &rcv_array[i], dev); } } else seq_printf(m, " (%s: no entry)\n", DNAME(dev)); } static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v) { struct net_device *dev; struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = m->private; /* RX_SFF */ seq_puts(m, "\nreceive list 'rx_sff':\n"); rcu_read_lock(); /* sff receive list for 'all' CAN devices (dev == NULL) */ dev_rcv_lists = net->can.rx_alldev_list; can_rcvlist_proc_show_array(m, NULL, dev_rcv_lists->rx_sff, ARRAY_SIZE(dev_rcv_lists->rx_sff)); /* sff receive list for registered CAN devices */ for_each_netdev_rcu(net, dev) { struct can_ml_priv *can_ml = can_get_ml_priv(dev); if (can_ml) { dev_rcv_lists = &can_ml->dev_rcv_lists; can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_sff, ARRAY_SIZE(dev_rcv_lists->rx_sff)); } } rcu_read_unlock(); seq_putc(m, '\n'); return 0; } static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v) { struct net_device *dev; struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = m->private; /* RX_EFF */ seq_puts(m, "\nreceive list 'rx_eff':\n"); rcu_read_lock(); /* eff receive list for 'all' CAN devices (dev == NULL) */ dev_rcv_lists = net->can.rx_alldev_list; can_rcvlist_proc_show_array(m, NULL, dev_rcv_lists->rx_eff, ARRAY_SIZE(dev_rcv_lists->rx_eff)); /* eff receive list for registered CAN devices */ for_each_netdev_rcu(net, dev) { struct can_ml_priv *can_ml = can_get_ml_priv(dev); if (can_ml) { dev_rcv_lists = &can_ml->dev_rcv_lists; can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_eff, ARRAY_SIZE(dev_rcv_lists->rx_eff)); } } rcu_read_unlock(); seq_putc(m, '\n'); return 0; } /* * can_init_proc - create main CAN proc directory and procfs entries */ void can_init_proc(struct net *net) { /* create /proc/net/can directory */ net->can.proc_dir = proc_net_mkdir(net, "can", net->proc_net); if (!net->can.proc_dir) { printk(KERN_INFO "can: failed to create /proc/net/can . 
" "CONFIG_PROC_FS missing?\n"); return; } /* own procfs entries from the AF_CAN core */ net->can.pde_stats = proc_create_net_single(CAN_PROC_STATS, 0644, net->can.proc_dir, can_stats_proc_show, NULL); net->can.pde_reset_stats = proc_create_net_single(CAN_PROC_RESET_STATS, 0644, net->can.proc_dir, can_reset_stats_proc_show, NULL); net->can.pde_rcvlist_err = proc_create_net_single(CAN_PROC_RCVLIST_ERR, 0644, net->can.proc_dir, can_rcvlist_proc_show, (void *)RX_ERR); net->can.pde_rcvlist_all = proc_create_net_single(CAN_PROC_RCVLIST_ALL, 0644, net->can.proc_dir, can_rcvlist_proc_show, (void *)RX_ALL); net->can.pde_rcvlist_fil = proc_create_net_single(CAN_PROC_RCVLIST_FIL, 0644, net->can.proc_dir, can_rcvlist_proc_show, (void *)RX_FIL); net->can.pde_rcvlist_inv = proc_create_net_single(CAN_PROC_RCVLIST_INV, 0644, net->can.proc_dir, can_rcvlist_proc_show, (void *)RX_INV); net->can.pde_rcvlist_eff = proc_create_net_single(CAN_PROC_RCVLIST_EFF, 0644, net->can.proc_dir, can_rcvlist_eff_proc_show, NULL); net->can.pde_rcvlist_sff = proc_create_net_single(CAN_PROC_RCVLIST_SFF, 0644, net->can.proc_dir, can_rcvlist_sff_proc_show, NULL); } /* * can_remove_proc - remove procfs entries and main CAN proc directory */ void can_remove_proc(struct net *net) { if (!net->can.proc_dir) return; if (net->can.pde_stats) remove_proc_entry(CAN_PROC_STATS, net->can.proc_dir); if (net->can.pde_reset_stats) remove_proc_entry(CAN_PROC_RESET_STATS, net->can.proc_dir); if (net->can.pde_rcvlist_err) remove_proc_entry(CAN_PROC_RCVLIST_ERR, net->can.proc_dir); if (net->can.pde_rcvlist_all) remove_proc_entry(CAN_PROC_RCVLIST_ALL, net->can.proc_dir); if (net->can.pde_rcvlist_fil) remove_proc_entry(CAN_PROC_RCVLIST_FIL, net->can.proc_dir); if (net->can.pde_rcvlist_inv) remove_proc_entry(CAN_PROC_RCVLIST_INV, net->can.proc_dir); if (net->can.pde_rcvlist_eff) remove_proc_entry(CAN_PROC_RCVLIST_EFF, net->can.proc_dir); if (net->can.pde_rcvlist_sff) remove_proc_entry(CAN_PROC_RCVLIST_SFF, net->can.proc_dir); remove_proc_entry("can", net->proc_net); }
// SPDX-License-Identifier: GPL-2.0-only #include <linux/dim.h> #include "netlink.h" #include "common.h" struct coalesce_req_info { struct ethnl_req_info base; }; struct coalesce_reply_data { struct ethnl_reply_data base; struct ethtool_coalesce coalesce; struct kernel_ethtool_coalesce kernel_coalesce; u32 supported_params; }; #define COALESCE_REPDATA(__reply_base) \ container_of(__reply_base, struct coalesce_reply_data, base) #define __SUPPORTED_OFFSET ETHTOOL_A_COALESCE_RX_USECS static u32 attr_to_mask(unsigned int attr_type) { return BIT(attr_type - __SUPPORTED_OFFSET); } /* build time check that indices in ethtool_ops::supported_coalesce_params * match corresponding attribute types with an offset */ #define __CHECK_SUPPORTED_OFFSET(x) \ static_assert((ETHTOOL_ ## x) == \ BIT((ETHTOOL_A_ ## x) - __SUPPORTED_OFFSET)) __CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES);
__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_IRQ); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_IRQ); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_IRQ); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_IRQ); __CHECK_SUPPORTED_OFFSET(COALESCE_STATS_BLOCK_USECS); __CHECK_SUPPORTED_OFFSET(COALESCE_USE_ADAPTIVE_RX); __CHECK_SUPPORTED_OFFSET(COALESCE_USE_ADAPTIVE_TX); __CHECK_SUPPORTED_OFFSET(COALESCE_PKT_RATE_LOW); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_LOW); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_LOW); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_LOW); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_LOW); __CHECK_SUPPORTED_OFFSET(COALESCE_PKT_RATE_HIGH); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_HIGH); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_HIGH); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_HIGH); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_HIGH); __CHECK_SUPPORTED_OFFSET(COALESCE_RATE_SAMPLE_INTERVAL); const struct nla_policy ethnl_coalesce_get_policy[] = { [ETHTOOL_A_COALESCE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int coalesce_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; data->supported_params = dev->ethtool_ops->supported_coalesce_params; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = dev->ethtool_ops->get_coalesce(dev, &data->coalesce, &data->kernel_coalesce, info->extack); ethnl_ops_complete(dev); return ret; } static int coalesce_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { int modersz = nla_total_size(0) + /* _PROFILE_IRQ_MODERATION, nest */ nla_total_size(sizeof(u32)) + /* _IRQ_MODERATION_USEC */ nla_total_size(sizeof(u32)) + /* _IRQ_MODERATION_PKTS */ nla_total_size(sizeof(u32)); /* _IRQ_MODERATION_COMPS */ int total_modersz = nla_total_size(0) + /* _{R,T}X_PROFILE, nest */ modersz * NET_DIM_PARAMS_NUM_PROFILES; return nla_total_size(sizeof(u32)) + /* _RX_USECS */ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES */ nla_total_size(sizeof(u32)) + /* _RX_USECS_IRQ */ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_IRQ */ nla_total_size(sizeof(u32)) + /* _TX_USECS */ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES */ nla_total_size(sizeof(u32)) + /* _TX_USECS_IRQ */ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_IRQ */ nla_total_size(sizeof(u32)) + /* _STATS_BLOCK_USECS */ nla_total_size(sizeof(u8)) + /* _USE_ADAPTIVE_RX */ nla_total_size(sizeof(u8)) + /* _USE_ADAPTIVE_TX */ nla_total_size(sizeof(u32)) + /* _PKT_RATE_LOW */ nla_total_size(sizeof(u32)) + /* _RX_USECS_LOW */ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_LOW */ nla_total_size(sizeof(u32)) + /* _TX_USECS_LOW */ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_LOW */ nla_total_size(sizeof(u32)) + /* _PKT_RATE_HIGH */ nla_total_size(sizeof(u32)) + /* _RX_USECS_HIGH */ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_HIGH */ nla_total_size(sizeof(u32)) + /* _TX_USECS_HIGH */ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_HIGH */ nla_total_size(sizeof(u32)) + /* _RATE_SAMPLE_INTERVAL */ nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_TX */ nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_RX */ nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_BYTES */ nla_total_size(sizeof(u32)) + /* 
_TX_AGGR_MAX_FRAMES */ nla_total_size(sizeof(u32)) + /* _TX_AGGR_TIME_USECS */ total_modersz * 2; /* _{R,T}X_PROFILE */ } static bool coalesce_put_u32(struct sk_buff *skb, u16 attr_type, u32 val, u32 supported_params) { if (!val && !(supported_params & attr_to_mask(attr_type))) return false; return nla_put_u32(skb, attr_type, val); } static bool coalesce_put_bool(struct sk_buff *skb, u16 attr_type, u32 val, u32 supported_params) { if (!val && !(supported_params & attr_to_mask(attr_type))) return false; return nla_put_u8(skb, attr_type, !!val); } /** * coalesce_put_profile - fill reply with a nla nest with four child nla nests. * @skb: socket buffer the message is stored in * @attr_type: nest attr type ETHTOOL_A_COALESCE_*X_PROFILE * @profile: data passed to userspace * @coal_flags: modifiable parameters supported by the driver * * Put a dim profile nest attribute. Refer to ETHTOOL_A_PROFILE_IRQ_MODERATION. * * Return: 0 on success or a negative error code. */ static int coalesce_put_profile(struct sk_buff *skb, u16 attr_type, const struct dim_cq_moder *profile, u8 coal_flags) { struct nlattr *profile_attr, *moder_attr; int i, ret; if (!profile || !coal_flags) return 0; profile_attr = nla_nest_start(skb, attr_type); if (!profile_attr) return -EMSGSIZE; for (i = 0; i < NET_DIM_PARAMS_NUM_PROFILES; i++) { moder_attr = nla_nest_start(skb, ETHTOOL_A_PROFILE_IRQ_MODERATION); if (!moder_attr) { ret = -EMSGSIZE; goto cancel_profile; } if (coal_flags & DIM_COALESCE_USEC) { ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_USEC, profile[i].usec); if (ret) goto cancel_moder; } if (coal_flags & DIM_COALESCE_PKTS) { ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_PKTS, profile[i].pkts); if (ret) goto cancel_moder; } if (coal_flags & DIM_COALESCE_COMPS) { ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_COMPS, profile[i].comps); if (ret) goto cancel_moder; } nla_nest_end(skb, moder_attr); } nla_nest_end(skb, profile_attr); return 0; cancel_moder: nla_nest_cancel(skb, moder_attr); cancel_profile: nla_nest_cancel(skb, profile_attr); return ret; } static int coalesce_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base); const struct kernel_ethtool_coalesce *kcoal = &data->kernel_coalesce; const struct ethtool_coalesce *coal = &data->coalesce; u32 supported = data->supported_params; struct dim_irq_moder *moder; int ret = 0; if (coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS, coal->rx_coalesce_usecs, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES, coal->rx_max_coalesced_frames, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_IRQ, coal->rx_coalesce_usecs_irq, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ, coal->rx_max_coalesced_frames_irq, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS, coal->tx_coalesce_usecs, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES, coal->tx_max_coalesced_frames, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_IRQ, coal->tx_coalesce_usecs_irq, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ, coal->tx_max_coalesced_frames_irq, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_STATS_BLOCK_USECS, coal->stats_block_coalesce_usecs, supported) || coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX, coal->use_adaptive_rx_coalesce, supported) || coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX, 
coal->use_adaptive_tx_coalesce, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_PKT_RATE_LOW, coal->pkt_rate_low, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_LOW, coal->rx_coalesce_usecs_low, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW, coal->rx_max_coalesced_frames_low, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_LOW, coal->tx_coalesce_usecs_low, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW, coal->tx_max_coalesced_frames_low, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_PKT_RATE_HIGH, coal->pkt_rate_high, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_HIGH, coal->rx_coalesce_usecs_high, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH, coal->rx_max_coalesced_frames_high, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_HIGH, coal->tx_coalesce_usecs_high, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, coal->tx_max_coalesced_frames_high, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, coal->rate_sample_interval, supported) || coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, kcoal->use_cqe_mode_tx, supported) || coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, kcoal->use_cqe_mode_rx, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES, kcoal->tx_aggr_max_bytes, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, kcoal->tx_aggr_max_frames, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, kcoal->tx_aggr_time_usecs, supported)) return -EMSGSIZE; if (!req_base->dev || !req_base->dev->irq_moder) return 0; moder = req_base->dev->irq_moder; rcu_read_lock(); if (moder->profile_flags & DIM_PROFILE_RX) { ret = coalesce_put_profile(skb, ETHTOOL_A_COALESCE_RX_PROFILE, rcu_dereference(moder->rx_profile), moder->coal_flags); if (ret) goto out; } if (moder->profile_flags & DIM_PROFILE_TX) ret = coalesce_put_profile(skb, ETHTOOL_A_COALESCE_TX_PROFILE, rcu_dereference(moder->tx_profile), moder->coal_flags); out: rcu_read_unlock(); return ret; } /* COALESCE_SET */ static const struct nla_policy coalesce_irq_moderation_policy[] = { [ETHTOOL_A_IRQ_MODERATION_USEC] = { .type = NLA_U32 }, [ETHTOOL_A_IRQ_MODERATION_PKTS] = { .type = NLA_U32 }, [ETHTOOL_A_IRQ_MODERATION_COMPS] = { .type = NLA_U32 }, }; static const struct nla_policy coalesce_profile_policy[] = { [ETHTOOL_A_PROFILE_IRQ_MODERATION] = NLA_POLICY_NESTED(coalesce_irq_moderation_policy), }; const struct nla_policy ethnl_coalesce_set_policy[] = { [ETHTOOL_A_COALESCE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_COALESCE_RX_USECS] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_MAX_FRAMES] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_USECS_IRQ] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_USECS] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_MAX_FRAMES] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_USECS_IRQ] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_STATS_BLOCK_USECS] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX] = { .type = NLA_U8 }, [ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX] = { .type = NLA_U8 }, [ETHTOOL_A_COALESCE_PKT_RATE_LOW] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_USECS_LOW] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW] = { .type = NLA_U32 }, 
[ETHTOOL_A_COALESCE_TX_USECS_LOW] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_PKT_RATE_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_USECS_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_USECS_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_USE_CQE_MODE_TX] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_COALESCE_USE_CQE_MODE_RX] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_PROFILE] = NLA_POLICY_NESTED(coalesce_profile_policy), [ETHTOOL_A_COALESCE_TX_PROFILE] = NLA_POLICY_NESTED(coalesce_profile_policy), }; static int ethnl_set_coalesce_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; struct dim_irq_moder *irq_moder = req_info->dev->irq_moder; struct nlattr **tb = info->attrs; u32 supported_params; u16 a; if (!ops->get_coalesce || !ops->set_coalesce) return -EOPNOTSUPP; /* make sure that only supported parameters are present */ supported_params = ops->supported_coalesce_params; if (irq_moder && irq_moder->profile_flags & DIM_PROFILE_RX) supported_params |= ETHTOOL_COALESCE_RX_PROFILE; if (irq_moder && irq_moder->profile_flags & DIM_PROFILE_TX) supported_params |= ETHTOOL_COALESCE_TX_PROFILE; for (a = ETHTOOL_A_COALESCE_RX_USECS; a < __ETHTOOL_A_COALESCE_CNT; a++) if (tb[a] && !(supported_params & attr_to_mask(a))) { NL_SET_ERR_MSG_ATTR(info->extack, tb[a], "cannot modify an unsupported parameter"); return -EINVAL; } return 1; } /** * ethnl_update_irq_moder - update a specific field in the given profile * @irq_moder: place that collects dim related information * @irq_field: field in profile to modify * @attr_type: attr type ETHTOOL_A_IRQ_MODERATION_* * @tb: netlink attribute with new values or null * @coal_bit: DIM_COALESCE_* bit from coal_flags * @mod: pointer to bool for modification tracking * @extack: netlink extended ack * * Return: 0 on success or a negative error code. */ static int ethnl_update_irq_moder(struct dim_irq_moder *irq_moder, u16 *irq_field, u16 attr_type, struct nlattr **tb, u8 coal_bit, bool *mod, struct netlink_ext_ack *extack) { int ret = 0; u32 val; if (!tb[attr_type]) return 0; if (irq_moder->coal_flags & coal_bit) { val = nla_get_u32(tb[attr_type]); if (*irq_field == val) return 0; *irq_field = val; *mod = true; } else { NL_SET_BAD_ATTR(extack, tb[attr_type]); ret = -EOPNOTSUPP; } return ret; } /** * ethnl_update_profile - get a profile nest with child nests from userspace. * @dev: netdevice to update the profile * @dst: profile get from the driver and modified by ethnl_update_profile. * @nests: nest attr ETHTOOL_A_COALESCE_*X_PROFILE to set profile. * @mod: pointer to bool for modification tracking * @extack: Netlink extended ack * * Layout of nests: * Nested ETHTOOL_A_COALESCE_*X_PROFILE attr * Nested ETHTOOL_A_PROFILE_IRQ_MODERATION attr * ETHTOOL_A_IRQ_MODERATION_USEC attr * ETHTOOL_A_IRQ_MODERATION_PKTS attr * ETHTOOL_A_IRQ_MODERATION_COMPS attr * ... 
* Nested ETHTOOL_A_PROFILE_IRQ_MODERATION attr * ETHTOOL_A_IRQ_MODERATION_USEC attr * ETHTOOL_A_IRQ_MODERATION_PKTS attr * ETHTOOL_A_IRQ_MODERATION_COMPS attr * * Return: 0 on success or a negative error code. */ static int ethnl_update_profile(struct net_device *dev, struct dim_cq_moder __rcu **dst, const struct nlattr *nests, bool *mod, struct netlink_ext_ack *extack) { int len_irq_moder = ARRAY_SIZE(coalesce_irq_moderation_policy); struct nlattr *tb[ARRAY_SIZE(coalesce_irq_moderation_policy)]; struct dim_irq_moder *irq_moder = dev->irq_moder; struct dim_cq_moder *new_profile, *old_profile; int ret, rem, i = 0, len; struct nlattr *nest; if (!nests) return 0; if (!*dst) return -EOPNOTSUPP; old_profile = rtnl_dereference(*dst); len = NET_DIM_PARAMS_NUM_PROFILES * sizeof(*old_profile); new_profile = kmemdup(old_profile, len, GFP_KERNEL); if (!new_profile) return -ENOMEM; nla_for_each_nested_type(nest, ETHTOOL_A_PROFILE_IRQ_MODERATION, nests, rem) { ret = nla_parse_nested(tb, len_irq_moder - 1, nest, coalesce_irq_moderation_policy, extack); if (ret) goto err_out; ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].usec, ETHTOOL_A_IRQ_MODERATION_USEC, tb, DIM_COALESCE_USEC, mod, extack); if (ret) goto err_out; ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].pkts, ETHTOOL_A_IRQ_MODERATION_PKTS, tb, DIM_COALESCE_PKTS, mod, extack); if (ret) goto err_out; ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].comps, ETHTOOL_A_IRQ_MODERATION_COMPS, tb, DIM_COALESCE_COMPS, mod, extack); if (ret) goto err_out; i++; } /* After the profile is modified, dim itself is a dynamic * mechanism and will quickly fit to the appropriate * coalescing parameters according to the new profile. */ rcu_assign_pointer(*dst, new_profile); kfree_rcu(old_profile, rcu); return 0; err_out: kfree(new_profile); return ret; } static int __ethnl_set_coalesce(struct ethnl_req_info *req_info, struct genl_info *info, bool *dual_change) { struct kernel_ethtool_coalesce kernel_coalesce = {}; struct net_device *dev = req_info->dev; struct ethtool_coalesce coalesce = {}; bool mod_mode = false, mod = false; struct nlattr **tb = info->attrs; int ret; ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce, info->extack); if (ret < 0) return ret; /* Update values */ ethnl_update_u32(&coalesce.rx_coalesce_usecs, tb[ETHTOOL_A_COALESCE_RX_USECS], &mod); ethnl_update_u32(&coalesce.rx_max_coalesced_frames, tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES], &mod); ethnl_update_u32(&coalesce.rx_coalesce_usecs_irq, tb[ETHTOOL_A_COALESCE_RX_USECS_IRQ], &mod); ethnl_update_u32(&coalesce.rx_max_coalesced_frames_irq, tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ], &mod); ethnl_update_u32(&coalesce.tx_coalesce_usecs, tb[ETHTOOL_A_COALESCE_TX_USECS], &mod); ethnl_update_u32(&coalesce.tx_max_coalesced_frames, tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES], &mod); ethnl_update_u32(&coalesce.tx_coalesce_usecs_irq, tb[ETHTOOL_A_COALESCE_TX_USECS_IRQ], &mod); ethnl_update_u32(&coalesce.tx_max_coalesced_frames_irq, tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ], &mod); ethnl_update_u32(&coalesce.stats_block_coalesce_usecs, tb[ETHTOOL_A_COALESCE_STATS_BLOCK_USECS], &mod); ethnl_update_u32(&coalesce.pkt_rate_low, tb[ETHTOOL_A_COALESCE_PKT_RATE_LOW], &mod); ethnl_update_u32(&coalesce.rx_coalesce_usecs_low, tb[ETHTOOL_A_COALESCE_RX_USECS_LOW], &mod); ethnl_update_u32(&coalesce.rx_max_coalesced_frames_low, tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW], &mod); ethnl_update_u32(&coalesce.tx_coalesce_usecs_low, tb[ETHTOOL_A_COALESCE_TX_USECS_LOW], &mod); 
ethnl_update_u32(&coalesce.tx_max_coalesced_frames_low, tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW], &mod); ethnl_update_u32(&coalesce.pkt_rate_high, tb[ETHTOOL_A_COALESCE_PKT_RATE_HIGH], &mod); ethnl_update_u32(&coalesce.rx_coalesce_usecs_high, tb[ETHTOOL_A_COALESCE_RX_USECS_HIGH], &mod); ethnl_update_u32(&coalesce.rx_max_coalesced_frames_high, tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH], &mod); ethnl_update_u32(&coalesce.tx_coalesce_usecs_high, tb[ETHTOOL_A_COALESCE_TX_USECS_HIGH], &mod); ethnl_update_u32(&coalesce.tx_max_coalesced_frames_high, tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH], &mod); ethnl_update_u32(&coalesce.rate_sample_interval, tb[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL], &mod); ethnl_update_u32(&kernel_coalesce.tx_aggr_max_bytes, tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES], &mod); ethnl_update_u32(&kernel_coalesce.tx_aggr_max_frames, tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES], &mod); ethnl_update_u32(&kernel_coalesce.tx_aggr_time_usecs, tb[ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS], &mod); if (dev->irq_moder && dev->irq_moder->profile_flags & DIM_PROFILE_RX) { ret = ethnl_update_profile(dev, &dev->irq_moder->rx_profile, tb[ETHTOOL_A_COALESCE_RX_PROFILE], &mod, info->extack); if (ret < 0) return ret; } if (dev->irq_moder && dev->irq_moder->profile_flags & DIM_PROFILE_TX) { ret = ethnl_update_profile(dev, &dev->irq_moder->tx_profile, tb[ETHTOOL_A_COALESCE_TX_PROFILE], &mod, info->extack); if (ret < 0) return ret; } /* Update operation modes */ ethnl_update_bool32(&coalesce.use_adaptive_rx_coalesce, tb[ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX], &mod_mode); ethnl_update_bool32(&coalesce.use_adaptive_tx_coalesce, tb[ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX], &mod_mode); ethnl_update_u8(&kernel_coalesce.use_cqe_mode_tx, tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_TX], &mod_mode); ethnl_update_u8(&kernel_coalesce.use_cqe_mode_rx, tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_RX], &mod_mode); *dual_change = mod && mod_mode; if (!mod && !mod_mode) return 0; ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce, info->extack); return ret < 0 ? ret : 1; } static int ethnl_set_coalesce(struct ethnl_req_info *req_info, struct genl_info *info) { bool dual_change; int err, ret; /* SET_COALESCE may change operation mode and parameters in one call. * Changing operation mode may cause the driver to reset the parameter * values, and therefore ignore user input (driver does not know which * parameters come from user and which are echoed back from ->get). * To not complicate the drivers if user tries to change both the mode * and parameters at once - call the driver twice. */ err = __ethnl_set_coalesce(req_info, info, &dual_change); if (err < 0) return err; ret = err; if (ret && dual_change) { err = __ethnl_set_coalesce(req_info, info, &dual_change); if (err < 0) return err; } return ret; } const struct ethnl_request_ops ethnl_coalesce_request_ops = { .request_cmd = ETHTOOL_MSG_COALESCE_GET, .reply_cmd = ETHTOOL_MSG_COALESCE_GET_REPLY, .hdr_attr = ETHTOOL_A_COALESCE_HEADER, .req_info_size = sizeof(struct coalesce_req_info), .reply_data_size = sizeof(struct coalesce_reply_data), .prepare_data = coalesce_prepare_data, .reply_size = coalesce_reply_size, .fill_reply = coalesce_fill_reply, .set_validate = ethnl_set_coalesce_validate, .set = ethnl_set_coalesce, .set_ntf_cmd = ETHTOOL_MSG_COALESCE_NTF, };
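The ETHTOOL_A_COALESCE_* attribute space and the ETHTOOL_COALESCE_* capability bits line up with a fixed offset, which is what attr_to_mask() and the __CHECK_SUPPORTED_OFFSET() build-time asserts near the top of this file rely on. As a hedged illustration (hypothetical driver fragment, not taken from any real driver), a driver opts into parameters via ethtool_ops::supported_coalesce_params:

/* Hypothetical driver fragment: advertise which coalescing knobs exist.
 * Anything not set here is rejected by ethnl_set_coalesce_validate() and
 * suppressed (when zero) by coalesce_put_u32()/coalesce_put_bool() on GET.
 */
static const struct ethtool_ops example_ethtool_ops = {
	.supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS |
				     ETHTOOL_COALESCE_TX_USECS |
				     ETHTOOL_COALESCE_USE_ADAPTIVE_RX,
	/* .get_coalesce and .set_coalesce would be provided by the driver */
};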
/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2018 Facebook */ #ifndef _LINUX_BTF_H #define _LINUX_BTF_H 1 #include <linux/types.h> #include <linux/bpfptr.h> #include <linux/bsearch.h> #include <linux/btf_ids.h> #include <uapi/linux/btf.h> #include <uapi/linux/bpf.h> #define BTF_TYPE_EMIT(type) ((void)(type *)0) #define BTF_TYPE_EMIT_ENUM(enum_val) ((void)enum_val) /* These need to be macros, as the expressions are used in assembler input */ #define KF_ACQUIRE (1 << 0) /* kfunc is an acquire function */ #define KF_RELEASE (1 << 1) /* kfunc is a release function */ #define KF_RET_NULL (1 << 2) /* kfunc returns a pointer that may be NULL */ /* Trusted arguments are those which are guaranteed to be valid when passed to * the kfunc.
It is used to enforce that pointers obtained from either acquire * kfuncs, or from the main kernel on a tracepoint or struct_ops callback * invocation, remain unmodified when being passed to helpers taking trusted * args. * * Consider, for example, the following new task tracepoint: * * SEC("tp_btf/task_newtask") * int BPF_PROG(new_task_tp, struct task_struct *task, u64 clone_flags) * { * ... * } * * And the following kfunc: * * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) * * All invocations to the kfunc must pass the unmodified, unwalked task: * * bpf_task_acquire(task); // Allowed * bpf_task_acquire(task->last_wakee); // Rejected, walked task * * Programs may also pass referenced tasks directly to the kfunc: * * struct task_struct *acquired; * * acquired = bpf_task_acquire(task); // Allowed, same as above * bpf_task_acquire(acquired); // Allowed * bpf_task_acquire(task); // Allowed * bpf_task_acquire(acquired->last_wakee); // Rejected, walked task * * Programs may _not_, however, pass a task from an arbitrary fentry/fexit, or * kprobe/kretprobe to the kfunc, as BPF cannot guarantee that all of these * pointers are guaranteed to be safe. For example, the following BPF program * would be rejected: * * SEC("kretprobe/free_task") * int BPF_PROG(free_task_probe, struct task_struct *tsk) * { * struct task_struct *acquired; * * acquired = bpf_task_acquire(acquired); // Rejected, not a trusted pointer * bpf_task_release(acquired); * * return 0; * } */ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ #define KF_RCU (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */ /* only one of KF_ITER_{NEW,NEXT,DESTROY} could be specified per kfunc */ #define KF_ITER_NEW (1 << 8) /* kfunc implements BPF iter constructor */ #define KF_ITER_NEXT (1 << 9) /* kfunc implements BPF iter next method */ #define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */ #define KF_RCU_PROTECTED (1 << 11) /* kfunc should be protected by rcu cs when they are invoked */ #define KF_FASTCALL (1 << 12) /* kfunc supports bpf_fastcall protocol */ #define KF_ARENA_RET (1 << 13) /* kfunc returns an arena pointer */ #define KF_ARENA_ARG1 (1 << 14) /* kfunc takes an arena pointer as its first argument */ #define KF_ARENA_ARG2 (1 << 15) /* kfunc takes an arena pointer as its second argument */ /* * Tag marking a kernel function as a kfunc. This is meant to minimize the * amount of copy-paste that kfunc authors have to include for correctness so * as to avoid issues such as the compiler inlining or eliding either a static * kfunc, or a global kfunc in an LTO build. */ #define __bpf_kfunc __used __retain __noclone noinline #define __bpf_kfunc_start_defs() \ __diag_push(); \ __diag_ignore_all("-Wmissing-declarations", \ "Global kfuncs as their definitions will be in BTF");\ __diag_ignore_all("-Wmissing-prototypes", \ "Global kfuncs as their definitions will be in BTF") #define __bpf_kfunc_end_defs() __diag_pop() #define __bpf_hook_start() __bpf_kfunc_start_defs() #define __bpf_hook_end() __bpf_kfunc_end_defs() /* * Return the name of the passed struct, if exists, or halt the build if for * example the structure gets renamed. In this way, developers have to revisit * the code using that structure name, and update it accordingly. 
*/ #define stringify_struct(x) \ ({ BUILD_BUG_ON(sizeof(struct x) < 0); \ __stringify(x); }) struct btf; struct btf_member; struct btf_type; union bpf_attr; struct btf_show; struct btf_id_set; struct bpf_prog; typedef int (*btf_kfunc_filter_t)(const struct bpf_prog *prog, u32 kfunc_id); struct btf_kfunc_id_set { struct module *owner; struct btf_id_set8 *set; btf_kfunc_filter_t filter; }; struct btf_id_dtor_kfunc { u32 btf_id; u32 kfunc_btf_id; }; struct btf_struct_meta { u32 btf_id; struct btf_record *record; }; struct btf_struct_metas { u32 cnt; struct btf_struct_meta types[]; }; extern const struct file_operations btf_fops; const char *btf_get_name(const struct btf *btf); void btf_get(struct btf *btf); void btf_put(struct btf *btf); const struct btf_header *btf_header(const struct btf *btf); int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_sz); struct btf *btf_get_by_fd(int fd); int btf_get_info_by_fd(const struct btf *btf, const union bpf_attr *attr, union bpf_attr __user *uattr); /* Figure out the size of a type_id. If type_id is a modifier * (e.g. const), it will be resolved to find out the type with size. * * For example: * In describing "const void *", type_id is "const" and "const" * refers to "void *". The return type will be "void *". * * If type_id is a simple "int", then return type will be "int". * * @btf: struct btf object * @type_id: Find out the size of type_id. The type_id of the return * type is set to *type_id. * @ret_size: It can be NULL. If not NULL, the size of the return * type is set to *ret_size. * Return: The btf_type (resolved to another type with size info if needed). * NULL is returned if type_id itself does not have size info * (e.g. void) or it cannot be resolved to another type that * has size info. * *type_id and *ret_size will not be changed in the * NULL return case. */ const struct btf_type *btf_type_id_size(const struct btf *btf, u32 *type_id, u32 *ret_size); /* * Options to control show behaviour. * - BTF_SHOW_COMPACT: no formatting around type information * - BTF_SHOW_NONAME: no struct/union member names/types * - BTF_SHOW_PTR_RAW: show raw (unobfuscated) pointer values; * equivalent to %px. * - BTF_SHOW_ZERO: show zero-valued struct/union members; they * are not displayed by default * - BTF_SHOW_UNSAFE: skip use of bpf_probe_read() to safely read * data before displaying it. */ #define BTF_SHOW_COMPACT BTF_F_COMPACT #define BTF_SHOW_NONAME BTF_F_NONAME #define BTF_SHOW_PTR_RAW BTF_F_PTR_RAW #define BTF_SHOW_ZERO BTF_F_ZERO #define BTF_SHOW_UNSAFE (1ULL << 4) void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m, u64 flags); /* * Copy len bytes of string representation of obj of BTF type_id into buf. * * @btf: struct btf object * @type_id: type id of type obj points to * @obj: pointer to typed data * @buf: buffer to write to * @len: maximum length to write to buf * @flags: show options (see above) * * Return: length that would have been/was copied as per snprintf, or * negative error. 
*/ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, char *buf, int len, u64 flags); int btf_get_fd_by_id(u32 id); u32 btf_obj_id(const struct btf *btf); bool btf_is_kernel(const struct btf *btf); bool btf_is_module(const struct btf *btf); bool btf_is_vmlinux(const struct btf *btf); struct module *btf_try_get_module(const struct btf *btf); u32 btf_nr_types(const struct btf *btf); struct btf *btf_base_btf(const struct btf *btf); bool btf_type_is_i32(const struct btf_type *t); bool btf_type_is_i64(const struct btf_type *t); bool btf_type_is_primitive(const struct btf_type *t); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size); int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, u32 id, u32 *res_id); const struct btf_type *btf_type_resolve_ptr(const struct btf *btf, u32 id, u32 *res_id); const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, u32 id, u32 *res_id); const struct btf_type * btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size); const char *btf_type_str(const struct btf_type *t); #define for_each_member(i, struct_type, member) \ for (i = 0, member = btf_type_member(struct_type); \ i < btf_type_vlen(struct_type); \ i++, member++) #define for_each_vsi(i, datasec_type, member) \ for (i = 0, member = btf_type_var_secinfo(datasec_type); \ i < btf_type_vlen(datasec_type); \ i++, member++) static inline bool btf_type_is_ptr(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; } static inline bool btf_type_is_int(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_INT; } static inline bool btf_type_is_small_int(const struct btf_type *t) { return btf_type_is_int(t) && t->size <= sizeof(u64); } static inline u8 btf_int_encoding(const struct btf_type *t) { return BTF_INT_ENCODING(*(u32 *)(t + 1)); } static inline bool btf_type_is_signed_int(const struct btf_type *t) { return btf_type_is_int(t) && (btf_int_encoding(t) & BTF_INT_SIGNED); } static inline bool btf_type_is_enum(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM; } static inline bool btf_is_any_enum(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM || BTF_INFO_KIND(t->info) == BTF_KIND_ENUM64; } static inline bool btf_kind_core_compat(const struct btf_type *t1, const struct btf_type *t2) { return BTF_INFO_KIND(t1->info) == BTF_INFO_KIND(t2->info) || (btf_is_any_enum(t1) && btf_is_any_enum(t2)); } static inline bool str_is_empty(const char *s) { return !s || !s[0]; } static inline u16 btf_kind(const struct btf_type *t) { return BTF_INFO_KIND(t->info); } static inline bool btf_is_enum(const struct btf_type *t) { return btf_kind(t) == BTF_KIND_ENUM; } static inline bool btf_is_enum64(const struct btf_type *t) { return btf_kind(t) == BTF_KIND_ENUM64; } static inline u64 btf_enum64_value(const struct btf_enum64 *e) { return ((u64)e->val_hi32 << 32) | e->val_lo32; } static inline bool btf_is_composite(const struct btf_type *t) { u16 kind = btf_kind(t); return kind == BTF_KIND_STRUCT 
|| kind == BTF_KIND_UNION; } static inline bool btf_is_array(const struct btf_type *t) { return btf_kind(t) == BTF_KIND_ARRAY; } static inline bool btf_is_int(const struct btf_type *t) { return btf_kind(t) == BTF_KIND_INT; } static inline bool btf_is_ptr(const struct btf_type *t) { return btf_kind(t) == BTF_KIND_PTR; } static inline u8 btf_int_offset(const struct btf_type *t) { return BTF_INT_OFFSET(*(u32 *)(t + 1)); } static inline __u8 btf_int_bits(const struct btf_type *t) { return BTF_INT_BITS(*(__u32 *)(t + 1)); } static inline bool btf_type_is_scalar(const struct btf_type *t) { return btf_type_is_int(t) || btf_type_is_enum(t); } static inline bool btf_type_is_fwd(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; } static inline bool btf_type_is_typedef(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF; } static inline bool btf_type_is_volatile(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_VOLATILE; } static inline bool btf_type_is_func(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC; } static inline bool btf_type_is_func_proto(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; } static inline bool btf_type_is_var(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; } static inline bool btf_type_is_type_tag(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG; } /* union is only a special case of struct: * all its offsetof(member) == 0 */ static inline bool btf_type_is_struct(const struct btf_type *t) { u8 kind = BTF_INFO_KIND(t->info); return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; } static inline bool __btf_type_is_struct(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; } static inline bool btf_type_is_array(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; } static inline u16 btf_type_vlen(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); } static inline u16 btf_vlen(const struct btf_type *t) { return btf_type_vlen(t); } static inline u16 btf_func_linkage(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); } static inline bool btf_type_kflag(const struct btf_type *t) { return BTF_INFO_KFLAG(t->info); } static inline u32 __btf_member_bit_offset(const struct btf_type *struct_type, const struct btf_member *member) { return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset) : member->offset; } static inline u32 __btf_member_bitfield_size(const struct btf_type *struct_type, const struct btf_member *member) { return btf_type_kflag(struct_type) ? 
BTF_MEMBER_BITFIELD_SIZE(member->offset) : 0; } static inline struct btf_member *btf_members(const struct btf_type *t) { return (struct btf_member *)(t + 1); } static inline u32 btf_member_bit_offset(const struct btf_type *t, u32 member_idx) { const struct btf_member *m = btf_members(t) + member_idx; return __btf_member_bit_offset(t, m); } static inline u32 btf_member_bitfield_size(const struct btf_type *t, u32 member_idx) { const struct btf_member *m = btf_members(t) + member_idx; return __btf_member_bitfield_size(t, m); } static inline const struct btf_member *btf_type_member(const struct btf_type *t) { return (const struct btf_member *)(t + 1); } static inline struct btf_array *btf_array(const struct btf_type *t) { return (struct btf_array *)(t + 1); } static inline struct btf_enum *btf_enum(const struct btf_type *t) { return (struct btf_enum *)(t + 1); } static inline struct btf_enum64 *btf_enum64(const struct btf_type *t) { return (struct btf_enum64 *)(t + 1); } static inline const struct btf_var_secinfo *btf_type_var_secinfo( const struct btf_type *t) { return (const struct btf_var_secinfo *)(t + 1); } static inline struct btf_param *btf_params(const struct btf_type *t) { return (struct btf_param *)(t + 1); } static inline struct btf_decl_tag *btf_decl_tag(const struct btf_type *t) { return (struct btf_decl_tag *)(t + 1); } static inline int btf_id_cmp_func(const void *a, const void *b) { const int *pa = a, *pb = b; return *pa - *pb; } static inline bool btf_id_set_contains(const struct btf_id_set *set, u32 id) { return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; } static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) { return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func); } bool btf_param_match_suffix(const struct btf *btf, const struct btf_param *arg, const char *suffix); int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto, u32 arg_no); u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, int off); struct bpf_verifier_log; #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) struct bpf_struct_ops; int __register_bpf_struct_ops(struct bpf_struct_ops *st_ops); const struct bpf_struct_ops_desc *bpf_struct_ops_find_value(struct btf *btf, u32 value_id); const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id); #else static inline const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id) { return NULL; } #endif enum btf_field_iter_kind { BTF_FIELD_ITER_IDS, BTF_FIELD_ITER_STRS, }; struct btf_field_desc { /* once-per-type offsets */ int t_off_cnt, t_offs[2]; /* member struct size, or zero, if no members */ int m_sz; /* repeated per-member offsets */ int m_off_cnt, m_offs[1]; }; struct btf_field_iter { struct btf_field_desc desc; void *p; int m_idx; int off_idx; int vlen; }; #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); void btf_set_base_btf(struct btf *btf, const struct btf *base_btf); int btf_relocate(struct btf *btf, const struct btf *base_btf, __u32 **map_ids); int btf_field_iter_init(struct btf_field_iter *it, struct btf_type *t, enum btf_field_iter_kind iter_kind); __u32 *btf_field_iter_next(struct btf_field_iter *it); const char *btf_name_by_offset(const struct btf *btf, u32 offset); const char *btf_str_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); struct btf *bpf_prog_get_target_btf(const struct bpf_prog 
*prog); u32 *btf_kfunc_id_set_contains(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog); u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog); int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, const struct btf_kfunc_id_set *s); int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset); s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id); bool btf_is_projection_of(const char *pname, const char *tname); bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg); int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type); bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2); int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx); static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) { if (!btf_type_is_ptr(t)) return false; t = btf_type_skip_modifiers(btf, t->type, NULL); return btf_type_is_struct(t); } #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) { return NULL; } static inline void btf_set_base_btf(struct btf *btf, const struct btf *base_btf) { } static inline int btf_relocate(void *log, struct btf *btf, const struct btf *base_btf, __u32 **map_ids) { return -EOPNOTSUPP; } static inline int btf_field_iter_init(struct btf_field_iter *it, struct btf_type *t, enum btf_field_iter_kind iter_kind) { return -EOPNOTSUPP; } static inline __u32 *btf_field_iter_next(struct btf_field_iter *it) { return NULL; } static inline const char *btf_name_by_offset(const struct btf *btf, u32 offset) { return NULL; } static inline u32 *btf_kfunc_id_set_contains(const struct btf *btf, u32 kfunc_btf_id, struct bpf_prog *prog) { return NULL; } static inline int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, const struct btf_kfunc_id_set *s) { return 0; } static inline s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) { return -ENOENT; } static inline int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner) { return 0; } static inline struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id) { return NULL; } static inline bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg) { return false; } static inline int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type) { return -EINVAL; } static inline bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2) { return false; } static inline int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx) { return -EOPNOTSUPP; } #endif #endif
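The accessors above make it straightforward to walk BTF types. A minimal sketch follows (hypothetical helper, kernel context with CONFIG_BPF_SYSCALL; assumes <linux/printk.h> for pr_info()):

/* Hypothetical helper: print every member of a BTF struct type. */
static void example_dump_struct_members(const struct btf *btf, u32 type_id)
{
	const struct btf_type *t = btf_type_by_id(btf, type_id);
	const struct btf_member *member;
	u16 i;

	if (!t || !__btf_type_is_struct(t))
		return;

	for_each_member(i, t, member) {
		const char *name = btf_name_by_offset(btf, member->name_off);

		pr_info("member %u: %s at bit offset %u\n", i,
			name && name[0] ? name : "<anon>",
			__btf_member_bit_offset(t, member));
	}
}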
// SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_fifo.c The simplest FIFO queue. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> /* 1 band FIFO pseudo-"scheduler" */ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); return qdisc_drop(skb, sch, to_free); } static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { if (likely(sch->q.qlen < READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); return qdisc_drop(skb, sch, to_free); } static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { unsigned int prev_backlog; if (unlikely(READ_ONCE(sch->limit) == 0)) return qdisc_drop(skb, sch, to_free); if (likely(sch->q.qlen < READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); prev_backlog = sch->qstats.backlog; /* queue full, remove one skb to fulfill the limit */ __qdisc_queue_drop_head(sch, &sch->q, to_free); qdisc_qstats_drop(sch); qdisc_enqueue_tail(skb, sch); qdisc_tree_reduce_backlog(sch, 0, prev_backlog - sch->qstats.backlog); return NET_XMIT_CN; } static void fifo_offload_init(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct tc_fifo_qopt_offload qopt; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; qopt.command = TC_FIFO_REPLACE; qopt.handle = sch->handle; qopt.parent = sch->parent; dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_FIFO, &qopt); } static void fifo_offload_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct tc_fifo_qopt_offload qopt; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; qopt.command = TC_FIFO_DESTROY; qopt.handle = sch->handle; qopt.parent = sch->parent; dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_FIFO, &qopt); } static int fifo_offload_dump(struct Qdisc *sch) { struct tc_fifo_qopt_offload qopt; qopt.command = TC_FIFO_STATS; qopt.handle = sch->handle; qopt.parent = sch->parent; qopt.stats.bstats = &sch->bstats; qopt.stats.qstats = &sch->qstats; return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_FIFO, &qopt); } static int __fifo_init(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack) { bool bypass; bool is_bfifo = sch->ops == &bfifo_qdisc_ops; if (opt == NULL) { u32 limit = qdisc_dev(sch)->tx_queue_len; if (is_bfifo) limit *= psched_mtu(qdisc_dev(sch)); WRITE_ONCE(sch->limit, limit); } else { struct tc_fifo_qopt *ctl = nla_data(opt); if (nla_len(opt) < sizeof(*ctl)) return -EINVAL; WRITE_ONCE(sch->limit, ctl->limit); } if (is_bfifo) bypass = sch->limit >= psched_mtu(qdisc_dev(sch)); else bypass = sch->limit >= 1; if (bypass) sch->flags |= TCQ_F_CAN_BYPASS; else sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } static int fifo_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { int err; err = __fifo_init(sch, opt, extack); if (err) return err; fifo_offload_init(sch); return 0; } static int fifo_hd_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { return __fifo_init(sch, opt, extack); } static void fifo_destroy(struct Qdisc *sch) { fifo_offload_destroy(sch); } static int __fifo_dump(struct Qdisc *sch, struct sk_buff *skb) { struct tc_fifo_qopt opt = { .limit = READ_ONCE(sch->limit) }; if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; return skb->len; nla_put_failure: return -1; } static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) { int err; err = fifo_offload_dump(sch); if (err) return err; return __fifo_dump(sch, skb); } static int fifo_hd_dump(struct Qdisc *sch, struct sk_buff *skb) { return __fifo_dump(sch, skb); } struct Qdisc_ops pfifo_qdisc_ops __read_mostly = { .id = "pfifo", .priv_size = 0, .enqueue = pfifo_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, .init = fifo_init, .destroy = fifo_destroy, .reset = qdisc_reset_queue, .change = fifo_init, .dump = fifo_dump, .owner = THIS_MODULE, }; EXPORT_SYMBOL(pfifo_qdisc_ops); struct Qdisc_ops bfifo_qdisc_ops __read_mostly = { .id = "bfifo", .priv_size = 0, .enqueue = bfifo_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, .init = fifo_init, .destroy = fifo_destroy, .reset = qdisc_reset_queue, .change = fifo_init, .dump = fifo_dump, .owner = THIS_MODULE, }; EXPORT_SYMBOL(bfifo_qdisc_ops); struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = { .id = "pfifo_head_drop", .priv_size = 0, .enqueue = pfifo_tail_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, .init = fifo_hd_init, .reset = qdisc_reset_queue, .change = fifo_hd_init, .dump = fifo_hd_dump, .owner = THIS_MODULE, }; /* Pass size change message down to embedded FIFO */ int fifo_set_limit(struct Qdisc *q, unsigned int limit) { struct nlattr *nla; int ret = -ENOMEM; /* Hack to avoid sending change message to non-FIFO */ if (strncmp(q->ops->id + 1, "fifo", 4) != 0) return 0; if (!q->ops->change) return 0; nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); if (nla) { nla->nla_type = RTM_NEWQDISC; nla->nla_len = nla_attr_size(sizeof(struct tc_fifo_qopt)); ((struct tc_fifo_qopt *)nla_data(nla))->limit = limit; ret = q->ops->change(q, nla, NULL); kfree(nla); } return ret; } EXPORT_SYMBOL(fifo_set_limit); struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, unsigned int limit, struct netlink_ext_ack *extack) { struct Qdisc *q; int err = -ENOMEM; q = qdisc_create_dflt(sch->dev_queue, ops, TC_H_MAKE(sch->handle, 1), extack); if (q) { err = fifo_set_limit(q, limit); if (err < 0) { qdisc_put(q); q = NULL; } } return q ? : ERR_PTR(err); } EXPORT_SYMBOL(fifo_create_dflt); MODULE_DESCRIPTION("Single queue packet and byte based First In First Out(P/BFIFO) scheduler");
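fifo_create_dflt() and fifo_set_limit() are meant for other qdiscs that embed a FIFO as their inner queue (TBF does this, for example). A hedged sketch of such a caller (hypothetical fragment, error handling trimmed):

/* Hypothetical parent-qdisc fragment: attach a default bfifo child. */
static int example_attach_inner_fifo(struct Qdisc *sch, unsigned int limit,
				     struct netlink_ext_ack *extack)
{
	struct Qdisc *child;

	child = fifo_create_dflt(sch, &bfifo_qdisc_ops, limit, extack);
	if (IS_ERR(child))
		return PTR_ERR(child);

	/* A real qdisc would keep 'child' in its private data and later call
	 * fifo_set_limit(child, new_limit) on a limit change; this sketch just
	 * releases it again to stay self-contained.
	 */
	qdisc_put(child);
	return 0;
}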
5 5 5 105 105 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 // SPDX-License-Identifier: GPL-2.0-only /* * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin * cleaned up code to current version of sparse and added the slicing-by-8 * algorithm to the closely similar existing slicing-by-4 algorithm. * * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com> * Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks! * Code was from the public domain, copyright abandoned. Code was * subsequently included in the kernel, thus was re-licensed under the * GNU GPL v2. * * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com> * Same crc32 function was used in 5 other places in the kernel. * I made one version, and deleted the others. * There are various incantations of crc32(). Some use a seed of 0 or ~0. * Some xor at the end with ~0. The generic crc32() function takes * seed as an argument, and doesn't xor at the end. Then individual * users can do whatever they need. * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0. * fs/jffs2 uses seed 0, doesn't xor with ~0. * fs/partitions/efi.c uses seed ~0, xor's with ~0. */ /* see: Documentation/staging/crc32.rst for a description of algorithms */ #include <linux/crc32.h> #include <linux/export.h> #include <linux/module.h> #include <linux/types.h> #include "crc32table.h" static inline u32 __maybe_unused crc32_le_base(u32 crc, const u8 *p, size_t len) { while (len--) crc = (crc >> 8) ^ crc32table_le[(crc & 255) ^ *p++]; return crc; } static inline u32 __maybe_unused crc32_be_base(u32 crc, const u8 *p, size_t len) { while (len--) crc = (crc << 8) ^ crc32table_be[(crc >> 24) ^ *p++]; return crc; } static inline u32 __maybe_unused crc32c_base(u32 crc, const u8 *p, size_t len) { while (len--) crc = (crc >> 8) ^ crc32ctable_le[(crc & 255) ^ *p++]; return crc; } #ifdef CONFIG_CRC32_ARCH #include "crc32.h" /* $(SRCARCH)/crc32.h */ u32 crc32_optimizations(void) { return crc32_optimizations_arch(); } EXPORT_SYMBOL(crc32_optimizations); #else #define crc32_le_arch crc32_le_base #define crc32_be_arch crc32_be_base #define crc32c_arch crc32c_base #endif u32 crc32_le(u32 crc, const void *p, size_t len) { return crc32_le_arch(crc, p, len); } EXPORT_SYMBOL(crc32_le); u32 crc32_be(u32 crc, const void *p, size_t len) { return crc32_be_arch(crc, p, len); } EXPORT_SYMBOL(crc32_be); u32 crc32c(u32 crc, const void *p, size_t len) { return crc32c_arch(crc, p, len); } EXPORT_SYMBOL(crc32c); #ifdef crc32_mod_init_arch static int __init crc32_mod_init(void) { crc32_mod_init_arch(); return 0; } subsys_initcall(crc32_mod_init); static void __exit crc32_mod_exit(void) { } module_exit(crc32_mod_exit); #endif MODULE_DESCRIPTION("CRC32 library functions"); MODULE_LICENSE("GPL");
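/*
 * Hedged illustration (not part of lib/crc32.c): the header comment above
 * notes that crc32_le() neither fixes the seed nor applies a final XOR, so
 * each caller picks its own convention.  The helper below is a hypothetical
 * example of the common "standard" CRC-32 convention (seed ~0, final XOR
 * with ~0), i.e. what fs/partitions/efi.c effectively computes.
 */
static inline u32 example_crc32_ieee(const void *buf, size_t len)
{
	/* Seed with all ones, then invert the result. */
	return ~crc32_le(~0, buf, len);
}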
26 42 13 27 2 4 10 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 // SPDX-License-Identifier: GPL-2.0-or-later /* * IP Payload Compression Protocol (IPComp) - RFC3173. * * Copyright (c) 2003 James Morris <jmorris@intercode.com.au> * * Todo: * - Tunable compression parameters. * - Compression stats. * - Adaptive compression. */ #include <linux/module.h> #include <linux/err.h> #include <linux/rtnetlink.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/icmp.h> #include <net/ipcomp.h> #include <net/protocol.h> #include <net/sock.h> static int ipcomp4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); __be32 spi; const struct iphdr *iph = (const struct iphdr *)skb->data; struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; break; case ICMP_REDIRECT: break; default: return 0; } spi = htonl(ntohs(ipch->cpi)); x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET); if (!x) return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, IPPROTO_COMP); else ipv4_redirect(skb, net, 0, IPPROTO_COMP); xfrm_state_put(x); return 0; } /* We always hold one tunnel user reference to indicate a tunnel */ static struct lock_class_key xfrm_state_lock_key; static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) { struct net *net = xs_net(x); struct xfrm_state *t; t = xfrm_state_alloc(net); if (!t) goto out; lockdep_set_class(&t->lock, &xfrm_state_lock_key); t->id.proto = IPPROTO_IPIP; t->id.spi = x->props.saddr.a4; t->id.daddr.a4 = x->id.daddr.a4; memcpy(&t->sel, &x->sel, sizeof(t->sel)); t->props.family = AF_INET; t->props.mode = x->props.mode; t->props.saddr.a4 = x->props.saddr.a4; t->props.flags = x->props.flags; t->props.extra_flags = x->props.extra_flags; memcpy(&t->mark, &x->mark, sizeof(t->mark)); t->if_id = x->if_id; if (xfrm_init_state(t)) goto error; atomic_set(&t->tunnel_users, 1); out: return t; error: t->km.state = XFRM_STATE_DEAD; xfrm_state_put(t); t = NULL; goto out; } /* * Must be protected by xfrm_cfg_mutex. State and tunnel user references are * always incremented on success. 
*/ static int ipcomp_tunnel_attach(struct xfrm_state *x) { struct net *net = xs_net(x); int err = 0; struct xfrm_state *t; u32 mark = x->mark.v & x->mark.m; t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr.a4, x->props.saddr.a4, IPPROTO_IPIP, AF_INET); if (!t) { t = ipcomp_tunnel_create(x); if (!t) { err = -EINVAL; goto out; } xfrm_state_insert(t); xfrm_state_hold(t); } x->tunnel = t; atomic_inc(&t->tunnel_users); out: return err; } static int ipcomp4_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { int err = -EINVAL; x->props.header_len = 0; switch (x->props.mode) { case XFRM_MODE_TRANSPORT: break; case XFRM_MODE_TUNNEL: x->props.header_len += sizeof(struct iphdr); break; default: NL_SET_ERR_MSG(extack, "Unsupported XFRM mode for IPcomp"); goto out; } err = ipcomp_init_state(x, extack); if (err) goto out; if (x->props.mode == XFRM_MODE_TUNNEL) { err = ipcomp_tunnel_attach(x); if (err) { NL_SET_ERR_MSG(extack, "Kernel error: failed to initialize the associated state"); goto out; } } err = 0; out: return err; } static int ipcomp4_rcv_cb(struct sk_buff *skb, int err) { return 0; } static const struct xfrm_type ipcomp_type = { .owner = THIS_MODULE, .proto = IPPROTO_COMP, .init_state = ipcomp4_init_state, .destructor = ipcomp_destroy, .input = ipcomp_input, .output = ipcomp_output }; static struct xfrm4_protocol ipcomp4_protocol = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = ipcomp4_rcv_cb, .err_handler = ipcomp4_err, .priority = 0, }; static int __init ipcomp4_init(void) { if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) { pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ipcomp_type, AF_INET); return -EAGAIN; } return 0; } static void __exit ipcomp4_fini(void) { if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0) pr_info("%s: can't remove protocol\n", __func__); xfrm_unregister_type(&ipcomp_type, AF_INET); } module_init(ipcomp4_init); module_exit(ipcomp4_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp/IPv4) - RFC3173"); MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_COMP);
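/*
 * Hedged illustration (not part of net/ipv4/ipcomp.c): IPComp identifies a
 * state by a 16-bit CPI rather than a 32-bit SPI, so lookups widen the CPI
 * into SPI space exactly as ipcomp4_err() does above.  The helper name is
 * hypothetical; xfrm_state_lookup() is the real lookup used by that code.
 */
static struct xfrm_state *example_ipcomp_lookup(struct net *net, u32 mark,
						const struct iphdr *iph,
						__be16 cpi)
{
	/* Widen the 16-bit CPI to a 32-bit SPI in network byte order. */
	__be32 spi = htonl(ntohs(cpi));

	return xfrm_state_lookup(net, mark,
				 (const xfrm_address_t *)&iph->daddr,
				 spi, IPPROTO_COMP, AF_INET);
}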
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
 * (C) 2012 by Vyatta Inc.
<http://www.vyatta.com> */ #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/rculist.h> #include <linux/rculist_nulls.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/security.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/netlink.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/netfilter.h> #include <net/netlink.h> #include <net/netns/generic.h> #include <net/sock.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static unsigned int nfct_timeout_id __read_mostly; struct ctnl_timeout { struct list_head head; struct list_head free_head; struct rcu_head rcu_head; refcount_t refcnt; char name[CTNL_TIMEOUT_NAME_MAX]; /* must be at the end */ struct nf_ct_timeout timeout; }; struct nfct_timeout_pernet { struct list_head nfct_timeout_list; struct list_head nfct_timeout_freelist; }; MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_DESCRIPTION("cttimeout: Extended Netfilter Connection Tracking timeout tuning"); static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = { [CTA_TIMEOUT_NAME] = { .type = NLA_NUL_STRING, .len = CTNL_TIMEOUT_NAME_MAX - 1}, [CTA_TIMEOUT_L3PROTO] = { .type = NLA_U16 }, [CTA_TIMEOUT_L4PROTO] = { .type = NLA_U8 }, [CTA_TIMEOUT_DATA] = { .type = NLA_NESTED }, }; static struct nfct_timeout_pernet *nfct_timeout_pernet(struct net *net) { return net_generic(net, nfct_timeout_id); } static int ctnl_timeout_parse_policy(void *timeout, const struct nf_conntrack_l4proto *l4proto, struct net *net, const struct nlattr *attr) { struct nlattr **tb; int ret = 0; tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), GFP_KERNEL); if (!tb) return -ENOMEM; ret = nla_parse_nested_deprecated(tb, l4proto->ctnl_timeout.nlattr_max, attr, l4proto->ctnl_timeout.nla_policy, NULL); if (ret < 0) goto err; ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeout); err: kfree(tb); return ret; } static int cttimeout_new_timeout(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); __u16 l3num; __u8 l4num; const struct nf_conntrack_l4proto *l4proto; struct ctnl_timeout *timeout, *matching = NULL; char *name; int ret; if (!cda[CTA_TIMEOUT_NAME] || !cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO] || !cda[CTA_TIMEOUT_DATA]) return -EINVAL; name = nla_data(cda[CTA_TIMEOUT_NAME]); l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); list_for_each_entry(timeout, &pernet->nfct_timeout_list, head) { if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; if (info->nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; matching = timeout; break; } if (matching) { if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { /* You cannot replace one timeout policy by another of * different kind, sorry. 
*/ if (matching->timeout.l3num != l3num || matching->timeout.l4proto->l4proto != l4num) return -EINVAL; return ctnl_timeout_parse_policy(&matching->timeout.data, matching->timeout.l4proto, info->net, cda[CTA_TIMEOUT_DATA]); } return -EBUSY; } l4proto = nf_ct_l4proto_find(l4num); /* This protocol is not supportted, skip. */ if (l4proto->l4proto != l4num) { ret = -EOPNOTSUPP; goto err_proto_put; } timeout = kzalloc(sizeof(struct ctnl_timeout) + l4proto->ctnl_timeout.obj_size, GFP_KERNEL); if (timeout == NULL) { ret = -ENOMEM; goto err_proto_put; } ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto, info->net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; strcpy(timeout->name, nla_data(cda[CTA_TIMEOUT_NAME])); timeout->timeout.l3num = l3num; timeout->timeout.l4proto = l4proto; refcount_set(&timeout->refcnt, 1); __module_get(THIS_MODULE); list_add_tail_rcu(&timeout->head, &pernet->nfct_timeout_list); return 0; err: kfree(timeout); err_proto_put: return ret; } static int ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct ctnl_timeout *timeout) { struct nlmsghdr *nlh; unsigned int flags = portid ? NLM_F_MULTI : 0; const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto; struct nlattr *nest_parms; int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) || nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->timeout.l3num)) || nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto) || nla_put_be32(skb, CTA_TIMEOUT_USE, htonl(refcount_read(&timeout->refcnt)))) goto nla_put_failure; nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA); if (!nest_parms) goto nla_put_failure; ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->timeout.data); if (ret < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } static int ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct nfct_timeout_pernet *pernet; struct net *net = sock_net(skb->sk); struct ctnl_timeout *cur, *last; if (cb->args[2]) return 0; last = (struct ctnl_timeout *)cb->args[1]; if (cb->args[1]) cb->args[1] = 0; rcu_read_lock(); pernet = nfct_timeout_pernet(net); list_for_each_entry_rcu(cur, &pernet->nfct_timeout_list, head) { if (last) { if (cur != last) continue; last = NULL; } if (ctnl_timeout_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_NEW, cur) < 0) { cb->args[1] = (unsigned long)cur; break; } } if (!cb->args[1]) cb->args[2] = 1; rcu_read_unlock(); return skb->len; } static int cttimeout_get_timeout(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); int ret = -ENOENT; char *name; struct ctnl_timeout *cur; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnl_timeout_dump, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } if (!cda[CTA_TIMEOUT_NAME]) return -EINVAL; name = nla_data(cda[CTA_TIMEOUT_NAME]); list_for_each_entry(cur, &pernet->nfct_timeout_list, head) { struct sk_buff *skb2; if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) { ret = -ENOMEM; 
break; } ret = ctnl_timeout_fill_info(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_NEW, cur); if (ret <= 0) { kfree_skb(skb2); break; } ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); break; } return ret; } /* try to delete object, fail if it is still in use. */ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) { int ret = 0; /* We want to avoid races with ctnl_timeout_put. So only when the * current refcnt is 1, we decrease it to 0. */ if (refcount_dec_if_one(&timeout->refcnt)) { /* We are protected by nfnl mutex. */ list_del_rcu(&timeout->head); nf_ct_untimeout(net, &timeout->timeout); kfree_rcu(timeout, rcu_head); } else { ret = -EBUSY; } return ret; } static int cttimeout_del_timeout(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); struct ctnl_timeout *cur, *tmp; int ret = -ENOENT; char *name; if (!cda[CTA_TIMEOUT_NAME]) { list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) ctnl_timeout_try_del(info->net, cur); return 0; } name = nla_data(cda[CTA_TIMEOUT_NAME]); list_for_each_entry(cur, &pernet->nfct_timeout_list, head) { if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; ret = ctnl_timeout_try_del(info->net, cur); if (ret < 0) return ret; break; } return ret; } static int cttimeout_default_set(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { const struct nf_conntrack_l4proto *l4proto; __u8 l4num; int ret; if (!cda[CTA_TIMEOUT_L4PROTO] || !cda[CTA_TIMEOUT_DATA]) return -EINVAL; l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); l4proto = nf_ct_l4proto_find(l4num); /* This protocol is not supported, skip. */ if (l4proto->l4proto != l4num) { ret = -EOPNOTSUPP; goto err; } ret = ctnl_timeout_parse_policy(NULL, l4proto, info->net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; return 0; err: return ret; } static int cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, u16 l3num, const struct nf_conntrack_l4proto *l4proto, const unsigned int *timeouts) { struct nlmsghdr *nlh; unsigned int flags = portid ? 
NLM_F_MULTI : 0; struct nlattr *nest_parms; int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l3num)) || nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto)) goto nla_put_failure; nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA); if (!nest_parms) goto nla_put_failure; ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts); if (ret < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } static int cttimeout_default_get(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { const struct nf_conntrack_l4proto *l4proto; unsigned int *timeouts = NULL; struct sk_buff *skb2; __u16 l3num; __u8 l4num; int ret; if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO]) return -EINVAL; l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); l4proto = nf_ct_l4proto_find(l4num); if (l4proto->l4proto != l4num) return -EOPNOTSUPP; switch (l4proto->l4proto) { case IPPROTO_ICMP: timeouts = &nf_icmp_pernet(info->net)->timeout; break; case IPPROTO_TCP: timeouts = nf_tcp_pernet(info->net)->timeouts; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: timeouts = nf_udp_pernet(info->net)->timeouts; break; case IPPROTO_ICMPV6: timeouts = &nf_icmpv6_pernet(info->net)->timeout; break; case IPPROTO_SCTP: #ifdef CONFIG_NF_CT_PROTO_SCTP timeouts = nf_sctp_pernet(info->net)->timeouts; #endif break; case IPPROTO_GRE: #ifdef CONFIG_NF_CT_PROTO_GRE timeouts = nf_gre_pernet(info->net)->timeouts; #endif break; case 255: timeouts = &nf_generic_pernet(info->net)->timeout; break; default: WARN_ONCE(1, "Missing timeouts for proto %d", l4proto->l4proto); break; } if (!timeouts) return -EOPNOTSUPP; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; ret = cttimeout_default_fill_info(info->net, skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_DEFAULT_SET, l3num, l4proto, timeouts); if (ret <= 0) { kfree_skb(skb2); return -ENOMEM; } return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static struct nf_ct_timeout *ctnl_timeout_find_get(struct net *net, const char *name) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); struct ctnl_timeout *timeout, *matching = NULL; list_for_each_entry_rcu(timeout, &pernet->nfct_timeout_list, head) { if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; if (!refcount_inc_not_zero(&timeout->refcnt)) goto err; matching = timeout; break; } err: return matching ? 
&matching->timeout : NULL; } static void ctnl_timeout_put(struct nf_ct_timeout *t) { struct ctnl_timeout *timeout = container_of(t, struct ctnl_timeout, timeout); if (refcount_dec_and_test(&timeout->refcnt)) { kfree_rcu(timeout, rcu_head); module_put(THIS_MODULE); } } static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = { [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, [IPCTNL_MSG_TIMEOUT_GET] = { .call = cttimeout_get_timeout, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, [IPCTNL_MSG_TIMEOUT_DELETE] = { .call = cttimeout_del_timeout, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, [IPCTNL_MSG_TIMEOUT_DEFAULT_SET] = { .call = cttimeout_default_set, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, [IPCTNL_MSG_TIMEOUT_DEFAULT_GET] = { .call = cttimeout_default_get, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, }; static const struct nfnetlink_subsystem cttimeout_subsys = { .name = "conntrack_timeout", .subsys_id = NFNL_SUBSYS_CTNETLINK_TIMEOUT, .cb_count = IPCTNL_MSG_TIMEOUT_MAX, .cb = cttimeout_cb, }; MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_TIMEOUT); static int __net_init cttimeout_net_init(struct net *net) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); INIT_LIST_HEAD(&pernet->nfct_timeout_list); INIT_LIST_HEAD(&pernet->nfct_timeout_freelist); return 0; } static void __net_exit cttimeout_net_pre_exit(struct net *net) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); struct ctnl_timeout *cur, *tmp; list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) { list_del_rcu(&cur->head); list_add(&cur->free_head, &pernet->nfct_timeout_freelist); } /* core calls synchronize_rcu() after this */ } static void __net_exit cttimeout_net_exit(struct net *net) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); struct ctnl_timeout *cur, *tmp; if (list_empty(&pernet->nfct_timeout_freelist)) return; nf_ct_untimeout(net, NULL); list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, free_head) { list_del(&cur->free_head); if (refcount_dec_and_test(&cur->refcnt)) kfree_rcu(cur, rcu_head); } } static struct pernet_operations cttimeout_ops = { .init = cttimeout_net_init, .pre_exit = cttimeout_net_pre_exit, .exit = cttimeout_net_exit, .id = &nfct_timeout_id, .size = sizeof(struct nfct_timeout_pernet), }; static const struct nf_ct_timeout_hooks hooks = { .timeout_find_get = ctnl_timeout_find_get, .timeout_put = ctnl_timeout_put, }; static int __init cttimeout_init(void) { int ret; ret = register_pernet_subsys(&cttimeout_ops); if (ret < 0) return ret; ret = nfnetlink_subsys_register(&cttimeout_subsys); if (ret < 0) { pr_err("cttimeout_init: cannot register cttimeout with " "nfnetlink.\n"); goto err_out; } RCU_INIT_POINTER(nf_ct_timeout_hook, &hooks); return 0; err_out: unregister_pernet_subsys(&cttimeout_ops); return ret; } static int untimeout(struct nf_conn *ct, void *timeout) { struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct); if (timeout_ext) RCU_INIT_POINTER(timeout_ext->timeout, NULL); return 0; } static void __exit cttimeout_exit(void) { nfnetlink_subsys_unregister(&cttimeout_subsys); unregister_pernet_subsys(&cttimeout_ops); RCU_INIT_POINTER(nf_ct_timeout_hook, NULL); nf_ct_iterate_destroy(untimeout, NULL); } module_init(cttimeout_init); 
module_exit(cttimeout_exit);
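/*
 * Hedged illustration (not part of nfnetlink_cttimeout.c): the deletion path
 * above relies on refcount_dec_if_one() so that a timeout object can only be
 * removed while no other user holds a reference taken via
 * ctnl_timeout_find_get().  The skeleton below is a hypothetical restatement
 * of that pattern for a generic refcounted, RCU-freed list entry; it assumes
 * the same list/refcount/RCU headers already included above.
 */
struct example_obj {
	struct list_head head;
	struct rcu_head rcu_head;
	refcount_t refcnt;
};

static int example_try_del(struct example_obj *obj)
{
	/* Succeeds only when the list reference is the last one left. */
	if (!refcount_dec_if_one(&obj->refcnt))
		return -EBUSY;

	list_del_rcu(&obj->head);
	kfree_rcu(obj, rcu_head);
	return 0;
}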
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock LSM - Ruleset management
 *
 * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
 * Copyright © 2018-2020 ANSSI
 */

#include <linux/bits.h>
#include <linux/bug.h>
#include <linux/cleanup.h>
#include <linux/compiler_types.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/overflow.h>
#include <linux/rbtree.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

#include "access.h"
#include "audit.h"
#include "domain.h"
#include "limits.h"
#include "object.h"
#include "ruleset.h"

static struct landlock_ruleset
*create_ruleset(const u32 num_layers) { struct landlock_ruleset *new_ruleset; new_ruleset = kzalloc(struct_size(new_ruleset, access_masks, num_layers), GFP_KERNEL_ACCOUNT); if (!new_ruleset) return ERR_PTR(-ENOMEM); refcount_set(&new_ruleset->usage, 1); mutex_init(&new_ruleset->lock); new_ruleset->root_inode = RB_ROOT; #if IS_ENABLED(CONFIG_INET) new_ruleset->root_net_port = RB_ROOT; #endif /* IS_ENABLED(CONFIG_INET) */ new_ruleset->num_layers = num_layers; /* * hierarchy = NULL * num_rules = 0 * access_masks[] = 0 */ return new_ruleset; } struct landlock_ruleset * landlock_create_ruleset(const access_mask_t fs_access_mask, const access_mask_t net_access_mask, const access_mask_t scope_mask) { struct landlock_ruleset *new_ruleset; /* Informs about useless ruleset. */ if (!fs_access_mask && !net_access_mask && !scope_mask) return ERR_PTR(-ENOMSG); new_ruleset = create_ruleset(1); if (IS_ERR(new_ruleset)) return new_ruleset; if (fs_access_mask) landlock_add_fs_access_mask(new_ruleset, fs_access_mask, 0); if (net_access_mask) landlock_add_net_access_mask(new_ruleset, net_access_mask, 0); if (scope_mask) landlock_add_scope_mask(new_ruleset, scope_mask, 0); return new_ruleset; } static void build_check_rule(void) { const struct landlock_rule rule = { .num_layers = ~0, }; BUILD_BUG_ON(rule.num_layers < LANDLOCK_MAX_NUM_LAYERS); } static bool is_object_pointer(const enum landlock_key_type key_type) { switch (key_type) { case LANDLOCK_KEY_INODE: return true; #if IS_ENABLED(CONFIG_INET) case LANDLOCK_KEY_NET_PORT: return false; #endif /* IS_ENABLED(CONFIG_INET) */ default: WARN_ON_ONCE(1); return false; } } static struct landlock_rule * create_rule(const struct landlock_id id, const struct landlock_layer (*const layers)[], const u32 num_layers, const struct landlock_layer *const new_layer) { struct landlock_rule *new_rule; u32 new_num_layers; build_check_rule(); if (new_layer) { /* Should already be checked by landlock_merge_ruleset(). */ if (WARN_ON_ONCE(num_layers >= LANDLOCK_MAX_NUM_LAYERS)) return ERR_PTR(-E2BIG); new_num_layers = num_layers + 1; } else { new_num_layers = num_layers; } new_rule = kzalloc(struct_size(new_rule, layers, new_num_layers), GFP_KERNEL_ACCOUNT); if (!new_rule) return ERR_PTR(-ENOMEM); RB_CLEAR_NODE(&new_rule->node); if (is_object_pointer(id.type)) { /* This should have been caught by insert_rule(). */ WARN_ON_ONCE(!id.key.object); landlock_get_object(id.key.object); } new_rule->key = id.key; new_rule->num_layers = new_num_layers; /* Copies the original layer stack. */ memcpy(new_rule->layers, layers, flex_array_size(new_rule, layers, num_layers)); if (new_layer) /* Adds a copy of @new_layer on the layer stack. 
*/ new_rule->layers[new_rule->num_layers - 1] = *new_layer; return new_rule; } static struct rb_root *get_root(struct landlock_ruleset *const ruleset, const enum landlock_key_type key_type) { switch (key_type) { case LANDLOCK_KEY_INODE: return &ruleset->root_inode; #if IS_ENABLED(CONFIG_INET) case LANDLOCK_KEY_NET_PORT: return &ruleset->root_net_port; #endif /* IS_ENABLED(CONFIG_INET) */ default: WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } } static void free_rule(struct landlock_rule *const rule, const enum landlock_key_type key_type) { might_sleep(); if (!rule) return; if (is_object_pointer(key_type)) landlock_put_object(rule->key.object); kfree(rule); } static void build_check_ruleset(void) { const struct landlock_ruleset ruleset = { .num_rules = ~0, .num_layers = ~0, }; BUILD_BUG_ON(ruleset.num_rules < LANDLOCK_MAX_NUM_RULES); BUILD_BUG_ON(ruleset.num_layers < LANDLOCK_MAX_NUM_LAYERS); } /** * insert_rule - Create and insert a rule in a ruleset * * @ruleset: The ruleset to be updated. * @id: The ID to build the new rule with. The underlying kernel object, if * any, must be held by the caller. * @layers: One or multiple layers to be copied into the new rule. * @num_layers: The number of @layers entries. * * When user space requests to add a new rule to a ruleset, @layers only * contains one entry and this entry is not assigned to any level. In this * case, the new rule will extend @ruleset, similarly to a boolean OR between * access rights. * * When merging a ruleset in a domain, or copying a domain, @layers will be * added to @ruleset as new constraints, similarly to a boolean AND between * access rights. */ static int insert_rule(struct landlock_ruleset *const ruleset, const struct landlock_id id, const struct landlock_layer (*const layers)[], const size_t num_layers) { struct rb_node **walker_node; struct rb_node *parent_node = NULL; struct landlock_rule *new_rule; struct rb_root *root; might_sleep(); lockdep_assert_held(&ruleset->lock); if (WARN_ON_ONCE(!layers)) return -ENOENT; if (is_object_pointer(id.type) && WARN_ON_ONCE(!id.key.object)) return -ENOENT; root = get_root(ruleset, id.type); if (IS_ERR(root)) return PTR_ERR(root); walker_node = &root->rb_node; while (*walker_node) { struct landlock_rule *const this = rb_entry(*walker_node, struct landlock_rule, node); if (this->key.data != id.key.data) { parent_node = *walker_node; if (this->key.data < id.key.data) walker_node = &((*walker_node)->rb_right); else walker_node = &((*walker_node)->rb_left); continue; } /* Only a single-level layer should match an existing rule. */ if (WARN_ON_ONCE(num_layers != 1)) return -EINVAL; /* If there is a matching rule, updates it. */ if ((*layers)[0].level == 0) { /* * Extends access rights when the request comes from * landlock_add_rule(2), i.e. @ruleset is not a domain. */ if (WARN_ON_ONCE(this->num_layers != 1)) return -EINVAL; if (WARN_ON_ONCE(this->layers[0].level != 0)) return -EINVAL; this->layers[0].access |= (*layers)[0].access; return 0; } if (WARN_ON_ONCE(this->layers[0].level == 0)) return -EINVAL; /* * Intersects access rights when it is a merge between a * ruleset and a domain. */ new_rule = create_rule(id, &this->layers, this->num_layers, &(*layers)[0]); if (IS_ERR(new_rule)) return PTR_ERR(new_rule); rb_replace_node(&this->node, &new_rule->node, root); free_rule(this, id.type); return 0; } /* There is no match for @id. 
*/ build_check_ruleset(); if (ruleset->num_rules >= LANDLOCK_MAX_NUM_RULES) return -E2BIG; new_rule = create_rule(id, layers, num_layers, NULL); if (IS_ERR(new_rule)) return PTR_ERR(new_rule); rb_link_node(&new_rule->node, parent_node, walker_node); rb_insert_color(&new_rule->node, root); ruleset->num_rules++; return 0; } static void build_check_layer(void) { const struct landlock_layer layer = { .level = ~0, .access = ~0, }; BUILD_BUG_ON(layer.level < LANDLOCK_MAX_NUM_LAYERS); BUILD_BUG_ON(layer.access < LANDLOCK_MASK_ACCESS_FS); } /* @ruleset must be locked by the caller. */ int landlock_insert_rule(struct landlock_ruleset *const ruleset, const struct landlock_id id, const access_mask_t access) { struct landlock_layer layers[] = { { .access = access, /* When @level is zero, insert_rule() extends @ruleset. */ .level = 0, } }; build_check_layer(); return insert_rule(ruleset, id, &layers, ARRAY_SIZE(layers)); } static int merge_tree(struct landlock_ruleset *const dst, struct landlock_ruleset *const src, const enum landlock_key_type key_type) { struct landlock_rule *walker_rule, *next_rule; struct rb_root *src_root; int err = 0; might_sleep(); lockdep_assert_held(&dst->lock); lockdep_assert_held(&src->lock); src_root = get_root(src, key_type); if (IS_ERR(src_root)) return PTR_ERR(src_root); /* Merges the @src tree. */ rbtree_postorder_for_each_entry_safe(walker_rule, next_rule, src_root, node) { struct landlock_layer layers[] = { { .level = dst->num_layers, } }; const struct landlock_id id = { .key = walker_rule->key, .type = key_type, }; if (WARN_ON_ONCE(walker_rule->num_layers != 1)) return -EINVAL; if (WARN_ON_ONCE(walker_rule->layers[0].level != 0)) return -EINVAL; layers[0].access = walker_rule->layers[0].access; err = insert_rule(dst, id, &layers, ARRAY_SIZE(layers)); if (err) return err; } return err; } static int merge_ruleset(struct landlock_ruleset *const dst, struct landlock_ruleset *const src) { int err = 0; might_sleep(); /* Should already be checked by landlock_merge_ruleset() */ if (WARN_ON_ONCE(!src)) return 0; /* Only merge into a domain. */ if (WARN_ON_ONCE(!dst || !dst->hierarchy)) return -EINVAL; /* Locks @dst first because we are its only owner. */ mutex_lock(&dst->lock); mutex_lock_nested(&src->lock, SINGLE_DEPTH_NESTING); /* Stacks the new layer. */ if (WARN_ON_ONCE(src->num_layers != 1 || dst->num_layers < 1)) { err = -EINVAL; goto out_unlock; } dst->access_masks[dst->num_layers - 1] = landlock_upgrade_handled_access_masks(src->access_masks[0]); /* Merges the @src inode tree. */ err = merge_tree(dst, src, LANDLOCK_KEY_INODE); if (err) goto out_unlock; #if IS_ENABLED(CONFIG_INET) /* Merges the @src network port tree. */ err = merge_tree(dst, src, LANDLOCK_KEY_NET_PORT); if (err) goto out_unlock; #endif /* IS_ENABLED(CONFIG_INET) */ out_unlock: mutex_unlock(&src->lock); mutex_unlock(&dst->lock); return err; } static int inherit_tree(struct landlock_ruleset *const parent, struct landlock_ruleset *const child, const enum landlock_key_type key_type) { struct landlock_rule *walker_rule, *next_rule; struct rb_root *parent_root; int err = 0; might_sleep(); lockdep_assert_held(&parent->lock); lockdep_assert_held(&child->lock); parent_root = get_root(parent, key_type); if (IS_ERR(parent_root)) return PTR_ERR(parent_root); /* Copies the @parent inode or network tree. 
*/ rbtree_postorder_for_each_entry_safe(walker_rule, next_rule, parent_root, node) { const struct landlock_id id = { .key = walker_rule->key, .type = key_type, }; err = insert_rule(child, id, &walker_rule->layers, walker_rule->num_layers); if (err) return err; } return err; } static int inherit_ruleset(struct landlock_ruleset *const parent, struct landlock_ruleset *const child) { int err = 0; might_sleep(); if (!parent) return 0; /* Locks @child first because we are its only owner. */ mutex_lock(&child->lock); mutex_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); /* Copies the @parent inode tree. */ err = inherit_tree(parent, child, LANDLOCK_KEY_INODE); if (err) goto out_unlock; #if IS_ENABLED(CONFIG_INET) /* Copies the @parent network port tree. */ err = inherit_tree(parent, child, LANDLOCK_KEY_NET_PORT); if (err) goto out_unlock; #endif /* IS_ENABLED(CONFIG_INET) */ if (WARN_ON_ONCE(child->num_layers <= parent->num_layers)) { err = -EINVAL; goto out_unlock; } /* Copies the parent layer stack and leaves a space for the new layer. */ memcpy(child->access_masks, parent->access_masks, flex_array_size(parent, access_masks, parent->num_layers)); if (WARN_ON_ONCE(!parent->hierarchy)) { err = -EINVAL; goto out_unlock; } landlock_get_hierarchy(parent->hierarchy); child->hierarchy->parent = parent->hierarchy; out_unlock: mutex_unlock(&parent->lock); mutex_unlock(&child->lock); return err; } static void free_ruleset(struct landlock_ruleset *const ruleset) { struct landlock_rule *freeme, *next; might_sleep(); rbtree_postorder_for_each_entry_safe(freeme, next, &ruleset->root_inode, node) free_rule(freeme, LANDLOCK_KEY_INODE); #if IS_ENABLED(CONFIG_INET) rbtree_postorder_for_each_entry_safe(freeme, next, &ruleset->root_net_port, node) free_rule(freeme, LANDLOCK_KEY_NET_PORT); #endif /* IS_ENABLED(CONFIG_INET) */ landlock_put_hierarchy(ruleset->hierarchy); kfree(ruleset); } void landlock_put_ruleset(struct landlock_ruleset *const ruleset) { might_sleep(); if (ruleset && refcount_dec_and_test(&ruleset->usage)) free_ruleset(ruleset); } static void free_ruleset_work(struct work_struct *const work) { struct landlock_ruleset *ruleset; ruleset = container_of(work, struct landlock_ruleset, work_free); free_ruleset(ruleset); } /* Only called by hook_cred_free(). */ void landlock_put_ruleset_deferred(struct landlock_ruleset *const ruleset) { if (ruleset && refcount_dec_and_test(&ruleset->usage)) { INIT_WORK(&ruleset->work_free, free_ruleset_work); schedule_work(&ruleset->work_free); } } /** * landlock_merge_ruleset - Merge a ruleset with a domain * * @parent: Parent domain. * @ruleset: New ruleset to be merged. * * The current task is requesting to be restricted. The subjective credentials * must not be in an overridden state. cf. landlock_init_hierarchy_log(). * * Returns the intersection of @parent and @ruleset, or returns @parent if * @ruleset is empty, or returns a duplicate of @ruleset if @parent is empty. */ struct landlock_ruleset * landlock_merge_ruleset(struct landlock_ruleset *const parent, struct landlock_ruleset *const ruleset) { struct landlock_ruleset *new_dom __free(landlock_put_ruleset) = NULL; u32 num_layers; int err; might_sleep(); if (WARN_ON_ONCE(!ruleset || parent == ruleset)) return ERR_PTR(-EINVAL); if (parent) { if (parent->num_layers >= LANDLOCK_MAX_NUM_LAYERS) return ERR_PTR(-E2BIG); num_layers = parent->num_layers + 1; } else { num_layers = 1; } /* Creates a new domain... 
*/ new_dom = create_ruleset(num_layers); if (IS_ERR(new_dom)) return new_dom; new_dom->hierarchy = kzalloc(sizeof(*new_dom->hierarchy), GFP_KERNEL_ACCOUNT); if (!new_dom->hierarchy) return ERR_PTR(-ENOMEM); refcount_set(&new_dom->hierarchy->usage, 1); /* ...as a child of @parent... */ err = inherit_ruleset(parent, new_dom); if (err) return ERR_PTR(err); /* ...and including @ruleset. */ err = merge_ruleset(new_dom, ruleset); if (err) return ERR_PTR(err); err = landlock_init_hierarchy_log(new_dom->hierarchy); if (err) return ERR_PTR(err); return no_free_ptr(new_dom); } /* * The returned access has the same lifetime as @ruleset. */ const struct landlock_rule * landlock_find_rule(const struct landlock_ruleset *const ruleset, const struct landlock_id id) { const struct rb_root *root; const struct rb_node *node; root = get_root((struct landlock_ruleset *)ruleset, id.type); if (IS_ERR(root)) return NULL; node = root->rb_node; while (node) { struct landlock_rule *this = rb_entry(node, struct landlock_rule, node); if (this->key.data == id.key.data) return this; if (this->key.data < id.key.data) node = node->rb_right; else node = node->rb_left; } return NULL; } /* * @layer_masks is read and may be updated according to the access request and * the matching rule. * @masks_array_size must be equal to ARRAY_SIZE(*layer_masks). * * Returns true if the request is allowed (i.e. relevant layer masks for the * request are empty). */ bool landlock_unmask_layers(const struct landlock_rule *const rule, const access_mask_t access_request, layer_mask_t (*const layer_masks)[], const size_t masks_array_size) { size_t layer_level; if (!access_request || !layer_masks) return true; if (!rule) return false; /* * An access is granted if, for each policy layer, at least one rule * encountered on the pathwalk grants the requested access, * regardless of its position in the layer stack. We must then check * the remaining layers for each inode, from the first added layer to * the last one. When there is multiple requested accesses, for each * policy layer, the full set of requested accesses may not be granted * by only one rule, but by the union (binary OR) of multiple rules. * E.g. /a/b <execute> + /a <read> => /a/b <execute + read> */ for (layer_level = 0; layer_level < rule->num_layers; layer_level++) { const struct landlock_layer *const layer = &rule->layers[layer_level]; const layer_mask_t layer_bit = BIT_ULL(layer->level - 1); const unsigned long access_req = access_request; unsigned long access_bit; bool is_empty; /* * Records in @layer_masks which layer grants access to each * requested access. */ is_empty = true; for_each_set_bit(access_bit, &access_req, masks_array_size) { if (layer->access & BIT_ULL(access_bit)) (*layer_masks)[access_bit] &= ~layer_bit; is_empty = is_empty && !(*layer_masks)[access_bit]; } if (is_empty) return true; } return false; } typedef access_mask_t get_access_mask_t(const struct landlock_ruleset *const ruleset, const u16 layer_level); /** * landlock_init_layer_masks - Initialize layer masks from an access request * * Populates @layer_masks such that for each access right in @access_request, * the bits for all the layers are set where this access right is handled. * * @domain: The domain that defines the current restrictions. * @access_request: The requested access rights to check. * @layer_masks: It must contain %LANDLOCK_NUM_ACCESS_FS or * %LANDLOCK_NUM_ACCESS_NET elements according to @key_type. * @key_type: The key type to switch between access masks of different types. 
* * Returns: An access mask where each access right bit is set which is handled * in any of the active layers in @domain. */ access_mask_t landlock_init_layer_masks(const struct landlock_ruleset *const domain, const access_mask_t access_request, layer_mask_t (*const layer_masks)[], const enum landlock_key_type key_type) { access_mask_t handled_accesses = 0; size_t layer_level, num_access; get_access_mask_t *get_access_mask; switch (key_type) { case LANDLOCK_KEY_INODE: get_access_mask = landlock_get_fs_access_mask; num_access = LANDLOCK_NUM_ACCESS_FS; break; #if IS_ENABLED(CONFIG_INET) case LANDLOCK_KEY_NET_PORT: get_access_mask = landlock_get_net_access_mask; num_access = LANDLOCK_NUM_ACCESS_NET; break; #endif /* IS_ENABLED(CONFIG_INET) */ default: WARN_ON_ONCE(1); return 0; } memset(layer_masks, 0, array_size(sizeof((*layer_masks)[0]), num_access)); /* An empty access request can happen because of O_WRONLY | O_RDWR. */ if (!access_request) return 0; /* Saves all handled accesses per layer. */ for (layer_level = 0; layer_level < domain->num_layers; layer_level++) { const unsigned long access_req = access_request; const access_mask_t access_mask = get_access_mask(domain, layer_level); unsigned long access_bit; for_each_set_bit(access_bit, &access_req, num_access) { if (BIT_ULL(access_bit) & access_mask) { (*layer_masks)[access_bit] |= BIT_ULL(layer_level); handled_accesses |= BIT_ULL(access_bit); } } } return handled_accesses; }
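/*
 * Hedged illustration (not part of ruleset.c): a simplified sketch of how the
 * two helpers above are meant to be combined by an access-check path, here
 * for a network-port key (assumes CONFIG_INET; the real filesystem check
 * additionally walks every parent directory).  The function name and the
 * single-rule lookup are hypothetical simplifications; the called helpers
 * and constants are the ones defined above.
 */
static bool example_net_port_allowed(const struct landlock_ruleset *const domain,
				     const access_mask_t access_request,
				     const struct landlock_id id)
{
	layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_NET] = {};
	access_mask_t handled;

	/* Sets, per requested access right, the layers that handle it. */
	handled = landlock_init_layer_masks(domain, access_request,
					    &layer_masks, LANDLOCK_KEY_NET_PORT);
	if (!handled)
		/* No active layer handles this request: nothing to enforce. */
		return true;

	/* Allowed only if a matching rule clears every remaining layer mask. */
	return landlock_unmask_layers(landlock_find_rule(domain, id), handled,
				      &layer_masks, ARRAY_SIZE(layer_masks));
}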
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001-2003 Intel Corp.
 *
 * This file is part of the SCTP kernel implementation
 *
 * These functions implement the sctp_outq class. The outqueue handles
 * bundling and queueing of outgoing SCTP chunks.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson <karl@athena.chicago.il.us>
 *    Perry Melange <pmelange@null.cc.uic.edu>
 *    Xingang Guo <xingang.guo@intel.com>
 *    Hui Huang <hui.huang@nokia.com>
 *    Sridhar Samudrala <sri@us.ibm.com>
 *    Jon Grimm <jgrimm@us.ibm.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/list.h>   /* For struct list_head */
#include <linux/socket.h>
#include <linux/ip.h>
#include <linux/slab.h>
#include <net/sock.h>	  /* For skb_set_owner_w */

#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>
#include <net/sctp/stream_sched.h>

#include <trace/events/sctp.h>

/* Declare internal functions here.
*/ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn); static void sctp_check_transmitted(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, union sctp_addr *saddr, struct sctp_sackhdr *sack, __u32 *highest_new_tsn); static void sctp_mark_missing(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, __u32 highest_new_tsn, int count_of_newacks); static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); /* Add data to the front of the queue. */ static inline void sctp_outq_head_data(struct sctp_outq *q, struct sctp_chunk *ch) { struct sctp_stream_out_ext *oute; __u16 stream; list_add(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; stream = sctp_chunk_stream_no(ch); oute = SCTP_SO(&q->asoc->stream, stream)->ext; list_add(&ch->stream_list, &oute->outq); } /* Take data from the front of the queue. */ static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q) { return q->sched->dequeue(q); } /* Add data chunk to the end of the queue. */ static inline void sctp_outq_tail_data(struct sctp_outq *q, struct sctp_chunk *ch) { struct sctp_stream_out_ext *oute; __u16 stream; list_add_tail(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; stream = sctp_chunk_stream_no(ch); oute = SCTP_SO(&q->asoc->stream, stream)->ext; list_add_tail(&ch->stream_list, &oute->outq); } /* * SFR-CACC algorithm: * D) If count_of_newacks is greater than or equal to 2 * and t was not sent to the current primary then the * sender MUST NOT increment missing report count for t. */ static inline int sctp_cacc_skip_3_1_d(struct sctp_transport *primary, struct sctp_transport *transport, int count_of_newacks) { if (count_of_newacks >= 2 && transport != primary) return 1; return 0; } /* * SFR-CACC algorithm: * F) If count_of_newacks is less than 2, let d be the * destination to which t was sent. If cacc_saw_newack * is 0 for destination d, then the sender MUST NOT * increment missing report count for t. */ static inline int sctp_cacc_skip_3_1_f(struct sctp_transport *transport, int count_of_newacks) { if (count_of_newacks < 2 && (transport && !transport->cacc.cacc_saw_newack)) return 1; return 0; } /* * SFR-CACC algorithm: * 3.1) If CYCLING_CHANGEOVER is 0, the sender SHOULD * execute steps C, D, F. * * C has been implemented in sctp_outq_sack */ static inline int sctp_cacc_skip_3_1(struct sctp_transport *primary, struct sctp_transport *transport, int count_of_newacks) { if (!primary->cacc.cycling_changeover) { if (sctp_cacc_skip_3_1_d(primary, transport, count_of_newacks)) return 1; if (sctp_cacc_skip_3_1_f(transport, count_of_newacks)) return 1; return 0; } return 0; } /* * SFR-CACC algorithm: * 3.2) Else if CYCLING_CHANGEOVER is 1, and t is less * than next_tsn_at_change of the current primary, then * the sender MUST NOT increment missing report count * for t. */ static inline int sctp_cacc_skip_3_2(struct sctp_transport *primary, __u32 tsn) { if (primary->cacc.cycling_changeover && TSN_lt(tsn, primary->cacc.next_tsn_at_change)) return 1; return 0; } /* * SFR-CACC algorithm: * 3) If the missing report count for TSN t is to be * incremented according to [RFC2960] and * [SCTP_STEWART-2002], and CHANGEOVER_ACTIVE is set, * then the sender MUST further execute steps 3.1 and * 3.2 to determine if the missing report count for * TSN t SHOULD NOT be incremented. 
* * 3.3) If 3.1 and 3.2 do not dictate that the missing * report count for t should not be incremented, then * the sender SHOULD increment missing report count for * t (according to [RFC2960] and [SCTP_STEWART_2002]). */ static inline int sctp_cacc_skip(struct sctp_transport *primary, struct sctp_transport *transport, int count_of_newacks, __u32 tsn) { if (primary->cacc.changeover_active && (sctp_cacc_skip_3_1(primary, transport, count_of_newacks) || sctp_cacc_skip_3_2(primary, tsn))) return 1; return 0; } /* Initialize an existing sctp_outq. This does the boring stuff. * You still need to define handlers if you really want to DO * something with this structure... */ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) { memset(q, 0, sizeof(struct sctp_outq)); q->asoc = asoc; INIT_LIST_HEAD(&q->out_chunk_list); INIT_LIST_HEAD(&q->control_chunk_list); INIT_LIST_HEAD(&q->retransmit); INIT_LIST_HEAD(&q->sacked); INIT_LIST_HEAD(&q->abandoned); sctp_sched_set_sched(asoc, sctp_sk(asoc->base.sk)->default_ss); } /* Free the outqueue structure and any related pending chunks. */ static void __sctp_outq_teardown(struct sctp_outq *q) { struct sctp_transport *transport; struct list_head *lchunk, *temp; struct sctp_chunk *chunk, *tmp; /* Throw away unacknowledged chunks. */ list_for_each_entry(transport, &q->asoc->peer.transport_addr_list, transports) { while ((lchunk = sctp_list_dequeue(&transport->transmitted)) != NULL) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); /* Mark as part of a failed message. */ sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } } /* Throw away chunks that have been gap ACKed. */ list_for_each_safe(lchunk, temp, &q->sacked) { list_del_init(lchunk); chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } /* Throw away any chunks in the retransmit queue. */ list_for_each_safe(lchunk, temp, &q->retransmit) { list_del_init(lchunk); chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } /* Throw away any chunks that are in the abandoned queue. */ list_for_each_safe(lchunk, temp, &q->abandoned) { list_del_init(lchunk); chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } /* Throw away any leftover data chunks. */ while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { sctp_sched_dequeue_done(q, chunk); /* Mark as send failure. */ sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } /* Throw away any leftover control chunks. */ list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) { list_del_init(&chunk->list); sctp_chunk_free(chunk); } } void sctp_outq_teardown(struct sctp_outq *q) { __sctp_outq_teardown(q); sctp_outq_init(q->asoc, q); } /* Free the outqueue structure and any related pending chunks. */ void sctp_outq_free(struct sctp_outq *q) { /* Throw away leftover chunks. */ __sctp_outq_teardown(q); } /* Put a new chunk in an sctp_outq. */ void sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) { struct net *net = q->asoc->base.net; pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk, chunk && chunk->chunk_hdr ? sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : "illegal chunk"); /* If it is data, queue it up, otherwise, send it * immediately. 
*/ if (sctp_chunk_is_data(chunk)) { pr_debug("%s: outqueueing: outq:%p, chunk:%p[%s])\n", __func__, q, chunk, chunk && chunk->chunk_hdr ? sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : "illegal chunk"); sctp_outq_tail_data(q, chunk); if (chunk->asoc->peer.prsctp_capable && SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) chunk->asoc->sent_cnt_removable++; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS); else SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS); } else { list_add_tail(&chunk->list, &q->control_chunk_list); SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); } if (!q->cork) sctp_outq_flush(q, 0, gfp); } /* Insert a chunk into the sorted list based on the TSNs. The retransmit list * and the abandoned list are in ascending order. */ static void sctp_insert_list(struct list_head *head, struct list_head *new) { struct list_head *pos; struct sctp_chunk *nchunk, *lchunk; __u32 ntsn, ltsn; int done = 0; nchunk = list_entry(new, struct sctp_chunk, transmitted_list); ntsn = ntohl(nchunk->subh.data_hdr->tsn); list_for_each(pos, head) { lchunk = list_entry(pos, struct sctp_chunk, transmitted_list); ltsn = ntohl(lchunk->subh.data_hdr->tsn); if (TSN_lt(ntsn, ltsn)) { list_add(new, pos->prev); done = 1; break; } } if (!done) list_add_tail(new, head); } static int sctp_prsctp_prune_sent(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, struct list_head *queue, int msg_len) { struct sctp_chunk *chk, *temp; list_for_each_entry_safe(chk, temp, queue, transmitted_list) { struct sctp_stream_out *streamout; if (!chk->msg->abandoned && (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)) continue; chk->msg->abandoned = 1; list_del_init(&chk->transmitted_list); sctp_insert_list(&asoc->outqueue.abandoned, &chk->transmitted_list); streamout = SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream); asoc->sent_cnt_removable--; asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; if (queue != &asoc->outqueue.retransmit && !chk->tsn_gap_acked) { if (chk->transport) chk->transport->flight_size -= sctp_data_size(chk); asoc->outqueue.outstanding_bytes -= sctp_data_size(chk); } msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk); if (msg_len <= 0) break; } return msg_len; } static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, int msg_len) { struct sctp_outq *q = &asoc->outqueue; struct sctp_chunk *chk, *temp; struct sctp_stream_out *sout; q->sched->unsched_all(&asoc->stream); list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) { if (!chk->msg->abandoned && (!(chk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG) || !SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)) continue; chk->msg->abandoned = 1; sctp_sched_dequeue_common(q, chk); asoc->sent_cnt_removable--; asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; sout = SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream); sout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; /* clear out_curr if all frag chunks are pruned */ if (asoc->stream.out_curr == sout && list_is_last(&chk->frag_list, &chk->msg->chunks)) asoc->stream.out_curr = NULL; msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk); sctp_chunk_free(chk); if (msg_len <= 0) break; } q->sched->sched_all(&asoc->stream); return msg_len; } /* Abandon the chunks according their priorities */ void sctp_prsctp_prune(struct sctp_association *asoc, struct 
sctp_sndrcvinfo *sinfo, int msg_len) { struct sctp_transport *transport; if (!asoc->peer.prsctp_capable || !asoc->sent_cnt_removable) return; msg_len = sctp_prsctp_prune_sent(asoc, sinfo, &asoc->outqueue.retransmit, msg_len); if (msg_len <= 0) return; list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { msg_len = sctp_prsctp_prune_sent(asoc, sinfo, &transport->transmitted, msg_len); if (msg_len <= 0) return; } sctp_prsctp_prune_unsent(asoc, sinfo, msg_len); } /* Mark all the eligible packets on a transport for retransmission. */ void sctp_retransmit_mark(struct sctp_outq *q, struct sctp_transport *transport, __u8 reason) { struct list_head *lchunk, *ltemp; struct sctp_chunk *chunk; /* Walk through the specified transmitted queue. */ list_for_each_safe(lchunk, ltemp, &transport->transmitted) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); /* If the chunk is abandoned, move it to abandoned list. */ if (sctp_chunk_abandoned(chunk)) { list_del_init(lchunk); sctp_insert_list(&q->abandoned, lchunk); /* If this chunk has not been previousely acked, * stop considering it 'outstanding'. Our peer * will most likely never see it since it will * not be retransmitted */ if (!chunk->tsn_gap_acked) { if (chunk->transport) chunk->transport->flight_size -= sctp_data_size(chunk); q->outstanding_bytes -= sctp_data_size(chunk); q->asoc->peer.rwnd += sctp_data_size(chunk); } continue; } /* If we are doing retransmission due to a timeout or pmtu * discovery, only the chunks that are not yet acked should * be added to the retransmit queue. */ if ((reason == SCTP_RTXR_FAST_RTX && (chunk->fast_retransmit == SCTP_NEED_FRTX)) || (reason != SCTP_RTXR_FAST_RTX && !chunk->tsn_gap_acked)) { /* RFC 2960 6.2.1 Processing a Received SACK * * C) Any time a DATA chunk is marked for * retransmission (via either T3-rtx timer expiration * (Section 6.3.3) or via fast retransmit * (Section 7.2.4)), add the data size of those * chunks to the rwnd. */ q->asoc->peer.rwnd += sctp_data_size(chunk); q->outstanding_bytes -= sctp_data_size(chunk); if (chunk->transport) transport->flight_size -= sctp_data_size(chunk); /* sctpimpguide-05 Section 2.8.2 * M5) If a T3-rtx timer expires, the * 'TSN.Missing.Report' of all affected TSNs is set * to 0. */ chunk->tsn_missing_report = 0; /* If a chunk that is being used for RTT measurement * has to be retransmitted, we cannot use this chunk * anymore for RTT measurements. Reset rto_pending so * that a new RTT measurement is started when a new * data chunk is sent. */ if (chunk->rtt_in_progress) { chunk->rtt_in_progress = 0; transport->rto_pending = 0; } /* Move the chunk to the retransmit queue. The chunks * on the retransmit queue are always kept in order. */ list_del_init(lchunk); sctp_insert_list(&q->retransmit, lchunk); } } pr_debug("%s: transport:%p, reason:%d, cwnd:%d, ssthresh:%d, " "flight_size:%d, pba:%d\n", __func__, transport, reason, transport->cwnd, transport->ssthresh, transport->flight_size, transport->partial_bytes_acked); } /* Mark all the eligible packets on a transport for retransmission and force * one packet out. */ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, enum sctp_retransmit_reason reason) { struct net *net = q->asoc->base.net; switch (reason) { case SCTP_RTXR_T3_RTX: SCTP_INC_STATS(net, SCTP_MIB_T3_RETRANSMITS); sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_T3_RTX); /* Update the retran path if the T3-rtx timer has expired for * the current retran path. 
*/ if (transport == transport->asoc->peer.retran_path) sctp_assoc_update_retran_path(transport->asoc); transport->asoc->rtx_data_chunks += transport->asoc->unack_data; if (transport->pl.state == SCTP_PL_COMPLETE && transport->asoc->unack_data) sctp_transport_reset_probe_timer(transport); break; case SCTP_RTXR_FAST_RTX: SCTP_INC_STATS(net, SCTP_MIB_FAST_RETRANSMITS); sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_FAST_RTX); q->fast_rtx = 1; break; case SCTP_RTXR_PMTUD: SCTP_INC_STATS(net, SCTP_MIB_PMTUD_RETRANSMITS); break; case SCTP_RTXR_T1_RTX: SCTP_INC_STATS(net, SCTP_MIB_T1_RETRANSMITS); transport->asoc->init_retries++; break; default: BUG(); } sctp_retransmit_mark(q, transport, reason); /* PR-SCTP A5) Any time the T3-rtx timer expires, on any destination, * the sender SHOULD try to advance the "Advanced.Peer.Ack.Point" by * following the procedures outlined in C1 - C5. */ if (reason == SCTP_RTXR_T3_RTX) q->asoc->stream.si->generate_ftsn(q, q->asoc->ctsn_ack_point); /* Flush the queues only on timeout, since fast_rtx is only * triggered during sack processing and the queue * will be flushed at the end. */ if (reason != SCTP_RTXR_FAST_RTX) sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC); } /* * Transmit DATA chunks on the retransmit queue. Upon return from * __sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which * need to be transmitted by the caller. * We assume that pkt->transport has already been set. * * The return value is a normal kernel error return value. */ static int __sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, int rtx_timeout, int *start_timer, gfp_t gfp) { struct sctp_transport *transport = pkt->transport; struct sctp_chunk *chunk, *chunk1; struct list_head *lqueue; enum sctp_xmit status; int error = 0; int timer = 0; int done = 0; int fast_rtx; lqueue = &q->retransmit; fast_rtx = q->fast_rtx; /* This loop handles time-out retransmissions, fast retransmissions, * and retransmissions due to opening of whindow. * * RFC 2960 6.3.3 Handle T3-rtx Expiration * * E3) Determine how many of the earliest (i.e., lowest TSN) * outstanding DATA chunks for the address for which the * T3-rtx has expired will fit into a single packet, subject * to the MTU constraint for the path corresponding to the * destination transport address to which the retransmission * is being sent (this may be different from the address for * which the timer expires [see Section 6.4]). Call this value * K. Bundle and retransmit those K DATA chunks in a single * packet to the destination endpoint. * * [Just to be painfully clear, if we are retransmitting * because a timeout just happened, we should send only ONE * packet of retransmitted data.] * * For fast retransmissions we also send only ONE packet. However, * if we are just flushing the queue due to open window, we'll * try to send as much as possible. */ list_for_each_entry_safe(chunk, chunk1, lqueue, transmitted_list) { /* If the chunk is abandoned, move it to abandoned list. */ if (sctp_chunk_abandoned(chunk)) { list_del_init(&chunk->transmitted_list); sctp_insert_list(&q->abandoned, &chunk->transmitted_list); continue; } /* Make sure that Gap Acked TSNs are not retransmitted. A * simple approach is just to move such TSNs out of the * way and into a 'transmitted' queue and skip to the * next chunk. 
*/ if (chunk->tsn_gap_acked) { list_move_tail(&chunk->transmitted_list, &transport->transmitted); continue; } /* If we are doing fast retransmit, ignore non-fast_rtransmit * chunks */ if (fast_rtx && !chunk->fast_retransmit) continue; redo: /* Attempt to append this chunk to the packet. */ status = sctp_packet_append_chunk(pkt, chunk); switch (status) { case SCTP_XMIT_PMTU_FULL: if (!pkt->has_data && !pkt->has_cookie_echo) { /* If this packet did not contain DATA then * retransmission did not happen, so do it * again. We'll ignore the error here since * control chunks are already freed so there * is nothing we can do. */ sctp_packet_transmit(pkt, gfp); goto redo; } /* Send this packet. */ error = sctp_packet_transmit(pkt, gfp); /* If we are retransmitting, we should only * send a single packet. * Otherwise, try appending this chunk again. */ if (rtx_timeout || fast_rtx) done = 1; else goto redo; /* Bundle next chunk in the next round. */ break; case SCTP_XMIT_RWND_FULL: /* Send this packet. */ error = sctp_packet_transmit(pkt, gfp); /* Stop sending DATA as there is no more room * at the receiver. */ done = 1; break; case SCTP_XMIT_DELAY: /* Send this packet. */ error = sctp_packet_transmit(pkt, gfp); /* Stop sending DATA because of nagle delay. */ done = 1; break; default: /* The append was successful, so add this chunk to * the transmitted list. */ list_move_tail(&chunk->transmitted_list, &transport->transmitted); /* Mark the chunk as ineligible for fast retransmit * after it is retransmitted. */ if (chunk->fast_retransmit == SCTP_NEED_FRTX) chunk->fast_retransmit = SCTP_DONT_FRTX; q->asoc->stats.rtxchunks++; break; } /* Set the timer if there were no errors */ if (!error && !timer) timer = 1; if (done) break; } /* If we are here due to a retransmit timeout or a fast * retransmit and if there are any chunks left in the retransmit * queue that could not fit in the PMTU sized packet, they need * to be marked as ineligible for a subsequent fast retransmit. */ if (rtx_timeout || fast_rtx) { list_for_each_entry(chunk1, lqueue, transmitted_list) { if (chunk1->fast_retransmit == SCTP_NEED_FRTX) chunk1->fast_retransmit = SCTP_DONT_FRTX; } } *start_timer = timer; /* Clear fast retransmit hint */ if (fast_rtx) q->fast_rtx = 0; return error; } /* Cork the outqueue so queued chunks are really queued. */ void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) { if (q->cork) q->cork = 0; sctp_outq_flush(q, 0, gfp); } static int sctp_packet_singleton(struct sctp_transport *transport, struct sctp_chunk *chunk, gfp_t gfp) { const struct sctp_association *asoc = transport->asoc; const __u16 sport = asoc->base.bind_addr.port; const __u16 dport = asoc->peer.port; const __u32 vtag = asoc->peer.i.init_tag; struct sctp_packet singleton; sctp_packet_init(&singleton, transport, sport, dport); sctp_packet_config(&singleton, vtag, 0); if (sctp_packet_append_chunk(&singleton, chunk) != SCTP_XMIT_OK) { list_del_init(&chunk->list); sctp_chunk_free(chunk); return -ENOMEM; } return sctp_packet_transmit(&singleton, gfp); } /* Struct to hold the context during sctp outq flush */ struct sctp_flush_ctx { struct sctp_outq *q; /* Current transport being used. It's NOT the same as curr active one */ struct sctp_transport *transport; /* These transports have chunks to send. 
*/ struct list_head transport_list; struct sctp_association *asoc; /* Packet on the current transport above */ struct sctp_packet *packet; gfp_t gfp; }; /* transport: current transport */ static void sctp_outq_select_transport(struct sctp_flush_ctx *ctx, struct sctp_chunk *chunk) { struct sctp_transport *new_transport = chunk->transport; if (!new_transport) { if (!sctp_chunk_is_data(chunk)) { /* If we have a prior transport pointer, see if * the destination address of the chunk * matches the destination address of the * current transport. If not a match, then * try to look up the transport with a given * destination address. We do this because * after processing ASCONFs, we may have new * transports created. */ if (ctx->transport && sctp_cmp_addr_exact(&chunk->dest, &ctx->transport->ipaddr)) new_transport = ctx->transport; else new_transport = sctp_assoc_lookup_paddr(ctx->asoc, &chunk->dest); } /* if we still don't have a new transport, then * use the current active path. */ if (!new_transport) new_transport = ctx->asoc->peer.active_path; } else { __u8 type; switch (new_transport->state) { case SCTP_INACTIVE: case SCTP_UNCONFIRMED: case SCTP_PF: /* If the chunk is Heartbeat or Heartbeat Ack, * send it to chunk->transport, even if it's * inactive. * * 3.3.6 Heartbeat Acknowledgement: * ... * A HEARTBEAT ACK is always sent to the source IP * address of the IP datagram containing the * HEARTBEAT chunk to which this ack is responding. * ... * * ASCONF_ACKs also must be sent to the source. */ type = chunk->chunk_hdr->type; if (type != SCTP_CID_HEARTBEAT && type != SCTP_CID_HEARTBEAT_ACK && type != SCTP_CID_ASCONF_ACK) new_transport = ctx->asoc->peer.active_path; break; default: break; } } /* Are we switching transports? Take care of transport locks. */ if (new_transport != ctx->transport) { ctx->transport = new_transport; ctx->packet = &ctx->transport->packet; if (list_empty(&ctx->transport->send_ready)) list_add_tail(&ctx->transport->send_ready, &ctx->transport_list); sctp_packet_config(ctx->packet, ctx->asoc->peer.i.init_tag, ctx->asoc->peer.ecn_capable); /* We've switched transports, so apply the * Burst limit to the new transport. */ sctp_transport_burst_limited(ctx->transport); } } static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx) { struct sctp_chunk *chunk, *tmp; enum sctp_xmit status; int one_packet, error; list_for_each_entry_safe(chunk, tmp, &ctx->q->control_chunk_list, list) { one_packet = 0; /* RFC 5061, 5.3 * F1) This means that until such time as the ASCONF * containing the add is acknowledged, the sender MUST * NOT use the new IP address as a source for ANY SCTP * packet except on carrying an ASCONF Chunk. */ if (ctx->asoc->src_out_of_asoc_ok && chunk->chunk_hdr->type != SCTP_CID_ASCONF) continue; list_del_init(&chunk->list); /* Pick the right transport to use. Should always be true for * the first chunk as we don't have a transport by then. */ sctp_outq_select_transport(ctx, chunk); switch (chunk->chunk_hdr->type) { /* 6.10 Bundling * ... * An endpoint MUST NOT bundle INIT, INIT ACK or SHUTDOWN * COMPLETE with any other chunks. [Send them immediately.] */ case SCTP_CID_INIT: case SCTP_CID_INIT_ACK: case SCTP_CID_SHUTDOWN_COMPLETE: error = sctp_packet_singleton(ctx->transport, chunk, ctx->gfp); if (error < 0) { ctx->asoc->base.sk->sk_err = -error; return; } ctx->asoc->stats.octrlchunks++; break; case SCTP_CID_ABORT: if (sctp_test_T_bit(chunk)) ctx->packet->vtag = ctx->asoc->c.my_vtag; fallthrough; /* The following chunks are "response" chunks, i.e. 
* they are generated in response to something we * received. If we are sending these, then we can * send only 1 packet containing these chunks. */ case SCTP_CID_HEARTBEAT_ACK: case SCTP_CID_SHUTDOWN_ACK: case SCTP_CID_COOKIE_ACK: case SCTP_CID_COOKIE_ECHO: case SCTP_CID_ERROR: case SCTP_CID_ECN_CWR: case SCTP_CID_ASCONF_ACK: one_packet = 1; fallthrough; case SCTP_CID_HEARTBEAT: if (chunk->pmtu_probe) { error = sctp_packet_singleton(ctx->transport, chunk, ctx->gfp); if (!error) ctx->asoc->stats.octrlchunks++; break; } fallthrough; case SCTP_CID_SACK: case SCTP_CID_SHUTDOWN: case SCTP_CID_ECN_ECNE: case SCTP_CID_ASCONF: case SCTP_CID_FWD_TSN: case SCTP_CID_I_FWD_TSN: case SCTP_CID_RECONF: status = sctp_packet_transmit_chunk(ctx->packet, chunk, one_packet, ctx->gfp); if (status != SCTP_XMIT_OK) { /* put the chunk back */ list_add(&chunk->list, &ctx->q->control_chunk_list); break; } ctx->asoc->stats.octrlchunks++; /* PR-SCTP C5) If a FORWARD TSN is sent, the * sender MUST assure that at least one T3-rtx * timer is running. */ if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN || chunk->chunk_hdr->type == SCTP_CID_I_FWD_TSN) { sctp_transport_reset_t3_rtx(ctx->transport); ctx->transport->last_time_sent = jiffies; } if (chunk == ctx->asoc->strreset_chunk) sctp_transport_reset_reconf_timer(ctx->transport); break; default: /* We built a chunk with an illegal type! */ BUG(); } } } /* Returns false if new data shouldn't be sent */ static bool sctp_outq_flush_rtx(struct sctp_flush_ctx *ctx, int rtx_timeout) { int error, start_timer = 0; if (ctx->asoc->peer.retran_path->state == SCTP_UNCONFIRMED) return false; if (ctx->transport != ctx->asoc->peer.retran_path) { /* Switch transports & prepare the packet. */ ctx->transport = ctx->asoc->peer.retran_path; ctx->packet = &ctx->transport->packet; if (list_empty(&ctx->transport->send_ready)) list_add_tail(&ctx->transport->send_ready, &ctx->transport_list); sctp_packet_config(ctx->packet, ctx->asoc->peer.i.init_tag, ctx->asoc->peer.ecn_capable); } error = __sctp_outq_flush_rtx(ctx->q, ctx->packet, rtx_timeout, &start_timer, ctx->gfp); if (error < 0) ctx->asoc->base.sk->sk_err = -error; if (start_timer) { sctp_transport_reset_t3_rtx(ctx->transport); ctx->transport->last_time_sent = jiffies; } /* This can happen on COOKIE-ECHO resend. Only * one chunk can get bundled with a COOKIE-ECHO. */ if (ctx->packet->has_cookie_echo) return false; /* Don't send new data if there is still data * waiting to retransmit. */ if (!list_empty(&ctx->q->retransmit)) return false; return true; } static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx, int rtx_timeout) { struct sctp_chunk *chunk; enum sctp_xmit status; /* Is it OK to send data chunks? */ switch (ctx->asoc->state) { case SCTP_STATE_COOKIE_ECHOED: /* Only allow bundling when this packet has a COOKIE-ECHO * chunk. */ if (!ctx->packet || !ctx->packet->has_cookie_echo) return; fallthrough; case SCTP_STATE_ESTABLISHED: case SCTP_STATE_SHUTDOWN_PENDING: case SCTP_STATE_SHUTDOWN_RECEIVED: break; default: /* Do nothing. */ return; } /* RFC 2960 6.1 Transmission of DATA Chunks * * C) When the time comes for the sender to transmit, * before sending new DATA chunks, the sender MUST * first transmit any outstanding DATA chunks which * are marked for retransmission (limited by the * current cwnd). */ if (!list_empty(&ctx->q->retransmit) && !sctp_outq_flush_rtx(ctx, rtx_timeout)) return; /* Apply Max.Burst limitation to the current transport in * case it will be used for new data. 
We are going to * rest it before we return, but we want to apply the limit * to the currently queued data. */ if (ctx->transport) sctp_transport_burst_limited(ctx->transport); /* Finally, transmit new packets. */ while ((chunk = sctp_outq_dequeue_data(ctx->q)) != NULL) { __u32 sid = ntohs(chunk->subh.data_hdr->stream); __u8 stream_state = SCTP_SO(&ctx->asoc->stream, sid)->state; /* Has this chunk expired? */ if (sctp_chunk_abandoned(chunk)) { sctp_sched_dequeue_done(ctx->q, chunk); sctp_chunk_fail(chunk, 0); sctp_chunk_free(chunk); continue; } if (stream_state == SCTP_STREAM_CLOSED) { sctp_outq_head_data(ctx->q, chunk); break; } sctp_outq_select_transport(ctx, chunk); pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p skb->users:%d\n", __func__, ctx->q, chunk, chunk && chunk->chunk_hdr ? sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : "illegal chunk", ntohl(chunk->subh.data_hdr->tsn), chunk->skb ? chunk->skb->head : NULL, chunk->skb ? refcount_read(&chunk->skb->users) : -1); /* Add the chunk to the packet. */ status = sctp_packet_transmit_chunk(ctx->packet, chunk, 0, ctx->gfp); if (status != SCTP_XMIT_OK) { /* We could not append this chunk, so put * the chunk back on the output queue. */ pr_debug("%s: could not transmit tsn:0x%x, status:%d\n", __func__, ntohl(chunk->subh.data_hdr->tsn), status); sctp_outq_head_data(ctx->q, chunk); break; } /* The sender is in the SHUTDOWN-PENDING state, * The sender MAY set the I-bit in the DATA * chunk header. */ if (ctx->asoc->state == SCTP_STATE_SHUTDOWN_PENDING) chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) ctx->asoc->stats.ouodchunks++; else ctx->asoc->stats.oodchunks++; /* Only now it's safe to consider this * chunk as sent, sched-wise. */ sctp_sched_dequeue_done(ctx->q, chunk); list_add_tail(&chunk->transmitted_list, &ctx->transport->transmitted); sctp_transport_reset_t3_rtx(ctx->transport); ctx->transport->last_time_sent = jiffies; /* Only let one DATA chunk get bundled with a * COOKIE-ECHO chunk. */ if (ctx->packet->has_cookie_echo) break; } } static void sctp_outq_flush_transports(struct sctp_flush_ctx *ctx) { struct sock *sk = ctx->asoc->base.sk; struct list_head *ltransport; struct sctp_packet *packet; struct sctp_transport *t; int error = 0; while ((ltransport = sctp_list_dequeue(&ctx->transport_list)) != NULL) { t = list_entry(ltransport, struct sctp_transport, send_ready); packet = &t->packet; if (!sctp_packet_empty(packet)) { rcu_read_lock(); if (t->dst && __sk_dst_get(sk) != t->dst) { dst_hold(t->dst); sk_setup_caps(sk, t->dst); } rcu_read_unlock(); error = sctp_packet_transmit(packet, ctx->gfp); if (error < 0) ctx->q->asoc->base.sk->sk_err = -error; } /* Clear the burst limited state, if any */ sctp_transport_burst_reset(t); } } /* Try to flush an outqueue. * * Description: Send everything in q which we legally can, subject to * congestion limitations. * * Note: This function can be called from multiple contexts so appropriate * locking concerns must be made. Today we use the sock lock to protect * this function. */ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) { struct sctp_flush_ctx ctx = { .q = q, .transport = NULL, .transport_list = LIST_HEAD_INIT(ctx.transport_list), .asoc = q->asoc, .packet = NULL, .gfp = gfp, }; /* 6.10 Bundling * ... * When bundling control chunks with DATA chunks, an * endpoint MUST place control chunks first in the outbound * SCTP packet. 
The transmitter MUST transmit DATA chunks * within a SCTP packet in increasing order of TSN. * ... */ sctp_outq_flush_ctrl(&ctx); if (q->asoc->src_out_of_asoc_ok) goto sctp_flush_out; sctp_outq_flush_data(&ctx, rtx_timeout); sctp_flush_out: sctp_outq_flush_transports(&ctx); } /* Update unack_data based on the incoming SACK chunk */ static void sctp_sack_update_unack_data(struct sctp_association *assoc, struct sctp_sackhdr *sack) { union sctp_sack_variable *frags; __u16 unack_data; int i; unack_data = assoc->next_tsn - assoc->ctsn_ack_point - 1; frags = (union sctp_sack_variable *)(sack + 1); for (i = 0; i < ntohs(sack->num_gap_ack_blocks); i++) { unack_data -= ((ntohs(frags[i].gab.end) - ntohs(frags[i].gab.start) + 1)); } assoc->unack_data = unack_data; } /* This is where we REALLY process a SACK. * * Process the SACK against the outqueue. Mostly, this just frees * things off the transmitted queue. */ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) { struct sctp_association *asoc = q->asoc; struct sctp_sackhdr *sack = chunk->subh.sack_hdr; struct sctp_transport *transport; struct sctp_chunk *tchunk = NULL; struct list_head *lchunk, *transport_list, *temp; __u32 sack_ctsn, ctsn, tsn; __u32 highest_tsn, highest_new_tsn; __u32 sack_a_rwnd; unsigned int outstanding; struct sctp_transport *primary = asoc->peer.primary_path; int count_of_newacks = 0; int gap_ack_blocks; u8 accum_moved = 0; /* Grab the association's destination address list. */ transport_list = &asoc->peer.transport_addr_list; /* SCTP path tracepoint for congestion control debugging. */ if (trace_sctp_probe_path_enabled()) { list_for_each_entry(transport, transport_list, transports) trace_sctp_probe_path(transport, asoc); } sack_ctsn = ntohl(sack->cum_tsn_ack); gap_ack_blocks = ntohs(sack->num_gap_ack_blocks); asoc->stats.gapcnt += gap_ack_blocks; /* * SFR-CACC algorithm: * On receipt of a SACK the sender SHOULD execute the * following statements. * * 1) If the cumulative ack in the SACK passes next tsn_at_change * on the current primary, the CHANGEOVER_ACTIVE flag SHOULD be * cleared. The CYCLING_CHANGEOVER flag SHOULD also be cleared for * all destinations. * 2) If the SACK contains gap acks and the flag CHANGEOVER_ACTIVE * is set the receiver of the SACK MUST take the following actions: * * A) Initialize the cacc_saw_newack to 0 for all destination * addresses. * * Only bother if changeover_active is set. Otherwise, this is * totally suboptimal to do on every SACK. */ if (primary->cacc.changeover_active) { u8 clear_cycling = 0; if (TSN_lte(primary->cacc.next_tsn_at_change, sack_ctsn)) { primary->cacc.changeover_active = 0; clear_cycling = 1; } if (clear_cycling || gap_ack_blocks) { list_for_each_entry(transport, transport_list, transports) { if (clear_cycling) transport->cacc.cycling_changeover = 0; if (gap_ack_blocks) transport->cacc.cacc_saw_newack = 0; } } } /* Get the highest TSN in the sack. */ highest_tsn = sack_ctsn; if (gap_ack_blocks) { union sctp_sack_variable *frags = (union sctp_sack_variable *)(sack + 1); highest_tsn += ntohs(frags[gap_ack_blocks - 1].gab.end); } if (TSN_lt(asoc->highest_sacked, highest_tsn)) asoc->highest_sacked = highest_tsn; highest_new_tsn = sack_ctsn; /* Run through the retransmit queue. Credit bytes received * and free those chunks that we can. */ sctp_check_transmitted(q, &q->retransmit, NULL, NULL, sack, &highest_new_tsn); /* Run through the transmitted queue. * Credit bytes received and free those chunks which we can. * * This is a MASSIVE candidate for optimization. 
*/ list_for_each_entry(transport, transport_list, transports) { sctp_check_transmitted(q, &transport->transmitted, transport, &chunk->source, sack, &highest_new_tsn); /* * SFR-CACC algorithm: * C) Let count_of_newacks be the number of * destinations for which cacc_saw_newack is set. */ if (transport->cacc.cacc_saw_newack) count_of_newacks++; } /* Move the Cumulative TSN Ack Point if appropriate. */ if (TSN_lt(asoc->ctsn_ack_point, sack_ctsn)) { asoc->ctsn_ack_point = sack_ctsn; accum_moved = 1; } if (gap_ack_blocks) { if (asoc->fast_recovery && accum_moved) highest_new_tsn = highest_tsn; list_for_each_entry(transport, transport_list, transports) sctp_mark_missing(q, &transport->transmitted, transport, highest_new_tsn, count_of_newacks); } /* Update unack_data field in the assoc. */ sctp_sack_update_unack_data(asoc, sack); ctsn = asoc->ctsn_ack_point; /* Throw away stuff rotting on the sack queue. */ list_for_each_safe(lchunk, temp, &q->sacked) { tchunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); tsn = ntohl(tchunk->subh.data_hdr->tsn); if (TSN_lte(tsn, ctsn)) { list_del_init(&tchunk->transmitted_list); if (asoc->peer.prsctp_capable && SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) asoc->sent_cnt_removable--; sctp_chunk_free(tchunk); } } /* ii) Set rwnd equal to the newly received a_rwnd minus the * number of bytes still outstanding after processing the * Cumulative TSN Ack and the Gap Ack Blocks. */ sack_a_rwnd = ntohl(sack->a_rwnd); asoc->peer.zero_window_announced = !sack_a_rwnd; outstanding = q->outstanding_bytes; if (outstanding < sack_a_rwnd) sack_a_rwnd -= outstanding; else sack_a_rwnd = 0; asoc->peer.rwnd = sack_a_rwnd; asoc->stream.si->generate_ftsn(q, sack_ctsn); pr_debug("%s: sack cumulative tsn ack:0x%x\n", __func__, sack_ctsn); pr_debug("%s: cumulative tsn ack of assoc:%p is 0x%x, " "advertised peer ack point:0x%x\n", __func__, asoc, ctsn, asoc->adv_peer_ack_point); return sctp_outq_is_empty(q); } /* Is the outqueue empty? * The queue is empty when we have not pending data, no in-flight data * and nothing pending retransmissions. */ int sctp_outq_is_empty(const struct sctp_outq *q) { return q->out_qlen == 0 && q->outstanding_bytes == 0 && list_empty(&q->retransmit); } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* Go through a transport's transmitted list or the association's retransmit * list and move chunks that are acked by the Cumulative TSN Ack to q->sacked. * The retransmit list will not have an associated transport. * * I added coherent debug information output. --xguo * * Instead of printing 'sacked' or 'kept' for each TSN on the * transmitted_queue, we print a range: SACKED: TSN1-TSN2, TSN3, TSN4-TSN5. * KEPT TSN6-TSN7, etc. */ static void sctp_check_transmitted(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, union sctp_addr *saddr, struct sctp_sackhdr *sack, __u32 *highest_new_tsn_in_sack) { struct list_head *lchunk; struct sctp_chunk *tchunk; struct list_head tlist; __u32 tsn; __u32 sack_ctsn; __u32 rtt; __u8 restart_timer = 0; int bytes_acked = 0; int migrate_bytes = 0; bool forward_progress = false; sack_ctsn = ntohl(sack->cum_tsn_ack); INIT_LIST_HEAD(&tlist); /* The while loop will skip empty transmitted queues. 
*/ while (NULL != (lchunk = sctp_list_dequeue(transmitted_queue))) { tchunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); if (sctp_chunk_abandoned(tchunk)) { /* Move the chunk to abandoned list. */ sctp_insert_list(&q->abandoned, lchunk); /* If this chunk has not been acked, stop * considering it as 'outstanding'. */ if (transmitted_queue != &q->retransmit && !tchunk->tsn_gap_acked) { if (tchunk->transport) tchunk->transport->flight_size -= sctp_data_size(tchunk); q->outstanding_bytes -= sctp_data_size(tchunk); } continue; } tsn = ntohl(tchunk->subh.data_hdr->tsn); if (sctp_acked(sack, tsn)) { /* If this queue is the retransmit queue, the * retransmit timer has already reclaimed * the outstanding bytes for this chunk, so only * count bytes associated with a transport. */ if (transport && !tchunk->tsn_gap_acked) { /* If this chunk is being used for RTT * measurement, calculate the RTT and update * the RTO using this value. * * 6.3.1 C5) Karn's algorithm: RTT measurements * MUST NOT be made using packets that were * retransmitted (and thus for which it is * ambiguous whether the reply was for the * first instance of the packet or a later * instance). */ if (!sctp_chunk_retransmitted(tchunk) && tchunk->rtt_in_progress) { tchunk->rtt_in_progress = 0; rtt = jiffies - tchunk->sent_at; sctp_transport_update_rto(transport, rtt); } if (TSN_lte(tsn, sack_ctsn)) { /* * SFR-CACC algorithm: * 2) If the SACK contains gap acks * and the flag CHANGEOVER_ACTIVE is * set the receiver of the SACK MUST * take the following action: * * B) For each TSN t being acked that * has not been acked in any SACK so * far, set cacc_saw_newack to 1 for * the destination that the TSN was * sent to. */ if (sack->num_gap_ack_blocks && q->asoc->peer.primary_path->cacc. changeover_active) transport->cacc.cacc_saw_newack = 1; } } /* If the chunk hasn't been marked as ACKED, * mark it and account bytes_acked if the * chunk had a valid transport (it will not * have a transport if ASCONF had deleted it * while DATA was outstanding). */ if (!tchunk->tsn_gap_acked) { tchunk->tsn_gap_acked = 1; if (TSN_lt(*highest_new_tsn_in_sack, tsn)) *highest_new_tsn_in_sack = tsn; bytes_acked += sctp_data_size(tchunk); if (!tchunk->transport) migrate_bytes += sctp_data_size(tchunk); forward_progress = true; } if (TSN_lte(tsn, sack_ctsn)) { /* RFC 2960 6.3.2 Retransmission Timer Rules * * R3) Whenever a SACK is received * that acknowledges the DATA chunk * with the earliest outstanding TSN * for that address, restart T3-rtx * timer for that address with its * current RTO. */ restart_timer = 1; forward_progress = true; list_add_tail(&tchunk->transmitted_list, &q->sacked); } else { /* RFC2960 7.2.4, sctpimpguide-05 2.8.2 * M2) Each time a SACK arrives reporting * 'Stray DATA chunk(s)' record the highest TSN * reported as newly acknowledged, call this * value 'HighestTSNinSack'. A newly * acknowledged DATA chunk is one not * previously acknowledged in a SACK. * * When the SCTP sender of data receives a SACK * chunk that acknowledges, for the first time, * the receipt of a DATA chunk, all the still * unacknowledged DATA chunks whose TSN is * older than that newly acknowledged DATA * chunk, are qualified as 'Stray DATA chunks'. 
*/ list_add_tail(lchunk, &tlist); } } else { if (tchunk->tsn_gap_acked) { pr_debug("%s: receiver reneged on data TSN:0x%x\n", __func__, tsn); tchunk->tsn_gap_acked = 0; if (tchunk->transport) bytes_acked -= sctp_data_size(tchunk); /* RFC 2960 6.3.2 Retransmission Timer Rules * * R4) Whenever a SACK is received missing a * TSN that was previously acknowledged via a * Gap Ack Block, start T3-rtx for the * destination address to which the DATA * chunk was originally * transmitted if it is not already running. */ restart_timer = 1; } list_add_tail(lchunk, &tlist); } } if (transport) { if (bytes_acked) { struct sctp_association *asoc = transport->asoc; /* We may have counted DATA that was migrated * to this transport due to DEL-IP operation. * Subtract those bytes, since the were never * send on this transport and shouldn't be * credited to this transport. */ bytes_acked -= migrate_bytes; /* 8.2. When an outstanding TSN is acknowledged, * the endpoint shall clear the error counter of * the destination transport address to which the * DATA chunk was last sent. * The association's overall error counter is * also cleared. */ transport->error_count = 0; transport->asoc->overall_error_count = 0; forward_progress = true; /* * While in SHUTDOWN PENDING, we may have started * the T5 shutdown guard timer after reaching the * retransmission limit. Stop that timer as soon * as the receiver acknowledged any data. */ if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING && timer_delete(&asoc->timers[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD])) sctp_association_put(asoc); /* Mark the destination transport address as * active if it is not so marked. */ if ((transport->state == SCTP_INACTIVE || transport->state == SCTP_UNCONFIRMED) && sctp_cmp_addr_exact(&transport->ipaddr, saddr)) { sctp_assoc_control_transport( transport->asoc, transport, SCTP_TRANSPORT_UP, SCTP_RECEIVED_SACK); } sctp_transport_raise_cwnd(transport, sack_ctsn, bytes_acked); transport->flight_size -= bytes_acked; if (transport->flight_size == 0) transport->partial_bytes_acked = 0; q->outstanding_bytes -= bytes_acked + migrate_bytes; } else { /* RFC 2960 6.1, sctpimpguide-06 2.15.2 * When a sender is doing zero window probing, it * should not timeout the association if it continues * to receive new packets from the receiver. The * reason is that the receiver MAY keep its window * closed for an indefinite time. * A sender is doing zero window probing when the * receiver's advertised window is zero, and there is * only one data chunk in flight to the receiver. * * Allow the association to timeout while in SHUTDOWN * PENDING or SHUTDOWN RECEIVED in case the receiver * stays in zero window mode forever. */ if (!q->asoc->peer.rwnd && !list_empty(&tlist) && (sack_ctsn+2 == q->asoc->next_tsn) && q->asoc->state < SCTP_STATE_SHUTDOWN_PENDING) { pr_debug("%s: sack received for zero window " "probe:%u\n", __func__, sack_ctsn); q->asoc->overall_error_count = 0; transport->error_count = 0; } } /* RFC 2960 6.3.2 Retransmission Timer Rules * * R2) Whenever all outstanding data sent to an address have * been acknowledged, turn off the T3-rtx timer of that * address. 
*/ if (!transport->flight_size) { if (timer_delete(&transport->T3_rtx_timer)) sctp_transport_put(transport); } else if (restart_timer) { if (!mod_timer(&transport->T3_rtx_timer, jiffies + transport->rto)) sctp_transport_hold(transport); } if (forward_progress) { if (transport->dst) sctp_transport_dst_confirm(transport); } } list_splice(&tlist, transmitted_queue); } /* Mark chunks as missing and consequently may get retransmitted. */ static void sctp_mark_missing(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, __u32 highest_new_tsn_in_sack, int count_of_newacks) { struct sctp_chunk *chunk; __u32 tsn; char do_fast_retransmit = 0; struct sctp_association *asoc = q->asoc; struct sctp_transport *primary = asoc->peer.primary_path; list_for_each_entry(chunk, transmitted_queue, transmitted_list) { tsn = ntohl(chunk->subh.data_hdr->tsn); /* RFC 2960 7.2.4, sctpimpguide-05 2.8.2 M3) Examine all * 'Unacknowledged TSN's', if the TSN number of an * 'Unacknowledged TSN' is smaller than the 'HighestTSNinSack' * value, increment the 'TSN.Missing.Report' count on that * chunk if it has NOT been fast retransmitted or marked for * fast retransmit already. */ if (chunk->fast_retransmit == SCTP_CAN_FRTX && !chunk->tsn_gap_acked && TSN_lt(tsn, highest_new_tsn_in_sack)) { /* SFR-CACC may require us to skip marking * this chunk as missing. */ if (!transport || !sctp_cacc_skip(primary, chunk->transport, count_of_newacks, tsn)) { chunk->tsn_missing_report++; pr_debug("%s: tsn:0x%x missing counter:%d\n", __func__, tsn, chunk->tsn_missing_report); } } /* * M4) If any DATA chunk is found to have a * 'TSN.Missing.Report' * value larger than or equal to 3, mark that chunk for * retransmission and start the fast retransmit procedure. */ if (chunk->tsn_missing_report >= 3) { chunk->fast_retransmit = SCTP_NEED_FRTX; do_fast_retransmit = 1; } } if (transport) { if (do_fast_retransmit) sctp_retransmit(q, transport, SCTP_RTXR_FAST_RTX); pr_debug("%s: transport:%p, cwnd:%d, ssthresh:%d, " "flight_size:%d, pba:%d\n", __func__, transport, transport->cwnd, transport->ssthresh, transport->flight_size, transport->partial_bytes_acked); } } /* Is the given TSN acked by this packet? */ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn) { __u32 ctsn = ntohl(sack->cum_tsn_ack); union sctp_sack_variable *frags; __u16 tsn_offset, blocks; int i; if (TSN_lte(tsn, ctsn)) goto pass; /* 3.3.4 Selective Acknowledgment (SACK) (3): * * Gap Ack Blocks: * These fields contain the Gap Ack Blocks. They are repeated * for each Gap Ack Block up to the number of Gap Ack Blocks * defined in the Number of Gap Ack Blocks field. All DATA * chunks with TSNs greater than or equal to (Cumulative TSN * Ack + Gap Ack Block Start) and less than or equal to * (Cumulative TSN Ack + Gap Ack Block End) of each Gap Ack * Block are assumed to have been received correctly. */ frags = (union sctp_sack_variable *)(sack + 1); blocks = ntohs(sack->num_gap_ack_blocks); tsn_offset = tsn - ctsn; for (i = 0; i < blocks; ++i) { if (tsn_offset >= ntohs(frags[i].gab.start) && tsn_offset <= ntohs(frags[i].gab.end)) goto pass; } return 0; pass: return 1; } static inline int sctp_get_skip_pos(struct sctp_fwdtsn_skip *skiplist, int nskips, __be16 stream) { int i; for (i = 0; i < nskips; i++) { if (skiplist[i].stream == stream) return i; } return i; } /* Create and add a fwdtsn chunk to the outq's control queue if needed. 
*/ void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) { struct sctp_association *asoc = q->asoc; struct sctp_chunk *ftsn_chunk = NULL; struct sctp_fwdtsn_skip ftsn_skip_arr[10]; int nskips = 0; int skip_pos = 0; __u32 tsn; struct sctp_chunk *chunk; struct list_head *lchunk, *temp; if (!asoc->peer.prsctp_capable) return; /* PR-SCTP C1) Let SackCumAck be the Cumulative TSN ACK carried in the * received SACK. * * If (Advanced.Peer.Ack.Point < SackCumAck), then update * Advanced.Peer.Ack.Point to be equal to SackCumAck. */ if (TSN_lt(asoc->adv_peer_ack_point, ctsn)) asoc->adv_peer_ack_point = ctsn; /* PR-SCTP C2) Try to further advance the "Advanced.Peer.Ack.Point" * locally, that is, to move "Advanced.Peer.Ack.Point" up as long as * the chunk next in the out-queue space is marked as "abandoned" as * shown in the following example: * * Assuming that a SACK arrived with the Cumulative TSN ACK 102 * and the Advanced.Peer.Ack.Point is updated to this value: * * out-queue at the end of ==> out-queue after Adv.Ack.Point * normal SACK processing local advancement * ... ... * Adv.Ack.Pt-> 102 acked 102 acked * 103 abandoned 103 abandoned * 104 abandoned Adv.Ack.P-> 104 abandoned * 105 105 * 106 acked 106 acked * ... ... * * In this example, the data sender successfully advanced the * "Advanced.Peer.Ack.Point" from 102 to 104 locally. */ list_for_each_safe(lchunk, temp, &q->abandoned) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); tsn = ntohl(chunk->subh.data_hdr->tsn); /* Remove any chunks in the abandoned queue that are acked by * the ctsn. */ if (TSN_lte(tsn, ctsn)) { list_del_init(lchunk); sctp_chunk_free(chunk); } else { if (TSN_lte(tsn, asoc->adv_peer_ack_point+1)) { asoc->adv_peer_ack_point = tsn; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) continue; skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], nskips, chunk->subh.data_hdr->stream); ftsn_skip_arr[skip_pos].stream = chunk->subh.data_hdr->stream; ftsn_skip_arr[skip_pos].ssn = chunk->subh.data_hdr->ssn; if (skip_pos == nskips) nskips++; if (nskips == 10) break; } else break; } } /* PR-SCTP C3) If, after step C1 and C2, the "Advanced.Peer.Ack.Point" * is greater than the Cumulative TSN ACK carried in the received * SACK, the data sender MUST send the data receiver a FORWARD TSN * chunk containing the latest value of the * "Advanced.Peer.Ack.Point". * * C4) For each "abandoned" TSN the sender of the FORWARD TSN SHOULD * list each stream and sequence number in the forwarded TSN. This * information will enable the receiver to easily find any * stranded TSN's waiting on stream reorder queues. Each stream * SHOULD only be reported once; this means that if multiple * abandoned messages occur in the same stream then only the * highest abandoned stream sequence number is reported. If the * total size of the FORWARD TSN does NOT fit in a single MTU then * the sender of the FORWARD TSN SHOULD lower the * Advanced.Peer.Ack.Point to the last TSN that will fit in a * single MTU. */ if (asoc->adv_peer_ack_point > ctsn) ftsn_chunk = sctp_make_fwdtsn(asoc, asoc->adv_peer_ack_point, nskips, &ftsn_skip_arr[0]); if (ftsn_chunk) { list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); } }
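The sctp_acked() routine above reduces SACK processing to one simple test: a TSN is acknowledged if it is at or below the cumulative TSN ack, or if its 16-bit offset from the cumulative ack falls inside one of the Gap Ack Blocks. The standalone sketch below is an editorial illustration only, not part of outqueue.c; tsn_lte(), tsn_acked() and struct gap_ack_block are made-up names that mirror the kernel's TSN_lte() and the gap fields of union sctp_sack_variable, and the TSNs are assumed to already be in host byte order.

/* sack_gab_sketch.c - userspace illustration of the Gap Ack Block test
 * performed by sctp_acked() above.
 */
#include <stdint.h>
#include <stdio.h>

struct gap_ack_block {
	uint16_t start;	/* offset from the cumulative TSN ack */
	uint16_t end;
};

/* Serial-number "less than or equal" on 32-bit TSNs, like TSN_lte(). */
static int tsn_lte(uint32_t a, uint32_t b)
{
	return a == b || (int32_t)(a - b) < 0;
}

/* Return 1 if @tsn is covered by the cumulative ack or any gap ack block. */
static int tsn_acked(uint32_t cum_tsn_ack, const struct gap_ack_block *gab,
		     unsigned int nblocks, uint32_t tsn)
{
	uint16_t off;
	unsigned int i;

	if (tsn_lte(tsn, cum_tsn_ack))
		return 1;

	off = (uint16_t)(tsn - cum_tsn_ack);
	for (i = 0; i < nblocks; i++)
		if (off >= gab[i].start && off <= gab[i].end)
			return 1;
	return 0;
}

int main(void)
{
	/* Cumulative ack 100; blocks cover offsets 3-4 and 7 (TSNs 103-104, 107). */
	struct gap_ack_block gab[] = { { 3, 4 }, { 7, 7 } };
	uint32_t tsns[] = { 99, 100, 101, 103, 105, 107 };
	unsigned int i;

	for (i = 0; i < sizeof(tsns) / sizeof(tsns[0]); i++)
		printf("TSN %u: %s\n", (unsigned int)tsns[i],
		       tsn_acked(100, gab, 2, tsns[i]) ? "acked" : "missing");
	return 0;
}

With a cumulative ack of 100 and blocks covering offsets 3-4 and 7, the sketch reports TSNs 99, 100, 103 and 107 as acked and 101 and 105 as missing, which is the outcome sctp_check_transmitted() acts on when it either moves a chunk to q->sacked or leaves it on the transmitted list for sctp_mark_missing().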
/*
 *
 * Author	Karsten Keil <kkeil@novell.com>
 *
 * Copyright 2008  by Karsten Keil <kkeil@novell.com>
 *
 * This code is free software; you can redistribute it and/or modify
 * it under the terms of the GNU LESSER GENERAL PUBLIC LICENSE
 * version 2.1 as published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU LESSER GENERAL PUBLIC LICENSE for more details.
* */ #ifndef mISDNIF_H #define mISDNIF_H #include <linux/types.h> #include <linux/errno.h> #include <linux/socket.h> /* * ABI Version 32 bit * * <8 bit> Major version * - changed if any interface become backwards incompatible * * <8 bit> Minor version * - changed if any interface is extended but backwards compatible * * <16 bit> Release number * - should be incremented on every checkin */ #define MISDN_MAJOR_VERSION 1 #define MISDN_MINOR_VERSION 1 #define MISDN_RELEASE 29 /* primitives for information exchange * generell format * <16 bit 0 > * <8 bit command> * BIT 8 = 1 LAYER private * BIT 7 = 1 answer * BIT 6 = 1 DATA * <8 bit target layer mask> * * Layer = 00 is reserved for general commands Layer = 01 L2 -> HW Layer = 02 HW -> L2 Layer = 04 L3 -> L2 Layer = 08 L2 -> L3 * Layer = FF is reserved for broadcast commands */ #define MISDN_CMDMASK 0xff00 #define MISDN_LAYERMASK 0x00ff /* generell commands */ #define OPEN_CHANNEL 0x0100 #define CLOSE_CHANNEL 0x0200 #define CONTROL_CHANNEL 0x0300 #define CHECK_DATA 0x0400 /* layer 2 -> layer 1 */ #define PH_ACTIVATE_REQ 0x0101 #define PH_DEACTIVATE_REQ 0x0201 #define PH_DATA_REQ 0x2001 #define MPH_ACTIVATE_REQ 0x0501 #define MPH_DEACTIVATE_REQ 0x0601 #define MPH_INFORMATION_REQ 0x0701 #define PH_CONTROL_REQ 0x0801 /* layer 1 -> layer 2 */ #define PH_ACTIVATE_IND 0x0102 #define PH_ACTIVATE_CNF 0x4102 #define PH_DEACTIVATE_IND 0x0202 #define PH_DEACTIVATE_CNF 0x4202 #define PH_DATA_IND 0x2002 #define PH_DATA_E_IND 0x3002 #define MPH_ACTIVATE_IND 0x0502 #define MPH_DEACTIVATE_IND 0x0602 #define MPH_INFORMATION_IND 0x0702 #define PH_DATA_CNF 0x6002 #define PH_CONTROL_IND 0x0802 #define PH_CONTROL_CNF 0x4802 /* layer 3 -> layer 2 */ #define DL_ESTABLISH_REQ 0x1004 #define DL_RELEASE_REQ 0x1104 #define DL_DATA_REQ 0x3004 #define DL_UNITDATA_REQ 0x3104 #define DL_INFORMATION_REQ 0x0004 /* layer 2 -> layer 3 */ #define DL_ESTABLISH_IND 0x1008 #define DL_ESTABLISH_CNF 0x5008 #define DL_RELEASE_IND 0x1108 #define DL_RELEASE_CNF 0x5108 #define DL_DATA_IND 0x3008 #define DL_UNITDATA_IND 0x3108 #define DL_INFORMATION_IND 0x0008 /* intern layer 2 management */ #define MDL_ASSIGN_REQ 0x1804 #define MDL_ASSIGN_IND 0x1904 #define MDL_REMOVE_REQ 0x1A04 #define MDL_REMOVE_IND 0x1B04 #define MDL_STATUS_UP_IND 0x1C04 #define MDL_STATUS_DOWN_IND 0x1D04 #define MDL_STATUS_UI_IND 0x1E04 #define MDL_ERROR_IND 0x1F04 #define MDL_ERROR_RSP 0x5F04 /* intern layer 2 */ #define DL_TIMER200_IND 0x7004 #define DL_TIMER203_IND 0x7304 #define DL_INTERN_MSG 0x7804 /* DL_INFORMATION_IND types */ #define DL_INFO_L2_CONNECT 0x0001 #define DL_INFO_L2_REMOVED 0x0002 /* PH_CONTROL types */ /* TOUCH TONE IS 0x20XX XX "0"..."9", "A","B","C","D","*","#" */ #define DTMF_TONE_VAL 0x2000 #define DTMF_TONE_MASK 0x007F #define DTMF_TONE_START 0x2100 #define DTMF_TONE_STOP 0x2200 #define DTMF_HFC_COEF 0x4000 #define DSP_CONF_JOIN 0x2403 #define DSP_CONF_SPLIT 0x2404 #define DSP_RECEIVE_OFF 0x2405 #define DSP_RECEIVE_ON 0x2406 #define DSP_ECHO_ON 0x2407 #define DSP_ECHO_OFF 0x2408 #define DSP_MIX_ON 0x2409 #define DSP_MIX_OFF 0x240a #define DSP_DELAY 0x240b #define DSP_JITTER 0x240c #define DSP_TXDATA_ON 0x240d #define DSP_TXDATA_OFF 0x240e #define DSP_TX_DEJITTER 0x240f #define DSP_TX_DEJ_OFF 0x2410 #define DSP_TONE_PATT_ON 0x2411 #define DSP_TONE_PATT_OFF 0x2412 #define DSP_VOL_CHANGE_TX 0x2413 #define DSP_VOL_CHANGE_RX 0x2414 #define DSP_BF_ENABLE_KEY 0x2415 #define DSP_BF_DISABLE 0x2416 #define DSP_BF_ACCEPT 0x2416 #define DSP_BF_REJECT 0x2417 #define DSP_PIPELINE_CFG 0x2418 #define 
HFC_VOL_CHANGE_TX 0x2601 #define HFC_VOL_CHANGE_RX 0x2602 #define HFC_SPL_LOOP_ON 0x2603 #define HFC_SPL_LOOP_OFF 0x2604 /* for T30 FAX and analog modem */ #define HW_MOD_FRM 0x4000 #define HW_MOD_FRH 0x4001 #define HW_MOD_FTM 0x4002 #define HW_MOD_FTH 0x4003 #define HW_MOD_FTS 0x4004 #define HW_MOD_CONNECT 0x4010 #define HW_MOD_OK 0x4011 #define HW_MOD_NOCARR 0x4012 #define HW_MOD_FCERROR 0x4013 #define HW_MOD_READY 0x4014 #define HW_MOD_LASTDATA 0x4015 /* DSP_TONE_PATT_ON parameter */ #define TONE_OFF 0x0000 #define TONE_GERMAN_DIALTONE 0x0001 #define TONE_GERMAN_OLDDIALTONE 0x0002 #define TONE_AMERICAN_DIALTONE 0x0003 #define TONE_GERMAN_DIALPBX 0x0004 #define TONE_GERMAN_OLDDIALPBX 0x0005 #define TONE_AMERICAN_DIALPBX 0x0006 #define TONE_GERMAN_RINGING 0x0007 #define TONE_GERMAN_OLDRINGING 0x0008 #define TONE_AMERICAN_RINGPBX 0x000b #define TONE_GERMAN_RINGPBX 0x000c #define TONE_GERMAN_OLDRINGPBX 0x000d #define TONE_AMERICAN_RINGING 0x000e #define TONE_GERMAN_BUSY 0x000f #define TONE_GERMAN_OLDBUSY 0x0010 #define TONE_AMERICAN_BUSY 0x0011 #define TONE_GERMAN_HANGUP 0x0012 #define TONE_GERMAN_OLDHANGUP 0x0013 #define TONE_AMERICAN_HANGUP 0x0014 #define TONE_SPECIAL_INFO 0x0015 #define TONE_GERMAN_GASSENBESETZT 0x0016 #define TONE_GERMAN_AUFSCHALTTON 0x0016 /* MPH_INFORMATION_IND */ #define L1_SIGNAL_LOS_OFF 0x0010 #define L1_SIGNAL_LOS_ON 0x0011 #define L1_SIGNAL_AIS_OFF 0x0012 #define L1_SIGNAL_AIS_ON 0x0013 #define L1_SIGNAL_RDI_OFF 0x0014 #define L1_SIGNAL_RDI_ON 0x0015 #define L1_SIGNAL_SLIP_RX 0x0020 #define L1_SIGNAL_SLIP_TX 0x0021 /* * protocol ids * D channel 1-31 * B channel 33 - 63 */ #define ISDN_P_NONE 0 #define ISDN_P_BASE 0 #define ISDN_P_TE_S0 0x01 #define ISDN_P_NT_S0 0x02 #define ISDN_P_TE_E1 0x03 #define ISDN_P_NT_E1 0x04 #define ISDN_P_TE_UP0 0x05 #define ISDN_P_NT_UP0 0x06 #define IS_ISDN_P_TE(p) ((p == ISDN_P_TE_S0) || (p == ISDN_P_TE_E1) || \ (p == ISDN_P_TE_UP0) || (p == ISDN_P_LAPD_TE)) #define IS_ISDN_P_NT(p) ((p == ISDN_P_NT_S0) || (p == ISDN_P_NT_E1) || \ (p == ISDN_P_NT_UP0) || (p == ISDN_P_LAPD_NT)) #define IS_ISDN_P_S0(p) ((p == ISDN_P_TE_S0) || (p == ISDN_P_NT_S0)) #define IS_ISDN_P_E1(p) ((p == ISDN_P_TE_E1) || (p == ISDN_P_NT_E1)) #define IS_ISDN_P_UP0(p) ((p == ISDN_P_TE_UP0) || (p == ISDN_P_NT_UP0)) #define ISDN_P_LAPD_TE 0x10 #define ISDN_P_LAPD_NT 0x11 #define ISDN_P_B_MASK 0x1f #define ISDN_P_B_START 0x20 #define ISDN_P_B_RAW 0x21 #define ISDN_P_B_HDLC 0x22 #define ISDN_P_B_X75SLP 0x23 #define ISDN_P_B_L2DTMF 0x24 #define ISDN_P_B_L2DSP 0x25 #define ISDN_P_B_L2DSPHDLC 0x26 #define ISDN_P_B_T30_FAX 0x27 #define ISDN_P_B_MODEM_ASYNC 0x28 #define OPTION_L2_PMX 1 #define OPTION_L2_PTP 2 #define OPTION_L2_FIXEDTEI 3 #define OPTION_L2_CLEANUP 4 #define OPTION_L1_HOLD 5 /* should be in sync with linux/kobject.h:KOBJ_NAME_LEN */ #define MISDN_MAX_IDLEN 20 struct mISDNhead { unsigned int prim; unsigned int id; } __packed; #define MISDN_HEADER_LEN sizeof(struct mISDNhead) #define MAX_DATA_SIZE 2048 #define MAX_DATA_MEM (MAX_DATA_SIZE + MISDN_HEADER_LEN) #define MAX_DFRAME_LEN 260 #define MISDN_ID_ADDR_MASK 0xFFFF #define MISDN_ID_TEI_MASK 0xFF00 #define MISDN_ID_SAPI_MASK 0x00FF #define MISDN_ID_TEI_ANY 0x7F00 #define MISDN_ID_ANY 0xFFFF #define MISDN_ID_NONE 0xFFFE #define GROUP_TEI 127 #define TEI_SAPI 63 #define CTRL_SAPI 0 #define MISDN_MAX_CHANNEL 127 #define MISDN_CHMAP_SIZE ((MISDN_MAX_CHANNEL + 1) >> 3) #define SOL_MISDN 0 struct sockaddr_mISDN { sa_family_t family; unsigned char dev; unsigned char channel; unsigned char sapi; unsigned char tei; }; 
struct mISDNversion { unsigned char major; unsigned char minor; unsigned short release; }; struct mISDN_devinfo { u_int id; u_int Dprotocols; u_int Bprotocols; u_int protocol; u_char channelmap[MISDN_CHMAP_SIZE]; u_int nrbchan; char name[MISDN_MAX_IDLEN]; }; struct mISDN_devrename { u_int id; char name[MISDN_MAX_IDLEN]; /* new name */ }; /* MPH_INFORMATION_REQ payload */ struct ph_info_ch { __u32 protocol; __u64 Flags; }; struct ph_info_dch { struct ph_info_ch ch; __u16 state; __u16 num_bch; }; struct ph_info { struct ph_info_dch dch; struct ph_info_ch bch[]; }; /* timer device ioctl */ #define IMADDTIMER _IOR('I', 64, int) #define IMDELTIMER _IOR('I', 65, int) /* socket ioctls */ #define IMGETVERSION _IOR('I', 66, int) #define IMGETCOUNT _IOR('I', 67, int) #define IMGETDEVINFO _IOR('I', 68, int) #define IMCTRLREQ _IOR('I', 69, int) #define IMCLEAR_L2 _IOR('I', 70, int) #define IMSETDEVNAME _IOR('I', 71, struct mISDN_devrename) #define IMHOLD_L1 _IOR('I', 72, int) static inline int test_channelmap(u_int nr, u_char *map) { if (nr <= MISDN_MAX_CHANNEL) return map[nr >> 3] & (1 << (nr & 7)); else return 0; } static inline void set_channelmap(u_int nr, u_char *map) { map[nr >> 3] |= (1 << (nr & 7)); } static inline void clear_channelmap(u_int nr, u_char *map) { map[nr >> 3] &= ~(1 << (nr & 7)); } /* CONTROL_CHANNEL parameters */ #define MISDN_CTRL_GETOP 0x0000 #define MISDN_CTRL_LOOP 0x0001 #define MISDN_CTRL_CONNECT 0x0002 #define MISDN_CTRL_DISCONNECT 0x0004 #define MISDN_CTRL_RX_BUFFER 0x0008 #define MISDN_CTRL_PCMCONNECT 0x0010 #define MISDN_CTRL_PCMDISCONNECT 0x0020 #define MISDN_CTRL_SETPEER 0x0040 #define MISDN_CTRL_UNSETPEER 0x0080 #define MISDN_CTRL_RX_OFF 0x0100 #define MISDN_CTRL_FILL_EMPTY 0x0200 #define MISDN_CTRL_GETPEER 0x0400 #define MISDN_CTRL_L1_TIMER3 0x0800 #define MISDN_CTRL_HW_FEATURES_OP 0x2000 #define MISDN_CTRL_HW_FEATURES 0x2001 #define MISDN_CTRL_HFC_OP 0x4000 #define MISDN_CTRL_HFC_PCM_CONN 0x4001 #define MISDN_CTRL_HFC_PCM_DISC 0x4002 #define MISDN_CTRL_HFC_CONF_JOIN 0x4003 #define MISDN_CTRL_HFC_CONF_SPLIT 0x4004 #define MISDN_CTRL_HFC_RECEIVE_OFF 0x4005 #define MISDN_CTRL_HFC_RECEIVE_ON 0x4006 #define MISDN_CTRL_HFC_ECHOCAN_ON 0x4007 #define MISDN_CTRL_HFC_ECHOCAN_OFF 0x4008 #define MISDN_CTRL_HFC_WD_INIT 0x4009 #define MISDN_CTRL_HFC_WD_RESET 0x400A /* special RX buffer value for MISDN_CTRL_RX_BUFFER request.p1 is the minimum * buffer size request.p2 the maximum. Using MISDN_CTRL_RX_SIZE_IGNORE will * not change the value, but still read back the actual stetting. 
*/ #define MISDN_CTRL_RX_SIZE_IGNORE -1 /* socket options */ #define MISDN_TIME_STAMP 0x0001 struct mISDN_ctrl_req { int op; int channel; int p1; int p2; }; /* muxer options */ #define MISDN_OPT_ALL 1 #define MISDN_OPT_TEIMGR 2 #ifdef __KERNEL__ #include <linux/list.h> #include <linux/skbuff.h> #include <linux/net.h> #include <net/sock.h> #include <linux/completion.h> #define DEBUG_CORE 0x000000ff #define DEBUG_CORE_FUNC 0x00000002 #define DEBUG_SOCKET 0x00000004 #define DEBUG_MANAGER 0x00000008 #define DEBUG_SEND_ERR 0x00000010 #define DEBUG_MSG_THREAD 0x00000020 #define DEBUG_QUEUE_FUNC 0x00000040 #define DEBUG_L1 0x0000ff00 #define DEBUG_L1_FSM 0x00000200 #define DEBUG_L2 0x00ff0000 #define DEBUG_L2_FSM 0x00020000 #define DEBUG_L2_CTRL 0x00040000 #define DEBUG_L2_RECV 0x00080000 #define DEBUG_L2_TEI 0x00100000 #define DEBUG_L2_TEIFSM 0x00200000 #define DEBUG_TIMER 0x01000000 #define DEBUG_CLOCK 0x02000000 #define mISDN_HEAD_P(s) ((struct mISDNhead *)&s->cb[0]) #define mISDN_HEAD_PRIM(s) (((struct mISDNhead *)&s->cb[0])->prim) #define mISDN_HEAD_ID(s) (((struct mISDNhead *)&s->cb[0])->id) /* socket states */ #define MISDN_OPEN 1 #define MISDN_BOUND 2 #define MISDN_CLOSED 3 struct mISDNchannel; struct mISDNdevice; struct mISDNstack; struct mISDNclock; struct channel_req { u_int protocol; struct sockaddr_mISDN adr; struct mISDNchannel *ch; }; typedef int (ctrl_func_t)(struct mISDNchannel *, u_int, void *); typedef int (send_func_t)(struct mISDNchannel *, struct sk_buff *); typedef int (create_func_t)(struct channel_req *); struct Bprotocol { struct list_head list; char *name; u_int Bprotocols; create_func_t *create; }; struct mISDNchannel { struct list_head list; u_int protocol; u_int nr; u_long opt; u_int addr; struct mISDNstack *st; struct mISDNchannel *peer; send_func_t *send; send_func_t *recv; ctrl_func_t *ctrl; }; struct mISDN_sock_list { struct hlist_head head; rwlock_t lock; }; struct mISDN_sock { struct sock sk; struct mISDNchannel ch; u_int cmask; struct mISDNdevice *dev; }; struct mISDNdevice { struct mISDNchannel D; u_int id; u_int Dprotocols; u_int Bprotocols; u_int nrbchan; u_char channelmap[MISDN_CHMAP_SIZE]; struct list_head bchannels; struct mISDNchannel *teimgr; struct device dev; }; struct mISDNstack { u_long status; struct mISDNdevice *dev; struct task_struct *thread; struct completion *notify; wait_queue_head_t workq; struct sk_buff_head msgq; struct list_head layer2; struct mISDNchannel *layer1; struct mISDNchannel own; struct mutex lmutex; /* protect lists */ struct mISDN_sock_list l1sock; #ifdef MISDN_MSG_STATS u_int msg_cnt; u_int sleep_cnt; u_int stopped_cnt; #endif }; typedef int (clockctl_func_t)(void *, int); struct mISDNclock { struct list_head list; char name[64]; int pri; clockctl_func_t *ctl; void *priv; }; /* global alloc/queue functions */ static inline struct sk_buff * mI_alloc_skb(unsigned int len, gfp_t gfp_mask) { struct sk_buff *skb; skb = alloc_skb(len + MISDN_HEADER_LEN, gfp_mask); if (likely(skb)) skb_reserve(skb, MISDN_HEADER_LEN); return skb; } static inline struct sk_buff * _alloc_mISDN_skb(u_int prim, u_int id, u_int len, void *dp, gfp_t gfp_mask) { struct sk_buff *skb = mI_alloc_skb(len, gfp_mask); struct mISDNhead *hh; if (!skb) return NULL; if (len) skb_put_data(skb, dp, len); hh = mISDN_HEAD_P(skb); hh->prim = prim; hh->id = id; return skb; } static inline void _queue_data(struct mISDNchannel *ch, u_int prim, u_int id, u_int len, void *dp, gfp_t gfp_mask) { struct sk_buff *skb; if (!ch->peer) return; skb = _alloc_mISDN_skb(prim, id, len, 
dp, gfp_mask); if (!skb) return; if (ch->recv(ch->peer, skb)) dev_kfree_skb(skb); } /* global register/unregister functions */ extern int mISDN_register_device(struct mISDNdevice *, struct device *parent, char *name); extern void mISDN_unregister_device(struct mISDNdevice *); extern int mISDN_register_Bprotocol(struct Bprotocol *); extern void mISDN_unregister_Bprotocol(struct Bprotocol *); extern struct mISDNclock *mISDN_register_clock(char *, int, clockctl_func_t *, void *); extern void mISDN_unregister_clock(struct mISDNclock *); static inline struct mISDNdevice *dev_to_mISDN(const struct device *dev) { if (dev) return dev_get_drvdata(dev); else return NULL; } extern void set_channel_address(struct mISDNchannel *, u_int, u_int); extern void mISDN_clock_update(struct mISDNclock *, int, ktime_t *); extern unsigned short mISDN_clock_get(void); extern const char *mISDNDevName4ch(struct mISDNchannel *); #endif /* __KERNEL__ */ #endif /* mISDNIF_H */
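The header above encodes every primitive as <16 bit 0><8 bit command><8 bit target layer mask> and tracks B-channels in a small bitmap via the channelmap helpers. The following stand-alone C sketch (not kernel code) copies those constants and helpers to show how PH_DATA_REQ splits into command and layer mask and how the bitmap is manipulated; the main() harness is illustrative only.

/*
 * Minimal user-space sketch: constants and channelmap logic are copied from
 * the mISDN header above, everything else is an illustrative harness.
 */
#include <stdio.h>

#define MISDN_CMDMASK		0xff00
#define MISDN_LAYERMASK		0x00ff
#define PH_DATA_REQ		0x2001
#define MISDN_MAX_CHANNEL	127
#define MISDN_CHMAP_SIZE	((MISDN_MAX_CHANNEL + 1) >> 3)

typedef unsigned int u_int;
typedef unsigned char u_char;

/* same logic as the inline helpers in the header */
static int test_channelmap(u_int nr, u_char *map)
{
	if (nr <= MISDN_MAX_CHANNEL)
		return map[nr >> 3] & (1 << (nr & 7));
	return 0;
}

static void set_channelmap(u_int nr, u_char *map)
{
	map[nr >> 3] |= (1 << (nr & 7));
}

int main(void)
{
	u_char map[MISDN_CHMAP_SIZE] = { 0 };

	/* PH_DATA_REQ: command 0x2000 (DATA bit set), layer mask 0x01 (L2 -> HW) */
	printf("command 0x%04x, layers 0x%02x\n",
	       PH_DATA_REQ & MISDN_CMDMASK, PH_DATA_REQ & MISDN_LAYERMASK);

	set_channelmap(17, map);	/* mark B-channel 17 as present */
	printf("channel 17 present: %d\n", !!test_channelmap(17, map));
	printf("channel 18 present: %d\n", !!test_channelmap(18, map));
	return 0;
}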
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/ipv6.h>
#include <net/dsfield.h>
#include <net/xfrm.h>

#ifndef XFRM_INOUT_H
#define XFRM_INOUT_H 1

static inline void xfrm4_extract_header(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph);
	XFRM_MODE_SKB_CB(skb)->id = iph->id;
	XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off;
	XFRM_MODE_SKB_CB(skb)->tos = iph->tos;
	XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl;
	XFRM_MODE_SKB_CB(skb)->optlen = iph->ihl * 4 - sizeof(*iph);
	memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0,
	       sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl));
}

static inline void xfrm6_extract_header(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_IPV6)
	struct ipv6hdr *iph = ipv6_hdr(skb);

	XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph);
	XFRM_MODE_SKB_CB(skb)->id = 0;
	XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF);
	XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph);
	XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit;
	XFRM_MODE_SKB_CB(skb)->optlen = 0;
	memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl,
	       sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl));
#else
	WARN_ON_ONCE(1);
#endif
}

static inline void xfrm6_beet_make_header(struct sk_buff *skb)
{
	struct ipv6hdr *iph = ipv6_hdr(skb);

	iph->version = 6;

	memcpy(iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl,
	       sizeof(iph->flow_lbl));
	iph->nexthdr = XFRM_MODE_SKB_CB(skb)->protocol;

	ipv6_change_dsfield(iph, 0, XFRM_MODE_SKB_CB(skb)->tos);
	iph->hop_limit = XFRM_MODE_SKB_CB(skb)->ttl;
}

static inline void xfrm4_beet_make_header(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->ihl = 5;
	iph->version = 4;

	iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
	iph->tos = XFRM_MODE_SKB_CB(skb)->tos;

	iph->id = XFRM_MODE_SKB_CB(skb)->id;
	iph->frag_off = XFRM_MODE_SKB_CB(skb)->frag_off;
	iph->ttl = XFRM_MODE_SKB_CB(skb)->ttl;
}

#endif
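These four inline helpers stash selected IP header fields in the per-packet control block on input and rebuild a minimal header for BEET mode on output. Below is a user-space sketch of that round-trip under the assumption that plain structs can stand in for XFRM_MODE_SKB_CB(skb) and a reduced iphdr; fake_iphdr, mode_cb and the harness are illustrative, not the kernel API.

/*
 * User-space sketch of the extract/rebuild idea above, illustration only.
 */
#include <stdint.h>
#include <stdio.h>

struct fake_iphdr {		/* only the fields the helpers touch */
	uint8_t  ihl, version, tos, ttl, protocol;
	uint16_t id, frag_off;
};

struct mode_cb {		/* stand-in for XFRM_MODE_SKB_CB(skb) */
	uint8_t  tos, ttl, protocol;
	uint16_t id, frag_off, optlen;
};

static void extract_header(struct mode_cb *cb, const struct fake_iphdr *iph)
{
	cb->id       = iph->id;
	cb->frag_off = iph->frag_off;
	cb->tos      = iph->tos;
	cb->ttl      = iph->ttl;
	cb->optlen   = iph->ihl * 4 - 20;	/* bytes of IPv4 options */
}

static void beet_make_header(struct fake_iphdr *iph, const struct mode_cb *cb)
{
	iph->ihl      = 5;			/* fresh header, no options */
	iph->version  = 4;
	iph->protocol = cb->protocol;
	iph->tos      = cb->tos;
	iph->id       = cb->id;
	iph->frag_off = cb->frag_off;
	iph->ttl      = cb->ttl;
}

int main(void)
{
	struct fake_iphdr in = { .ihl = 6, .version = 4, .tos = 0x10,
				 .ttl = 64, .id = 0x1234, .frag_off = 0x4000 };
	struct fake_iphdr out = { 0 };
	struct mode_cb cb = { .protocol = 50 /* ESP, chosen for the example */ };

	extract_header(&cb, &in);
	beet_make_header(&out, &cb);
	printf("optlen=%u tos=0x%02x ttl=%u id=0x%04x\n",
	       (unsigned)cb.optlen, (unsigned)out.tos,
	       (unsigned)out.ttl, (unsigned)out.id);
	return 0;
}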
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _LINUX_FILE_REF_H
#define _LINUX_FILE_REF_H

#include <linux/atomic.h>
#include <linux/preempt.h>
#include <linux/types.h>

/*
 * file_ref is a reference count implementation specifically for use by
 * files. It takes inspiration from rcuref but differs in key aspects
 * such as support for SLAB_TYPESAFE_BY_RCU type caches.
 *
 * FILE_REF_ONEREF                                      FILE_REF_MAXREF
 * 0x0000000000000000UL                            0x7FFFFFFFFFFFFFFFUL
 * <------------------------- valid ------------------------->
 *
 *                      FILE_REF_SATURATED
 * 0x8000000000000000UL 0xA000000000000000UL 0xBFFFFFFFFFFFFFFFUL
 * <----------------------- saturation zone ---------------------->
 *
 * FILE_REF_RELEASED                                      FILE_REF_DEAD
 * 0xC000000000000000UL                            0xE000000000000000UL
 * <------------------------ dead zone ------------------------>
 *
 * FILE_REF_NOREF
 * 0xFFFFFFFFFFFFFFFFUL
 */
#ifdef CONFIG_64BIT
#define FILE_REF_ONEREF		0x0000000000000000UL
#define FILE_REF_MAXREF		0x7FFFFFFFFFFFFFFFUL
#define FILE_REF_SATURATED	0xA000000000000000UL
#define FILE_REF_RELEASED	0xC000000000000000UL
#define FILE_REF_DEAD		0xE000000000000000UL
#define FILE_REF_NOREF		0xFFFFFFFFFFFFFFFFUL
#else
#define FILE_REF_ONEREF		0x00000000U
#define FILE_REF_MAXREF		0x7FFFFFFFU
#define FILE_REF_SATURATED	0xA0000000U
#define FILE_REF_RELEASED	0xC0000000U
#define FILE_REF_DEAD		0xE0000000U
#define FILE_REF_NOREF		0xFFFFFFFFU
#endif

typedef struct {
#ifdef CONFIG_64BIT
	atomic64_t refcnt;
#else
	atomic_t refcnt;
#endif
} file_ref_t;

/**
 * file_ref_init - Initialize a file reference count
 * @ref: Pointer to the reference count
 * @cnt: The initial reference count, typically '1'
 */
static inline void file_ref_init(file_ref_t *ref, unsigned long cnt)
{
	atomic_long_set(&ref->refcnt, cnt - 1);
}

bool __file_ref_put(file_ref_t *ref, unsigned long cnt);

/**
 * file_ref_get - Acquire one reference on a file
 * @ref: Pointer to the reference count
 *
 * Similar to atomic_inc_not_zero() but saturates at FILE_REF_MAXREF.
 *
 * Provides full memory ordering.
 *
 * Return: False if the attempt to acquire a reference failed. This happens
 *         when the last reference has been put already. True if a reference
 *         was successfully acquired.
 */
static __always_inline __must_check bool file_ref_get(file_ref_t *ref)
{
	/*
	 * Unconditionally increase the reference count with full
	 * ordering. The saturation and dead zones provide enough
	 * tolerance for this.
	 *
	 * If this indicates negative, the file in question can already
	 * have been freed and immediately reused due to
	 * SLAB_TYPESAFE_BY_RCU. Hence, unconditionally altering the
	 * file reference count to, e.g., reset it back to the middle of
	 * the dead zone risks marking someone else's file as dead
	 * behind their back.
* * It would be possible to do a careful: * * cnt = atomic_long_inc_return(); * if (likely(cnt >= 0)) * return true; * * and then something like: * * if (cnt >= FILE_REF_RELEASE) * atomic_long_try_cmpxchg(&ref->refcnt, &cnt, FILE_REF_DEAD), * * to set the value back to the middle of the deadzone. But it's * practically impossible to go from FILE_REF_DEAD to * FILE_REF_ONEREF. It would need 2305843009213693952/2^61 * file_ref_get()s to resurrect such a dead file. */ return !atomic_long_add_negative(1, &ref->refcnt); } /** * file_ref_inc - Acquire one reference on a file * @ref: Pointer to the reference count * * Acquire an additional reference on a file. Warns if the caller didn't * already hold a reference. */ static __always_inline void file_ref_inc(file_ref_t *ref) { long prior = atomic_long_fetch_inc_relaxed(&ref->refcnt); WARN_ONCE(prior < 0, "file_ref_inc() on a released file reference"); } /** * file_ref_put -- Release a file reference * @ref: Pointer to the reference count * * Provides release memory ordering, such that prior loads and stores * are done before, and provides an acquire ordering on success such * that free() must come after. * * Return: True if this was the last reference with no future references * possible. This signals the caller that it can safely release * the object which is protected by the reference counter. * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * release the protected object. */ static __always_inline __must_check bool file_ref_put(file_ref_t *ref) { long cnt; /* * While files are SLAB_TYPESAFE_BY_RCU and thus file_ref_put() * calls don't risk UAFs when a file is recyclyed, it is still * vulnerable to UAFs caused by freeing the whole slab page once * it becomes unused. Prevent file_ref_put() from being * preempted protects against this. */ guard(preempt)(); /* * Unconditionally decrease the reference count. The saturation * and dead zones provide enough tolerance for this. If this * fails then we need to handle the last reference drop and * cases inside the saturation and dead zones. */ cnt = atomic_long_dec_return(&ref->refcnt); if (cnt >= 0) return false; return __file_ref_put(ref, cnt); } /** * file_ref_put_close - drop a reference expecting it would transition to FILE_REF_NOREF * @ref: Pointer to the reference count * * Semantically it is equivalent to calling file_ref_put(), but it trades lower * performance in face of other CPUs also modifying the refcount for higher * performance when this happens to be the last reference. * * For the last reference file_ref_put() issues 2 atomics. One to drop the * reference and another to transition it to FILE_REF_DEAD. This routine does * the work in one step, but in order to do it has to pre-read the variable which * decreases scalability. * * Use with close() et al, stick to file_ref_put() by default. */ static __always_inline __must_check bool file_ref_put_close(file_ref_t *ref) { long old; old = atomic_long_read(&ref->refcnt); if (likely(old == FILE_REF_ONEREF)) { if (likely(atomic_long_try_cmpxchg(&ref->refcnt, &old, FILE_REF_DEAD))) return true; } return file_ref_put(ref); } /** * file_ref_read - Read the number of file references * @ref: Pointer to the reference count * * Return: The number of held references (0 ... N) */ static inline unsigned long file_ref_read(file_ref_t *ref) { unsigned long c = atomic_long_read(&ref->refcnt); /* Return 0 if within the DEAD zone. */ return c >= FILE_REF_RELEASED ? 
0 : c + 1; } /* * __file_ref_read_raw - Return the value stored in ref->refcnt * @ref: Pointer to the reference count * * Return: The raw value found in the counter * * A hack for file_needs_f_pos_lock(), you probably want to use * file_ref_read() instead. */ static inline unsigned long __file_ref_read_raw(file_ref_t *ref) { return atomic_long_read(&ref->refcnt); } #endif
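The zone comment above and the helpers that follow define the counting convention: the stored value is the reference count minus one, a negative counter marks the dead zone, and reads at or past FILE_REF_RELEASED report zero. The user-space model below (C11 atomics, 64-bit constants copied from the header) illustrates that arithmetic; it deliberately omits the saturation handling and slab-reuse concerns of the real __file_ref_put(), so it is a sketch rather than the kernel implementation.

/*
 * User-space model of the file_ref counting scheme, illustration only.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define FILE_REF_RELEASED	0xC000000000000000UL
#define FILE_REF_DEAD		0xE000000000000000UL

typedef struct { atomic_long refcnt; } ref_t;

static void ref_init(ref_t *r, unsigned long cnt)
{
	atomic_store(&r->refcnt, (long)(cnt - 1));	/* one ref stores 0 */
}

static bool ref_get(ref_t *r)
{
	/* a negative result means the counter was already in the dead zone */
	return atomic_fetch_add(&r->refcnt, 1) + 1 >= 0;
}

static bool ref_put(ref_t *r)
{
	long cnt = atomic_fetch_sub(&r->refcnt, 1) - 1;

	if (cnt >= 0)
		return false;			/* references remain */
	/* simplified: the real code funnels this into __file_ref_put() */
	atomic_store(&r->refcnt, (long)FILE_REF_DEAD);
	return true;				/* last reference dropped */
}

static unsigned long ref_read(ref_t *r)
{
	unsigned long c = atomic_load(&r->refcnt);

	return c >= FILE_REF_RELEASED ? 0 : c + 1;
}

int main(void)
{
	ref_t r;

	ref_init(&r, 1);
	printf("after init: %lu\n", ref_read(&r));	/* 1 */
	ref_get(&r);
	printf("after get:  %lu\n", ref_read(&r));	/* 2 */
	ref_put(&r);
	printf("last put?   %d\n", ref_put(&r));	/* 1 (true) */
	printf("after dead: %lu\n", ref_read(&r));	/* 0 */
	return 0;
}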
/*
SPDX-License-Identifier: GPL-2.0 */ #ifndef _SCSI_SCSI_HOST_H #define _SCSI_SCSI_HOST_H #include <linux/device.h> #include <linux/list.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/mutex.h> #include <linux/seq_file.h> #include <linux/blk-mq.h> #include <scsi/scsi.h> struct block_device; struct completion; struct module; struct scsi_cmnd; struct scsi_device; struct scsi_target; struct Scsi_Host; struct scsi_transport_template; #define SG_ALL SG_CHUNK_SIZE #define MODE_UNKNOWN 0x00 #define MODE_INITIATOR 0x01 #define MODE_TARGET 0x02 /** * enum scsi_timeout_action - How to handle a command that timed out. * @SCSI_EH_DONE: The command has already been completed. * @SCSI_EH_RESET_TIMER: Reset the timer and continue waiting for completion. * @SCSI_EH_NOT_HANDLED: The command has not yet finished. Abort the command. */ enum scsi_timeout_action { SCSI_EH_DONE, SCSI_EH_RESET_TIMER, SCSI_EH_NOT_HANDLED, }; struct scsi_host_template { /* * Put fields referenced in IO submission path together in * same cacheline */ /* * Additional per-command data allocated for the driver. */ unsigned int cmd_size; /* * The queuecommand function is used to queue up a scsi * command block to the LLDD. When the driver finished * processing the command the done callback is invoked. * * If queuecommand returns 0, then the driver has accepted the * command. It must also push it to the HBA if the scsi_cmnd * flag SCMD_LAST is set, or if the driver does not implement * commit_rqs. The done() function must be called on the command * when the driver has finished with it. (you may call done on the * command before queuecommand returns, but in this case you * *must* return 0 from queuecommand). * * Queuecommand may also reject the command, in which case it may * not touch the command and must not call done() for it. * * There are two possible rejection returns: * * SCSI_MLQUEUE_DEVICE_BUSY: Block this device temporarily, but * allow commands to other devices serviced by this host. * * SCSI_MLQUEUE_HOST_BUSY: Block all devices served by this * host temporarily. * * For compatibility, any other non-zero return is treated the * same as SCSI_MLQUEUE_HOST_BUSY. * * NOTE: "temporarily" means either until the next command for# * this device/host completes, or a period of time determined by * I/O pressure in the system if there are no other outstanding * commands. * * STATUS: REQUIRED */ int (* queuecommand)(struct Scsi_Host *, struct scsi_cmnd *); /* * The commit_rqs function is used to trigger a hardware * doorbell after some requests have been queued with * queuecommand, when an error is encountered before sending * the request with SCMD_LAST set. * * STATUS: OPTIONAL */ void (*commit_rqs)(struct Scsi_Host *, u16); struct module *module; const char *name; /* * The info function will return whatever useful information the * developer sees fit. If not provided, then the name field will * be used instead. * * Status: OPTIONAL */ const char *(*info)(struct Scsi_Host *); /* * Ioctl interface * * Status: OPTIONAL */ int (*ioctl)(struct scsi_device *dev, unsigned int cmd, void __user *arg); #ifdef CONFIG_COMPAT /* * Compat handler. Handle 32bit ABI. * When unknown ioctl is passed return -ENOIOCTLCMD. * * Status: OPTIONAL */ int (*compat_ioctl)(struct scsi_device *dev, unsigned int cmd, void __user *arg); #endif int (*init_cmd_priv)(struct Scsi_Host *shost, struct scsi_cmnd *cmd); int (*exit_cmd_priv)(struct Scsi_Host *shost, struct scsi_cmnd *cmd); /* * This is an error handling strategy routine. 
You don't need to * define one of these if you don't want to - there is a default * routine that is present that should work in most cases. For those * driver authors that have the inclination and ability to write their * own strategy routine, this is where it is specified. Note - the * strategy routine is *ALWAYS* run in the context of the kernel eh * thread. Thus you are guaranteed to *NOT* be in an interrupt * handler when you execute this, and you are also guaranteed to * *NOT* have any other commands being queued while you are in the * strategy routine. When you return from this function, operations * return to normal. * * See scsi_error.c scsi_unjam_host for additional comments about * what this function should and should not be attempting to do. * * Status: REQUIRED (at least one of them) */ int (* eh_abort_handler)(struct scsi_cmnd *); int (* eh_device_reset_handler)(struct scsi_cmnd *); int (* eh_target_reset_handler)(struct scsi_cmnd *); int (* eh_bus_reset_handler)(struct scsi_cmnd *); int (* eh_host_reset_handler)(struct scsi_cmnd *); /* * Before the mid layer attempts to scan for a new device where none * currently exists, it will call this entry in your driver. Should * your driver need to allocate any structs or perform any other init * items in order to send commands to a currently unused target/lun * combo, then this is where you can perform those allocations. This * is specifically so that drivers won't have to perform any kind of * "is this a new device" checks in their queuecommand routine, * thereby making the hot path a bit quicker. * * Return values: 0 on success, non-0 on failure * * Deallocation: If we didn't find any devices at this ID, you will * get an immediate call to sdev_destroy(). If we find something * here then you will get a call to sdev_configure(), then the * device will be used for however long it is kept around, then when * the device is removed from the system (or * possibly at reboot * time), you will then get a call to sdev_destroy(). This is * assuming you implement sdev_configure and sdev_destroy. * However, if you allocate memory and hang it off the device struct, * then you must implement the sdev_destroy() routine at a minimum * in order to avoid leaking memory * each time a device is tore down. * * Status: OPTIONAL */ int (* sdev_init)(struct scsi_device *); /* * Once the device has responded to an INQUIRY and we know the * device is online, we call into the low level driver with the * struct scsi_device *. If the low level device driver implements * this function, it *must* perform the task of setting the queue * depth on the device. All other tasks are optional and depend * on what the driver supports and various implementation details. * * Things currently recommended to be handled at this time include: * * 1. Setting the device queue depth. Proper setting of this is * described in the comments for scsi_change_queue_depth. * 2. Determining if the device supports the various synchronous * negotiation protocols. The device struct will already have * responded to INQUIRY and the results of the standard items * will have been shoved into the various device flag bits, eg. * device->sdtr will be true if the device supports SDTR messages. * 3. Allocating command structs that the device will need. * 4. Setting the default timeout on this device (if needed). * 5. Anything else the low level driver might want to do on a device * specific setup basis... * 6. Return 0 on success, non-0 on error. 
The device will be marked * as offline on error so that no access will occur. If you return * non-0, your sdev_destroy routine will never get called for this * device, so don't leave any loose memory hanging around, clean * up after yourself before returning non-0 * * Status: OPTIONAL */ int (* sdev_configure)(struct scsi_device *, struct queue_limits *lim); /* * Immediately prior to deallocating the device and after all activity * has ceased the mid layer calls this point so that the low level * driver may completely detach itself from the scsi device and vice * versa. The low level driver is responsible for freeing any memory * it allocated in the sdev_init or sdev_configure calls. * * Status: OPTIONAL */ void (* sdev_destroy)(struct scsi_device *); /* * Before the mid layer attempts to scan for a new device attached * to a target where no target currently exists, it will call this * entry in your driver. Should your driver need to allocate any * structs or perform any other init items in order to send commands * to a currently unused target, then this is where you can perform * those allocations. * * Return values: 0 on success, non-0 on failure * * Status: OPTIONAL */ int (* target_alloc)(struct scsi_target *); /* * Immediately prior to deallocating the target structure, and * after all activity to attached scsi devices has ceased, the * midlayer calls this point so that the driver may deallocate * and terminate any references to the target. * * Note: This callback is called with the host lock held and hence * must not sleep. * * Status: OPTIONAL */ void (* target_destroy)(struct scsi_target *); /* * If a host has the ability to discover targets on its own instead * of scanning the entire bus, it can fill in this function and * call scsi_scan_host(). This function will be called periodically * until it returns 1 with the scsi_host and the elapsed time of * the scan in jiffies. * * Status: OPTIONAL */ int (* scan_finished)(struct Scsi_Host *, unsigned long); /* * If the host wants to be called before the scan starts, but * after the midlayer has set up ready for the scan, it can fill * in this function. * * Status: OPTIONAL */ void (* scan_start)(struct Scsi_Host *); /* * Fill in this function to allow the queue depth of this host * to be changeable (on a per device basis). Returns either * the current queue depth setting (may be different from what * was passed in) or an error. An error should only be * returned if the requested depth is legal but the driver was * unable to set it. If the requested depth is illegal, the * driver should set and return the closest legal queue depth. * * Status: OPTIONAL */ int (* change_queue_depth)(struct scsi_device *, int); /* * This functions lets the driver expose the queue mapping * to the block layer. * * Status: OPTIONAL */ void (* map_queues)(struct Scsi_Host *shost); /* * SCSI interface of blk_poll - poll for IO completions. * Only applicable if SCSI LLD exposes multiple h/w queues. * * Return value: Number of completed entries found. * * Status: OPTIONAL */ int (* mq_poll)(struct Scsi_Host *shost, unsigned int queue_num); /* * Check if scatterlists need to be padded for DMA draining. * * Status: OPTIONAL */ bool (* dma_need_drain)(struct request *rq); /* * This function determines the BIOS parameters for a given * harddisk. These tend to be numbers that are made up by * the host adapter. 
Parameters: * size, device, list (heads, sectors, cylinders) * * Status: OPTIONAL */ int (* bios_param)(struct scsi_device *, struct gendisk *, sector_t, int []); /* * This function is called when one or more partitions on the * device reach beyond the end of the device. * * Status: OPTIONAL */ void (*unlock_native_capacity)(struct scsi_device *); /* * Can be used to export driver statistics and other infos to the * world outside the kernel ie. userspace and it also provides an * interface to feed the driver with information. * * Status: OBSOLETE */ int (*show_info)(struct seq_file *, struct Scsi_Host *); int (*write_info)(struct Scsi_Host *, char *, int); /* * This is an optional routine that allows the transport to become * involved when a scsi io timer fires. The return value tells the * timer routine how to finish the io timeout handling. * * Status: OPTIONAL */ enum scsi_timeout_action (*eh_timed_out)(struct scsi_cmnd *); /* * Optional routine that allows the transport to decide if a cmd * is retryable. Return true if the transport is in a state the * cmd should be retried on. */ bool (*eh_should_retry_cmd)(struct scsi_cmnd *scmd); /* This is an optional routine that allows transport to initiate * LLD adapter or firmware reset using sysfs attribute. * * Return values: 0 on success, -ve value on failure. * * Status: OPTIONAL */ int (*host_reset)(struct Scsi_Host *shost, int reset_type); #define SCSI_ADAPTER_RESET 1 #define SCSI_FIRMWARE_RESET 2 /* * Name of proc directory */ const char *proc_name; /* * This determines if we will use a non-interrupt driven * or an interrupt driven scheme. It is set to the maximum number * of simultaneous commands a single hw queue in HBA will accept. */ int can_queue; /* * In many instances, especially where disconnect / reconnect are * supported, our host also has an ID on the SCSI bus. If this is * the case, then it must be reserved. Please set this_id to -1 if * your setup is in single initiator mode, and the host lacks an * ID. */ int this_id; /* * This determines the degree to which the host adapter is capable * of scatter-gather. */ unsigned short sg_tablesize; unsigned short sg_prot_tablesize; /* * Set this if the host adapter has limitations beside segment count. */ unsigned int max_sectors; /* * Maximum size in bytes of a single segment. */ unsigned int max_segment_size; unsigned int dma_alignment; /* * DMA scatter gather segment boundary limit. A segment crossing this * boundary will be split in two. */ unsigned long dma_boundary; unsigned long virt_boundary_mask; /* * This specifies "machine infinity" for host templates which don't * limit the transfer size. Note this limit represents an absolute * maximum, and may be over the transfer limits allowed for * individual devices (e.g. 256 for SCSI-1). */ #define SCSI_DEFAULT_MAX_SECTORS 1024 /* * True if this host adapter can make good use of linked commands. * This will allow more than one command to be queued to a given * unit on a given host. Set this to the maximum number of command * blocks to be provided for each device. Set this to 1 for one * command block per lun, 2 for two, etc. Do not set this to 0. * You should make sure that the host adapter will do the right thing * before you try setting this above 1. */ short cmd_per_lun; /* * Allocate tags starting from last allocated tag. */ bool tag_alloc_policy_rr : 1; /* * Track QUEUE_FULL events and reduce queue depth on demand. */ unsigned track_queue_depth:1; /* * This specifies the mode that a LLD supports. 
*/ unsigned supported_mode:2; /* * True for emulated SCSI host adapters (e.g. ATAPI). */ unsigned emulated:1; /* * True if the low-level driver performs its own reset-settle delays. */ unsigned skip_settle_delay:1; /* True if the controller does not support WRITE SAME */ unsigned no_write_same:1; /* True if the host uses host-wide tagspace */ unsigned host_tagset:1; /* The queuecommand callback may block. See also BLK_MQ_F_BLOCKING. */ unsigned queuecommand_may_block:1; /* * Countdown for host blocking with no commands outstanding. */ unsigned int max_host_blocked; /* * Default value for the blocking. If the queue is empty, * host_blocked counts down in the request_fn until it restarts * host operations as zero is reached. * * FIXME: This should probably be a value in the template */ #define SCSI_DEFAULT_HOST_BLOCKED 7 /* * Pointer to the SCSI host sysfs attribute groups, NULL terminated. */ const struct attribute_group **shost_groups; /* * Pointer to the SCSI device attribute groups for this host, * NULL terminated. */ const struct attribute_group **sdev_groups; /* * Vendor Identifier associated with the host * * Note: When specifying vendor_id, be sure to read the * Vendor Type and ID formatting requirements specified in * scsi_netlink.h */ u64 vendor_id; }; /* * Temporary #define for host lock push down. Can be removed when all * drivers have been updated to take advantage of unlocked * queuecommand. * */ #define DEF_SCSI_QCMD(func_name) \ int func_name(struct Scsi_Host *shost, struct scsi_cmnd *cmd) \ { \ unsigned long irq_flags; \ int rc; \ spin_lock_irqsave(shost->host_lock, irq_flags); \ rc = func_name##_lck(cmd); \ spin_unlock_irqrestore(shost->host_lock, irq_flags); \ return rc; \ } /* * shost state: If you alter this, you also need to alter scsi_sysfs.c * (for the ascii descriptions) and the state model enforcer: * scsi_host_set_state() */ enum scsi_host_state { SHOST_CREATED = 1, SHOST_RUNNING, SHOST_CANCEL, SHOST_DEL, SHOST_RECOVERY, SHOST_CANCEL_RECOVERY, SHOST_DEL_RECOVERY, }; struct Scsi_Host { /* * __devices is protected by the host_lock, but you should * usually use scsi_device_lookup / shost_for_each_device * to access it and don't care about locking yourself. * In the rare case of being in irq context you can use * their __ prefixed variants with the lock held. NEVER * access this list directly from a driver. */ struct list_head __devices; struct list_head __targets; struct list_head starved_list; spinlock_t default_lock; spinlock_t *host_lock; struct mutex scan_mutex;/* serialize scanning activity */ struct list_head eh_abort_list; struct list_head eh_cmd_q; struct task_struct * ehandler; /* Error recovery thread. */ struct completion * eh_action; /* Wait for specific actions on the host. */ wait_queue_head_t host_wait; const struct scsi_host_template *hostt; struct scsi_transport_template *transportt; struct kref tagset_refcnt; struct completion tagset_freed; /* Area to keep a shared tag map */ struct blk_mq_tag_set tag_set; atomic_t host_blocked; unsigned int host_failed; /* commands that failed. protected by host_lock */ unsigned int host_eh_scheduled; /* EH scheduled without command */ unsigned int host_no; /* Used for IOCTL_GET_IDLUN, /proc/scsi et al. */ /* next two fields are used to bound the time spent in error handling */ int eh_deadline; unsigned long last_reset; /* * These three parameters can be used to allow for wide scsi, * and for host adapters that support multiple busses * The last two should be set to 1 more than the actual max id * or lun (e.g. 
8 for SCSI parallel systems). */ unsigned int max_channel; unsigned int max_id; u64 max_lun; /* * This is a unique identifier that must be assigned so that we * have some way of identifying each detected host adapter properly * and uniquely. For hosts that do not support more than one card * in the system at one time, this does not need to be set. It is * initialized to 0 in scsi_host_alloc. */ unsigned int unique_id; /* * The maximum length of SCSI commands that this host can accept. * Probably 12 for most host adapters, but could be 16 for others. * or 260 if the driver supports variable length cdbs. * For drivers that don't set this field, a value of 12 is * assumed. */ unsigned short max_cmd_len; int this_id; int can_queue; short cmd_per_lun; short unsigned int sg_tablesize; short unsigned int sg_prot_tablesize; unsigned int max_sectors; unsigned int opt_sectors; unsigned int max_segment_size; unsigned int dma_alignment; unsigned long dma_boundary; unsigned long virt_boundary_mask; /* * In scsi-mq mode, the number of hardware queues supported by the LLD. * * Note: it is assumed that each hardware queue has a queue depth of * can_queue. In other words, the total queue depth per host * is nr_hw_queues * can_queue. However, for when host_tagset is set, * the total queue depth is can_queue. */ unsigned nr_hw_queues; unsigned nr_maps; unsigned active_mode:2; /* * Host has requested that no further requests come through for the * time being. */ unsigned host_self_blocked:1; /* * Host uses correct SCSI ordering not PC ordering. The bit is * set for the minority of drivers whose authors actually read * the spec ;). */ unsigned reverse_ordering:1; /* Task mgmt function in progress */ unsigned tmf_in_progress:1; /* Asynchronous scan in progress */ unsigned async_scan:1; /* Don't resume host in EH */ unsigned eh_noresume:1; /* The controller does not support WRITE SAME */ unsigned no_write_same:1; /* True if the host uses host-wide tagspace */ unsigned host_tagset:1; /* The queuecommand callback may block. See also BLK_MQ_F_BLOCKING. */ unsigned queuecommand_may_block:1; /* Host responded with short (<36 bytes) INQUIRY result */ unsigned short_inquiry:1; /* The transport requires the LUN bits NOT to be stored in CDB[1] */ unsigned no_scsi2_lun_in_cdb:1; /* * Optional work queue to be utilized by the transport */ struct workqueue_struct *work_q; /* * Task management function work queue */ struct workqueue_struct *tmf_work_q; /* * Value host_blocked counts down from */ unsigned int max_host_blocked; /* Protection Information */ unsigned int prot_capabilities; unsigned char prot_guard_type; /* legacy crap */ unsigned long base; unsigned long io_port; unsigned char n_io_port; unsigned char dma_channel; unsigned int irq; enum scsi_host_state shost_state; /* ldm bits */ struct device shost_gendev, shost_dev; /* * Points to the transport data (if any) which is allocated * separately */ void *shost_data; /* * Points to the physical bus device we'd use to do DMA * Needed just in case we have virtual hosts. */ struct device *dma_dev; /* Delay for runtime autosuspend */ int rpm_autosuspend_delay; /* * We should ensure that this is aligned, both for better performance * and also because some compilers (m68k) don't automatically force * alignment to a long boundary. */ unsigned long hostdata[] /* Used for storage of host specific stuff */ __attribute__ ((aligned (sizeof(unsigned long)))); }; #define class_to_shost(d) \ container_of(d, struct Scsi_Host, shost_dev) #define shost_printk(prefix, shost, fmt, a...) 
\ dev_printk(prefix, &(shost)->shost_gendev, fmt, ##a) static inline void *shost_priv(struct Scsi_Host *shost) { return (void *)shost->hostdata; } int scsi_is_host_device(const struct device *); static inline struct Scsi_Host *dev_to_shost(struct device *dev) { while (!scsi_is_host_device(dev)) { if (!dev->parent) return NULL; dev = dev->parent; } return container_of(dev, struct Scsi_Host, shost_gendev); } static inline int scsi_host_in_recovery(struct Scsi_Host *shost) { return shost->shost_state == SHOST_RECOVERY || shost->shost_state == SHOST_CANCEL_RECOVERY || shost->shost_state == SHOST_DEL_RECOVERY || shost->tmf_in_progress; } extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *); extern void scsi_flush_work(struct Scsi_Host *); extern struct Scsi_Host *scsi_host_alloc(const struct scsi_host_template *, int); extern int __must_check scsi_add_host_with_dma(struct Scsi_Host *, struct device *, struct device *); #if defined(CONFIG_SCSI_PROC_FS) struct proc_dir_entry * scsi_template_proc_dir(const struct scsi_host_template *sht); #else #define scsi_template_proc_dir(sht) NULL #endif extern void scsi_scan_host(struct Scsi_Host *); extern int scsi_resume_device(struct scsi_device *sdev); extern int scsi_rescan_device(struct scsi_device *sdev); extern void scsi_remove_host(struct Scsi_Host *); extern struct Scsi_Host *scsi_host_get(struct Scsi_Host *); extern int scsi_host_busy(struct Scsi_Host *shost); extern void scsi_host_put(struct Scsi_Host *t); extern struct Scsi_Host *scsi_host_lookup(unsigned int hostnum); extern const char *scsi_host_state_name(enum scsi_host_state); extern void scsi_host_complete_all_commands(struct Scsi_Host *shost, enum scsi_host_status status); static inline int __must_check scsi_add_host(struct Scsi_Host *host, struct device *dev) { return scsi_add_host_with_dma(host, dev, dev); } static inline struct device *scsi_get_device(struct Scsi_Host *shost) { return shost->shost_gendev.parent; } /** * scsi_host_scan_allowed - Is scanning of this host allowed * @shost: Pointer to Scsi_Host. **/ static inline int scsi_host_scan_allowed(struct Scsi_Host *shost) { return shost->shost_state == SHOST_RUNNING || shost->shost_state == SHOST_RECOVERY; } extern void scsi_unblock_requests(struct Scsi_Host *); extern void scsi_block_requests(struct Scsi_Host *); extern int scsi_host_block(struct Scsi_Host *shost); extern int scsi_host_unblock(struct Scsi_Host *shost, int new_state); void scsi_host_busy_iter(struct Scsi_Host *, bool (*fn)(struct scsi_cmnd *, void *), void *priv); struct class_container; /* * DIF defines the exchange of protection information between * initiator and SBC block device. * * DIX defines the exchange of protection information between OS and * initiator. */ enum scsi_host_prot_capabilities { SHOST_DIF_TYPE1_PROTECTION = 1 << 0, /* T10 DIF Type 1 */ SHOST_DIF_TYPE2_PROTECTION = 1 << 1, /* T10 DIF Type 2 */ SHOST_DIF_TYPE3_PROTECTION = 1 << 2, /* T10 DIF Type 3 */ SHOST_DIX_TYPE0_PROTECTION = 1 << 3, /* DIX between OS and HBA only */ SHOST_DIX_TYPE1_PROTECTION = 1 << 4, /* DIX with DIF Type 1 */ SHOST_DIX_TYPE2_PROTECTION = 1 << 5, /* DIX with DIF Type 2 */ SHOST_DIX_TYPE3_PROTECTION = 1 << 6, /* DIX with DIF Type 3 */ }; /* * SCSI hosts which support the Data Integrity Extensions must * indicate their capabilities by setting the prot_capabilities using * this call. 
*/ static inline void scsi_host_set_prot(struct Scsi_Host *shost, unsigned int mask) { shost->prot_capabilities = mask; } static inline unsigned int scsi_host_get_prot(struct Scsi_Host *shost) { return shost->prot_capabilities; } static inline int scsi_host_prot_dma(struct Scsi_Host *shost) { return shost->prot_capabilities >= SHOST_DIX_TYPE0_PROTECTION; } static inline unsigned int scsi_host_dif_capable(struct Scsi_Host *shost, unsigned int target_type) { static unsigned char cap[] = { 0, SHOST_DIF_TYPE1_PROTECTION, SHOST_DIF_TYPE2_PROTECTION, SHOST_DIF_TYPE3_PROTECTION }; if (target_type >= ARRAY_SIZE(cap)) return 0; return shost->prot_capabilities & cap[target_type] ? target_type : 0; } static inline unsigned int scsi_host_dix_capable(struct Scsi_Host *shost, unsigned int target_type) { #if defined(CONFIG_BLK_DEV_INTEGRITY) static unsigned char cap[] = { SHOST_DIX_TYPE0_PROTECTION, SHOST_DIX_TYPE1_PROTECTION, SHOST_DIX_TYPE2_PROTECTION, SHOST_DIX_TYPE3_PROTECTION }; if (target_type >= ARRAY_SIZE(cap)) return 0; return shost->prot_capabilities & cap[target_type]; #endif return 0; } /* * All DIX-capable initiators must support the T10-mandated CRC * checksum. Controllers can optionally implement the IP checksum * scheme which has much lower impact on system performance. Note * that the main rationale for the checksum is to match integrity * metadata with data. Detecting bit errors are a job for ECC memory * and buses. */ enum scsi_host_guard_type { SHOST_DIX_GUARD_CRC = 1 << 0, SHOST_DIX_GUARD_IP = 1 << 1, }; static inline void scsi_host_set_guard(struct Scsi_Host *shost, unsigned char type) { shost->prot_guard_type = type; } static inline unsigned char scsi_host_get_guard(struct Scsi_Host *shost) { return shost->prot_guard_type; } extern int scsi_host_set_state(struct Scsi_Host *, enum scsi_host_state); #endif /* _SCSI_SCSI_HOST_H */
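struct Scsi_Host above ends with a flexible hostdata[] array, and shost_priv() hands low-level drivers that area as their per-host private data. The sketch below models that single-allocation layout in user space; fake_host, my_hba and host_alloc() are hypothetical stand-ins, not the real scsi_host_alloc() interface.

/*
 * User-space sketch of the hostdata[]/shost_priv() convention, illustration
 * only: the host structure is allocated with extra trailing bytes and the
 * driver keeps its private state there.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fake_host {
	unsigned int host_no;
	/* aligned trailing storage for driver-private data, as in Scsi_Host */
	unsigned long hostdata[] __attribute__((aligned(sizeof(unsigned long))));
};

struct my_hba {				/* what a driver might stash there */
	unsigned int io_base;
	char model[16];
};

static void *host_priv(struct fake_host *h)
{
	return (void *)h->hostdata;	/* same idea as shost_priv() */
}

static struct fake_host *host_alloc(size_t privsize)
{
	/* one allocation covers the host and the driver-private area */
	return calloc(1, sizeof(struct fake_host) + privsize);
}

int main(void)
{
	struct fake_host *h = host_alloc(sizeof(struct my_hba));
	struct my_hba *hba;

	if (!h)
		return 1;
	h->host_no = 0;
	hba = host_priv(h);
	hba->io_base = 0xd000;
	strcpy(hba->model, "demo-hba");
	printf("host %u priv: io_base=0x%x model=%s\n",
	       h->host_no, hba->io_base, hba->model);
	free(h);
	return 0;
}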
/*
 *	An async IO implementation for Linux
 *	Written by Benjamin LaHaise <bcrl@kvack.org>
 *
 *	Implements an efficient asynchronous io interface.
 *
 *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
 *	Copyright 2018 Christoph Hellwig.
 *
 *	See ../COPYING for licensing terms.
 */
#define pr_fmt(fmt) "%s: " fmt, __func__

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/aio_abi.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/backing-dev.h>
#include <linux/refcount.h>
#include <linux/uio.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/aio.h>
#include <linux/highmem.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/eventfd.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/migrate.h>
#include <linux/ramfs.h>
#include <linux/percpu-refcount.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>

#include <linux/uaccess.h>
#include <linux/nospec.h>

#include "internal.h"

#define KIOCB_KEY		0

#define AIO_RING_MAGIC			0xa10a10a1
#define AIO_RING_COMPAT_FEATURES	1
#define AIO_RING_INCOMPAT_FEATURES	0

struct aio_ring {
	unsigned	id;	/* kernel internal index number */
	unsigned	nr;	/* number of io_events */
	unsigned	head;	/* Written to by userland or under ring_lock
				 * mutex by aio_read_events_ring(). */
	unsigned	tail;

	unsigned	magic;
	unsigned	compat_features;
	unsigned	incompat_features;
	unsigned	header_length;	/* size of aio_ring */

	struct io_event		io_events[];
}; /* 128 bytes + ring size */

/*
 * Plugging is meant to work with larger batches of IOs. If we don't
 * have more than the below, then don't bother setting up a plug.
 */
#define AIO_PLUG_THRESHOLD	2

#define AIO_RING_PAGES	8

struct kioctx_table {
	struct rcu_head		rcu;
	unsigned		nr;
	struct kioctx __rcu	*table[] __counted_by(nr);
};

struct kioctx_cpu {
	unsigned		reqs_available;
};

struct ctx_rq_wait {
	struct completion comp;
	atomic_t count;
};

struct kioctx {
	struct percpu_ref	users;
	atomic_t		dead;

	struct percpu_ref	reqs;

	unsigned long		user_id;

	struct kioctx_cpu __percpu *cpu;

	/*
	 * For percpu reqs_available, number of slots we move to/from global
	 * counter at a time:
	 */
	unsigned		req_batch;
	/*
	 * This is what userspace passed to io_setup(), it's not used for
	 * anything but counting against the global max_reqs quota.
* * The real limit is nr_events - 1, which will be larger (see * aio_setup_ring()) */ unsigned max_reqs; /* Size of ringbuffer, in units of struct io_event */ unsigned nr_events; unsigned long mmap_base; unsigned long mmap_size; struct folio **ring_folios; long nr_pages; struct rcu_work free_rwork; /* see free_ioctx() */ /* * signals when all in-flight requests are done */ struct ctx_rq_wait *rq_wait; struct { /* * This counts the number of available slots in the ringbuffer, * so we avoid overflowing it: it's decremented (if positive) * when allocating a kiocb and incremented when the resulting * io_event is pulled off the ringbuffer. * * We batch accesses to it with a percpu version. */ atomic_t reqs_available; } ____cacheline_aligned_in_smp; struct { spinlock_t ctx_lock; struct list_head active_reqs; /* used for cancellation */ } ____cacheline_aligned_in_smp; struct { struct mutex ring_lock; wait_queue_head_t wait; } ____cacheline_aligned_in_smp; struct { unsigned tail; unsigned completed_events; spinlock_t completion_lock; } ____cacheline_aligned_in_smp; struct folio *internal_folios[AIO_RING_PAGES]; struct file *aio_ring_file; unsigned id; }; /* * First field must be the file pointer in all the * iocb unions! See also 'struct kiocb' in <linux/fs.h> */ struct fsync_iocb { struct file *file; struct work_struct work; bool datasync; struct cred *creds; }; struct poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; bool cancelled; bool work_scheduled; bool work_need_resched; struct wait_queue_entry wait; struct work_struct work; }; /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can * access the file pointer through any of the sub-structs, * or directly as just 'ki_filp' in this struct. */ struct aio_kiocb { union { struct file *ki_filp; struct kiocb rw; struct fsync_iocb fsync; struct poll_iocb poll; }; struct kioctx *ki_ctx; kiocb_cancel_fn *ki_cancel; struct io_event ki_res; struct list_head ki_list; /* the aio core uses this * for cancellation */ refcount_t ki_refcnt; /* * If the aio_resfd field of the userspace iocb is not zero, * this is the underlying eventfd context to deliver events to. 
*/ struct eventfd_ctx *ki_eventfd; }; /*------ sysctl variables----*/ static DEFINE_SPINLOCK(aio_nr_lock); static unsigned long aio_nr; /* current system wide number of aio requests */ static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ /*----end sysctl variables---*/ #ifdef CONFIG_SYSCTL static const struct ctl_table aio_sysctls[] = { { .procname = "aio-nr", .data = &aio_nr, .maxlen = sizeof(aio_nr), .mode = 0444, .proc_handler = proc_doulongvec_minmax, }, { .procname = "aio-max-nr", .data = &aio_max_nr, .maxlen = sizeof(aio_max_nr), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, }; static void __init aio_sysctl_init(void) { register_sysctl_init("fs", aio_sysctls); } #else #define aio_sysctl_init() do { } while (0) #endif static struct kmem_cache *kiocb_cachep; static struct kmem_cache *kioctx_cachep; static struct vfsmount *aio_mnt; static const struct file_operations aio_ring_fops; static const struct address_space_operations aio_ctx_aops; static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) { struct file *file; struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb); if (IS_ERR(inode)) return ERR_CAST(inode); inode->i_mapping->a_ops = &aio_ctx_aops; inode->i_mapping->i_private_data = ctx; inode->i_size = PAGE_SIZE * nr_pages; file = alloc_file_pseudo(inode, aio_mnt, "[aio]", O_RDWR, &aio_ring_fops); if (IS_ERR(file)) iput(inode); return file; } static int aio_init_fs_context(struct fs_context *fc) { if (!init_pseudo(fc, AIO_RING_MAGIC)) return -ENOMEM; fc->s_iflags |= SB_I_NOEXEC; return 0; } /* aio_setup * Creates the slab caches used by the aio routines, panic on * failure as this is done early during the boot sequence. */ static int __init aio_setup(void) { static struct file_system_type aio_fs = { .name = "aio", .init_fs_context = aio_init_fs_context, .kill_sb = kill_anon_super, }; aio_mnt = kern_mount(&aio_fs); if (IS_ERR(aio_mnt)) panic("Failed to create aio fs mount."); kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); aio_sysctl_init(); return 0; } __initcall(aio_setup); static void put_aio_ring_file(struct kioctx *ctx) { struct file *aio_ring_file = ctx->aio_ring_file; struct address_space *i_mapping; if (aio_ring_file) { truncate_setsize(file_inode(aio_ring_file), 0); /* Prevent further access to the kioctx from migratepages */ i_mapping = aio_ring_file->f_mapping; spin_lock(&i_mapping->i_private_lock); i_mapping->i_private_data = NULL; ctx->aio_ring_file = NULL; spin_unlock(&i_mapping->i_private_lock); fput(aio_ring_file); } } static void aio_free_ring(struct kioctx *ctx) { int i; /* Disconnect the kiotx from the ring file. This prevents future * accesses to the kioctx from page migration. 
*/ put_aio_ring_file(ctx); for (i = 0; i < ctx->nr_pages; i++) { struct folio *folio = ctx->ring_folios[i]; if (!folio) continue; pr_debug("pid(%d) [%d] folio->count=%d\n", current->pid, i, folio_ref_count(folio)); ctx->ring_folios[i] = NULL; folio_put(folio); } if (ctx->ring_folios && ctx->ring_folios != ctx->internal_folios) { kfree(ctx->ring_folios); ctx->ring_folios = NULL; } } static int aio_ring_mremap(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; int i, res = -EINVAL; spin_lock(&mm->ioctx_lock); rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); if (!table) goto out_unlock; for (i = 0; i < table->nr; i++) { struct kioctx *ctx; ctx = rcu_dereference(table->table[i]); if (ctx && ctx->aio_ring_file == file) { if (!atomic_read(&ctx->dead)) { ctx->user_id = ctx->mmap_base = vma->vm_start; res = 0; } break; } } out_unlock: rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); return res; } static const struct vm_operations_struct aio_ring_vm_ops = { .mremap = aio_ring_mremap, #if IS_ENABLED(CONFIG_MMU) .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, #endif }; static int aio_ring_mmap_prepare(struct vm_area_desc *desc) { desc->vm_flags |= VM_DONTEXPAND; desc->vm_ops = &aio_ring_vm_ops; return 0; } static const struct file_operations aio_ring_fops = { .mmap_prepare = aio_ring_mmap_prepare, }; #if IS_ENABLED(CONFIG_MIGRATION) static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { struct kioctx *ctx; unsigned long flags; pgoff_t idx; int rc = 0; /* mapping->i_private_lock here protects against the kioctx teardown. */ spin_lock(&mapping->i_private_lock); ctx = mapping->i_private_data; if (!ctx) { rc = -EINVAL; goto out; } /* The ring_lock mutex. The prevents aio_read_events() from writing * to the ring's head, and prevents page migration from mucking in * a partially initialized kiotx. */ if (!mutex_trylock(&ctx->ring_lock)) { rc = -EAGAIN; goto out; } idx = src->index; if (idx < (pgoff_t)ctx->nr_pages) { /* Make sure the old folio hasn't already been changed */ if (ctx->ring_folios[idx] != src) rc = -EAGAIN; } else rc = -EINVAL; if (rc != 0) goto out_unlock; /* Writeback must be complete */ BUG_ON(folio_test_writeback(src)); folio_get(dst); rc = folio_migrate_mapping(mapping, dst, src, 1); if (rc) { folio_put(dst); goto out_unlock; } /* Take completion_lock to prevent other writes to the ring buffer * while the old folio is copied to the new. This prevents new * events from being lost. */ spin_lock_irqsave(&ctx->completion_lock, flags); folio_copy(dst, src); folio_migrate_flags(dst, src); BUG_ON(ctx->ring_folios[idx] != src); ctx->ring_folios[idx] = dst; spin_unlock_irqrestore(&ctx->completion_lock, flags); /* The old folio is no longer accessible. 
*/ folio_put(src); out_unlock: mutex_unlock(&ctx->ring_lock); out: spin_unlock(&mapping->i_private_lock); return rc; } #else #define aio_migrate_folio NULL #endif static const struct address_space_operations aio_ctx_aops = { .dirty_folio = noop_dirty_folio, .migrate_folio = aio_migrate_folio, }; static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) { struct aio_ring *ring; struct mm_struct *mm = current->mm; unsigned long size, unused; int nr_pages; int i; struct file *file; /* Compensate for the ring buffer's head/tail overlap entry */ nr_events += 2; /* 1 is required, 2 for good luck */ size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; nr_pages = PFN_UP(size); if (nr_pages < 0) return -EINVAL; file = aio_private_file(ctx, nr_pages); if (IS_ERR(file)) { ctx->aio_ring_file = NULL; return -ENOMEM; } ctx->aio_ring_file = file; nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); ctx->ring_folios = ctx->internal_folios; if (nr_pages > AIO_RING_PAGES) { ctx->ring_folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_KERNEL); if (!ctx->ring_folios) { put_aio_ring_file(ctx); return -ENOMEM; } } for (i = 0; i < nr_pages; i++) { struct folio *folio; folio = __filemap_get_folio(file->f_mapping, i, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_USER | __GFP_ZERO); if (IS_ERR(folio)) break; pr_debug("pid(%d) [%d] folio->count=%d\n", current->pid, i, folio_ref_count(folio)); folio_end_read(folio, true); ctx->ring_folios[i] = folio; } ctx->nr_pages = i; if (unlikely(i != nr_pages)) { aio_free_ring(ctx); return -ENOMEM; } ctx->mmap_size = nr_pages * PAGE_SIZE; pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); if (mmap_write_lock_killable(mm)) { ctx->mmap_size = 0; aio_free_ring(ctx); return -EINTR; } ctx->mmap_base = do_mmap(ctx->aio_ring_file, 0, ctx->mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 0, 0, &unused, NULL); mmap_write_unlock(mm); if (IS_ERR((void *)ctx->mmap_base)) { ctx->mmap_size = 0; aio_free_ring(ctx); return -ENOMEM; } pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */ ring = folio_address(ctx->ring_folios[0]); ring->nr = nr_events; /* user copy */ ring->id = ~0U; ring->head = ring->tail = 0; ring->magic = AIO_RING_MAGIC; ring->compat_features = AIO_RING_COMPAT_FEATURES; ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; ring->header_length = sizeof(struct aio_ring); flush_dcache_folio(ctx->ring_folios[0]); return 0; } #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel) { struct aio_kiocb *req; struct kioctx *ctx; unsigned long flags; /* * kiocb didn't come from aio or is neither a read nor a write, hence * ignore it. */ if (!(iocb->ki_flags & IOCB_AIO_RW)) return; req = container_of(iocb, struct aio_kiocb, rw); if (WARN_ON_ONCE(!list_empty(&req->ki_list))) return; ctx = req->ki_ctx; spin_lock_irqsave(&ctx->ctx_lock, flags); list_add_tail(&req->ki_list, &ctx->active_reqs); req->ki_cancel = cancel; spin_unlock_irqrestore(&ctx->ctx_lock, flags); } EXPORT_SYMBOL(kiocb_set_cancel_fn); /* * free_ioctx() should be RCU delayed to synchronize against the RCU * protected lookup_ioctx() and also needs process context to call * aio_free_ring(). Use rcu_work. 
*/ static void free_ioctx(struct work_struct *work) { struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx, free_rwork); pr_debug("freeing %p\n", ctx); aio_free_ring(ctx); free_percpu(ctx->cpu); percpu_ref_exit(&ctx->reqs); percpu_ref_exit(&ctx->users); kmem_cache_free(kioctx_cachep, ctx); } static void free_ioctx_reqs(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, reqs); /* At this point we know that there are no any in-flight requests */ if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count)) complete(&ctx->rq_wait->comp); /* Synchronize against RCU protected table->table[] dereferences */ INIT_RCU_WORK(&ctx->free_rwork, free_ioctx); queue_rcu_work(system_percpu_wq, &ctx->free_rwork); } /* * When this function runs, the kioctx has been removed from the "hash table" * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - * now it's safe to cancel any that need to be. */ static void free_ioctx_users(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, users); struct aio_kiocb *req; spin_lock_irq(&ctx->ctx_lock); while (!list_empty(&ctx->active_reqs)) { req = list_first_entry(&ctx->active_reqs, struct aio_kiocb, ki_list); req->ki_cancel(&req->rw); list_del_init(&req->ki_list); } spin_unlock_irq(&ctx->ctx_lock); percpu_ref_kill(&ctx->reqs); percpu_ref_put(&ctx->reqs); } static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) { unsigned i, new_nr; struct kioctx_table *table, *old; struct aio_ring *ring; spin_lock(&mm->ioctx_lock); table = rcu_dereference_raw(mm->ioctx_table); while (1) { if (table) for (i = 0; i < table->nr; i++) if (!rcu_access_pointer(table->table[i])) { ctx->id = i; rcu_assign_pointer(table->table[i], ctx); spin_unlock(&mm->ioctx_lock); /* While kioctx setup is in progress, * we are protected from page migration * changes ring_folios by ->ring_lock. */ ring = folio_address(ctx->ring_folios[0]); ring->id = ctx->id; return 0; } new_nr = (table ? table->nr : 1) * 4; spin_unlock(&mm->ioctx_lock); table = kzalloc(struct_size(table, table, new_nr), GFP_KERNEL); if (!table) return -ENOMEM; table->nr = new_nr; spin_lock(&mm->ioctx_lock); old = rcu_dereference_raw(mm->ioctx_table); if (!old) { rcu_assign_pointer(mm->ioctx_table, table); } else if (table->nr > old->nr) { memcpy(table->table, old->table, old->nr * sizeof(struct kioctx *)); rcu_assign_pointer(mm->ioctx_table, table); kfree_rcu(old, rcu); } else { kfree(table); table = old; } } } static void aio_nr_sub(unsigned nr) { spin_lock(&aio_nr_lock); if (WARN_ON(aio_nr - nr > aio_nr)) aio_nr = 0; else aio_nr -= nr; spin_unlock(&aio_nr_lock); } /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. */ static struct kioctx *ioctx_alloc(unsigned nr_events) { struct mm_struct *mm = current->mm; struct kioctx *ctx; int err = -ENOMEM; /* * Store the original nr_events -- what userspace passed to io_setup(), * for counting against the global limit -- before it changes. */ unsigned int max_reqs = nr_events; /* * We keep track of the number of available ringbuffer slots, to prevent * overflow (reqs_available), and we also use percpu counters for this. 
* * So since up to half the slots might be on other cpu's percpu counters * and unavailable, double nr_events so userspace sees what they * expected: additionally, we move req_batch slots to/from percpu * counters at a time, so make sure that isn't 0: */ nr_events = max(nr_events, num_possible_cpus() * 4); nr_events *= 2; /* Prevent overflows */ if (nr_events > (0x10000000U / sizeof(struct io_event))) { pr_debug("ENOMEM: nr_events too high\n"); return ERR_PTR(-EINVAL); } if (!nr_events || (unsigned long)max_reqs > aio_max_nr) return ERR_PTR(-EAGAIN); ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); ctx->max_reqs = max_reqs; spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->completion_lock); mutex_init(&ctx->ring_lock); /* Protect against page migration throughout kiotx setup by keeping * the ring_lock mutex held until setup is complete. */ mutex_lock(&ctx->ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL)) goto err; if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL)) goto err; ctx->cpu = alloc_percpu(struct kioctx_cpu); if (!ctx->cpu) goto err; err = aio_setup_ring(ctx, nr_events); if (err < 0) goto err; atomic_set(&ctx->reqs_available, ctx->nr_events - 1); ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); if (ctx->req_batch < 1) ctx->req_batch = 1; /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); if (aio_nr + ctx->max_reqs > aio_max_nr || aio_nr + ctx->max_reqs < aio_nr) { spin_unlock(&aio_nr_lock); err = -EAGAIN; goto err_ctx; } aio_nr += ctx->max_reqs; spin_unlock(&aio_nr_lock); percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ percpu_ref_get(&ctx->reqs); /* free_ioctx_users() will drop this */ err = ioctx_add_table(ctx, mm); if (err) goto err_cleanup; /* Release the ring_lock mutex now that all setup is complete. */ mutex_unlock(&ctx->ring_lock); pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); return ctx; err_cleanup: aio_nr_sub(ctx->max_reqs); err_ctx: atomic_set(&ctx->dead, 1); if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); aio_free_ring(ctx); err: mutex_unlock(&ctx->ring_lock); free_percpu(ctx->cpu); percpu_ref_exit(&ctx->reqs); percpu_ref_exit(&ctx->users); kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); } /* kill_ioctx * Cancels all outstanding aio requests on an aio context. Used * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, struct ctx_rq_wait *wait) { struct kioctx_table *table; spin_lock(&mm->ioctx_lock); if (atomic_xchg(&ctx->dead, 1)) { spin_unlock(&mm->ioctx_lock); return -EINVAL; } table = rcu_dereference_raw(mm->ioctx_table); WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id])); RCU_INIT_POINTER(table->table[ctx->id], NULL); spin_unlock(&mm->ioctx_lock); /* free_ioctx_reqs() will do the necessary RCU synchronization */ wake_up_all(&ctx->wait); /* * It'd be more correct to do this in free_ioctx(), after all * the outstanding kiocbs have finished - but by then io_destroy * has already returned, so io_setup() could potentially return * -EAGAIN with no ioctxs actually in use (as far as userspace * could tell). 
*/ aio_nr_sub(ctx->max_reqs); if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); ctx->rq_wait = wait; percpu_ref_kill(&ctx->users); return 0; } /* * exit_aio: called when the last user of mm goes away. At this point, there is * no way for any new requests to be submited or any of the io_* syscalls to be * called on the context. * * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on * them. */ void exit_aio(struct mm_struct *mm) { struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table); struct ctx_rq_wait wait; int i, skipped; if (!table) return; atomic_set(&wait.count, table->nr); init_completion(&wait.comp); skipped = 0; for (i = 0; i < table->nr; ++i) { struct kioctx *ctx = rcu_dereference_protected(table->table[i], true); if (!ctx) { skipped++; continue; } /* * We don't need to bother with munmap() here - exit_mmap(mm) * is coming and it'll unmap everything. And we simply can't, * this is not necessarily our ->mm. * Since kill_ioctx() uses non-zero ->mmap_size as indicator * that it needs to unmap the area, just set it to 0. */ ctx->mmap_size = 0; kill_ioctx(mm, ctx, &wait); } if (!atomic_sub_and_test(skipped, &wait.count)) { /* Wait until all IO for the context are done. */ wait_for_completion(&wait.comp); } RCU_INIT_POINTER(mm->ioctx_table, NULL); kfree(table); } static void put_reqs_available(struct kioctx *ctx, unsigned nr) { struct kioctx_cpu *kcpu; unsigned long flags; local_irq_save(flags); kcpu = this_cpu_ptr(ctx->cpu); kcpu->reqs_available += nr; while (kcpu->reqs_available >= ctx->req_batch * 2) { kcpu->reqs_available -= ctx->req_batch; atomic_add(ctx->req_batch, &ctx->reqs_available); } local_irq_restore(flags); } static bool __get_reqs_available(struct kioctx *ctx) { struct kioctx_cpu *kcpu; bool ret = false; unsigned long flags; local_irq_save(flags); kcpu = this_cpu_ptr(ctx->cpu); if (!kcpu->reqs_available) { int avail = atomic_read(&ctx->reqs_available); do { if (avail < ctx->req_batch) goto out; } while (!atomic_try_cmpxchg(&ctx->reqs_available, &avail, avail - ctx->req_batch)); kcpu->reqs_available += ctx->req_batch; } ret = true; kcpu->reqs_available--; out: local_irq_restore(flags); return ret; } /* refill_reqs_available * Updates the reqs_available reference counts used for tracking the * number of free slots in the completion ring. This can be called * from aio_complete() (to optimistically update reqs_available) or * from aio_get_req() (the we're out of events case). It must be * called holding ctx->completion_lock. */ static void refill_reqs_available(struct kioctx *ctx, unsigned head, unsigned tail) { unsigned events_in_ring, completed; /* Clamp head since userland can write to it. */ head %= ctx->nr_events; if (head <= tail) events_in_ring = tail - head; else events_in_ring = ctx->nr_events - (head - tail); completed = ctx->completed_events; if (events_in_ring < completed) completed -= events_in_ring; else completed = 0; if (!completed) return; ctx->completed_events -= completed; put_reqs_available(ctx, completed); } /* user_refill_reqs_available * Called to refill reqs_available when aio_get_req() encounters an * out of space in the completion ring. */ static void user_refill_reqs_available(struct kioctx *ctx) { spin_lock_irq(&ctx->completion_lock); if (ctx->completed_events) { struct aio_ring *ring; unsigned head; /* Access of ring->head may race with aio_read_events_ring() * here, but that's okay since whether we read the old version * or the new version, and either will be valid. 
The important * part is that head cannot pass tail since we prevent * aio_complete() from updating tail by holding * ctx->completion_lock. Even if head is invalid, the check * against ctx->completed_events below will make sure we do the * safe/right thing. */ ring = folio_address(ctx->ring_folios[0]); head = ring->head; refill_reqs_available(ctx, head, ctx->tail); } spin_unlock_irq(&ctx->completion_lock); } static bool get_reqs_available(struct kioctx *ctx) { if (__get_reqs_available(ctx)) return true; user_refill_reqs_available(ctx); return __get_reqs_available(ctx); } /* aio_get_req * Allocate a slot for an aio request. * Returns NULL if no requests are free. * * The refcount is initialized to 2 - one for the async op completion, * one for the synchronous code that does this. */ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) { struct aio_kiocb *req; req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); if (unlikely(!req)) return NULL; if (unlikely(!get_reqs_available(ctx))) { kmem_cache_free(kiocb_cachep, req); return NULL; } percpu_ref_get(&ctx->reqs); req->ki_ctx = ctx; INIT_LIST_HEAD(&req->ki_list); refcount_set(&req->ki_refcnt, 2); req->ki_eventfd = NULL; return req; } static struct kioctx *lookup_ioctx(unsigned long ctx_id) { struct aio_ring __user *ring = (void __user *)ctx_id; struct mm_struct *mm = current->mm; struct kioctx *ctx, *ret = NULL; struct kioctx_table *table; unsigned id; if (get_user(id, &ring->id)) return NULL; rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); if (!table || id >= table->nr) goto out; id = array_index_nospec(id, table->nr); ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { if (percpu_ref_tryget_live(&ctx->users)) ret = ctx; } out: rcu_read_unlock(); return ret; } static inline void iocb_destroy(struct aio_kiocb *iocb) { if (iocb->ki_eventfd) eventfd_ctx_put(iocb->ki_eventfd); if (iocb->ki_filp) fput(iocb->ki_filp); percpu_ref_put(&iocb->ki_ctx->reqs); kmem_cache_free(kiocb_cachep, iocb); } struct aio_waiter { struct wait_queue_entry w; size_t min_nr; }; /* aio_complete * Called when the io request on the given iocb is complete. */ static void aio_complete(struct aio_kiocb *iocb) { struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; unsigned tail, pos, head, avail; unsigned long flags; /* * Add a completion event to the ring buffer. Must be done holding * ctx->completion_lock to prevent other code from messing with the tail * pointer since we might be called from irq context. */ spin_lock_irqsave(&ctx->completion_lock, flags); tail = ctx->tail; pos = tail + AIO_EVENTS_OFFSET; if (++tail >= ctx->nr_events) tail = 0; ev_page = folio_address(ctx->ring_folios[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; *event = iocb->ki_res; flush_dcache_folio(ctx->ring_folios[pos / AIO_EVENTS_PER_PAGE]); pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb, (void __user *)(unsigned long)iocb->ki_res.obj, iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2); /* after flagging the request as done, we * must never even look at it again */ smp_wmb(); /* make event visible before updating tail */ ctx->tail = tail; ring = folio_address(ctx->ring_folios[0]); head = ring->head; ring->tail = tail; flush_dcache_folio(ctx->ring_folios[0]); ctx->completed_events++; if (ctx->completed_events > 1) refill_reqs_available(ctx, head, tail); avail = tail > head ? 
tail - head : tail + ctx->nr_events - head; spin_unlock_irqrestore(&ctx->completion_lock, flags); pr_debug("added to ring %p at [%u]\n", iocb, tail); /* * Check if the user asked us to deliver the result through an * eventfd. The eventfd_signal() function is safe to be called * from IRQ context. */ if (iocb->ki_eventfd) eventfd_signal(iocb->ki_eventfd); /* * We have to order our ring_info tail store above and test * of the wait list below outside the wait lock. This is * like in wake_up_bit() where clearing a bit has to be * ordered with the unlocked test. */ smp_mb(); if (waitqueue_active(&ctx->wait)) { struct aio_waiter *curr, *next; unsigned long flags; spin_lock_irqsave(&ctx->wait.lock, flags); list_for_each_entry_safe(curr, next, &ctx->wait.head, w.entry) if (avail >= curr->min_nr) { wake_up_process(curr->w.private); list_del_init_careful(&curr->w.entry); } spin_unlock_irqrestore(&ctx->wait.lock, flags); } } static inline void iocb_put(struct aio_kiocb *iocb) { if (refcount_dec_and_test(&iocb->ki_refcnt)) { aio_complete(iocb); iocb_destroy(iocb); } } /* aio_read_events_ring * Pull an event off of the ioctx's event ring. Returns the number of * events fetched */ static long aio_read_events_ring(struct kioctx *ctx, struct io_event __user *event, long nr) { struct aio_ring *ring; unsigned head, tail, pos; long ret = 0; int copy_ret; /* * The mutex can block and wake us up and that will cause * wait_event_interruptible_hrtimeout() to schedule without sleeping * and repeat. This should be rare enough that it doesn't cause * peformance issues. See the comment in read_events() for more detail. */ sched_annotate_sleep(); mutex_lock(&ctx->ring_lock); /* Access to ->ring_folios here is protected by ctx->ring_lock. */ ring = folio_address(ctx->ring_folios[0]); head = ring->head; tail = ring->tail; /* * Ensure that once we've read the current tail pointer, that * we also see the events that were stored up to the tail. */ smp_rmb(); pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events); if (head == tail) goto out; head %= ctx->nr_events; tail %= ctx->nr_events; while (ret < nr) { long avail; struct io_event *ev; struct folio *folio; avail = (head <= tail ? tail : ctx->nr_events) - head; if (head == tail) break; pos = head + AIO_EVENTS_OFFSET; folio = ctx->ring_folios[pos / AIO_EVENTS_PER_PAGE]; pos %= AIO_EVENTS_PER_PAGE; avail = min(avail, nr - ret); avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos); ev = folio_address(folio); copy_ret = copy_to_user(event + ret, ev + pos, sizeof(*ev) * avail); if (unlikely(copy_ret)) { ret = -EFAULT; goto out; } ret += avail; head += avail; head %= ctx->nr_events; } ring = folio_address(ctx->ring_folios[0]); ring->head = head; flush_dcache_folio(ctx->ring_folios[0]); pr_debug("%li h%u t%u\n", ret, head, tail); out: mutex_unlock(&ctx->ring_lock); return ret; } static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, long *i) { long ret = aio_read_events_ring(ctx, event + *i, nr - *i); if (ret > 0) *i += ret; if (unlikely(atomic_read(&ctx->dead))) ret = -EINVAL; if (!*i) *i = ret; return ret < 0 || *i >= min_nr; } static long read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, ktime_t until) { struct hrtimer_sleeper t; struct aio_waiter w; long ret = 0, ret2 = 0; /* * Note that aio_read_events() is being called as the conditional - i.e. * we're calling it after prepare_to_wait() has set task state to * TASK_INTERRUPTIBLE. 
* * But aio_read_events() can block, and if it blocks it's going to flip * the task state back to TASK_RUNNING. * * This should be ok, provided it doesn't flip the state back to * TASK_RUNNING and return 0 too much - that causes us to spin. That * will only happen if the mutex_lock() call blocks, and we then find * the ringbuffer empty. So in practice we should be ok, but it's * something to be aware of when touching this code. */ aio_read_events(ctx, min_nr, nr, event, &ret); if (until == 0 || ret < 0 || ret >= min_nr) return ret; hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); if (until != KTIME_MAX) { hrtimer_set_expires_range_ns(&t.timer, until, current->timer_slack_ns); hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); } init_wait(&w.w); while (1) { unsigned long nr_got = ret; w.min_nr = min_nr - ret; ret2 = prepare_to_wait_event(&ctx->wait, &w.w, TASK_INTERRUPTIBLE); if (!ret2 && !t.task) ret2 = -ETIME; if (aio_read_events(ctx, min_nr, nr, event, &ret) || ret2) break; if (nr_got == ret) schedule(); } finish_wait(&ctx->wait, &w.w); hrtimer_cancel(&t.timer); destroy_hrtimer_on_stack(&t.timer); return ret; } /* sys_io_setup: * Create an aio_context capable of receiving at least nr_events. * ctxp must not point to an aio_context that already exists, and * must be initialized to 0 prior to the call. On successful * creation of the aio_context, *ctxp is filled in with the resulting * handle. May fail with -EINVAL if *ctxp is not initialized, * if the specified nr_events exceeds internal limits. May fail * with -EAGAIN if the specified nr_events exceeds the user's limit * of available events. May fail with -ENOMEM if insufficient kernel * resources are available. May fail with -EFAULT if an invalid * pointer is passed for ctxp. Will fail with -ENOSYS if not * implemented. */ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) { struct kioctx *ioctx = NULL; unsigned long ctx; long ret; ret = get_user(ctx, ctxp); if (unlikely(ret)) goto out; ret = -EINVAL; if (unlikely(ctx || nr_events == 0)) { pr_debug("EINVAL: ctx %lu nr_events %u\n", ctx, nr_events); goto out; } ioctx = ioctx_alloc(nr_events); ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) kill_ioctx(current->mm, ioctx, NULL); percpu_ref_put(&ioctx->users); } out: return ret; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p) { struct kioctx *ioctx = NULL; unsigned long ctx; long ret; ret = get_user(ctx, ctx32p); if (unlikely(ret)) goto out; ret = -EINVAL; if (unlikely(ctx || nr_events == 0)) { pr_debug("EINVAL: ctx %lu nr_events %u\n", ctx, nr_events); goto out; } ioctx = ioctx_alloc(nr_events); ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) { /* truncating is ok because it's a user address */ ret = put_user((u32)ioctx->user_id, ctx32p); if (ret) kill_ioctx(current->mm, ioctx, NULL); percpu_ref_put(&ioctx->users); } out: return ret; } #endif /* sys_io_destroy: * Destroy the aio_context specified. May cancel any outstanding * AIOs and block on completion. Will fail with -ENOSYS if not * implemented. May fail with -EINVAL if the context pointed to * is invalid. */ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { struct ctx_rq_wait wait; int ret; init_completion(&wait.comp); atomic_set(&wait.count, 1); /* Pass requests_done to kill_ioctx() where it can be set * in a thread-safe way. 
If we try to set it here then we have * a race condition if two io_destroy() called simultaneously. */ ret = kill_ioctx(current->mm, ioctx, &wait); percpu_ref_put(&ioctx->users); /* Wait until all IO for the context are done. Otherwise kernel * keep using user-space buffers even if user thinks the context * is destroyed. */ if (!ret) wait_for_completion(&wait.comp); return ret; } pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } static void aio_remove_iocb(struct aio_kiocb *iocb) { struct kioctx *ctx = iocb->ki_ctx; unsigned long flags; spin_lock_irqsave(&ctx->ctx_lock, flags); list_del(&iocb->ki_list); spin_unlock_irqrestore(&ctx->ctx_lock, flags); } static void aio_complete_rw(struct kiocb *kiocb, long res) { struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw); if (!list_empty_careful(&iocb->ki_list)) aio_remove_iocb(iocb); if (kiocb->ki_flags & IOCB_WRITE) { struct inode *inode = file_inode(kiocb->ki_filp); if (S_ISREG(inode->i_mode)) kiocb_end_write(kiocb); } iocb->ki_res.res = res; iocb->ki_res.res2 = 0; iocb_put(iocb); } static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type) { int ret; req->ki_write_stream = 0; req->ki_complete = aio_complete_rw; req->private = NULL; req->ki_pos = iocb->aio_offset; req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW; if (iocb->aio_flags & IOCB_FLAG_RESFD) req->ki_flags |= IOCB_EVENTFD; if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { /* * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then * aio_reqprio is interpreted as an I/O scheduling * class and priority. */ ret = ioprio_check_cap(iocb->aio_reqprio); if (ret) { pr_debug("aio ioprio check cap error: %d\n", ret); return ret; } req->ki_ioprio = iocb->aio_reqprio; } else req->ki_ioprio = get_current_ioprio(); ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, rw_type); if (unlikely(ret)) return ret; req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */ return 0; } static ssize_t aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec, bool vectored, bool compat, struct iov_iter *iter) { void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf; size_t len = iocb->aio_nbytes; if (!vectored) { ssize_t ret = import_ubuf(rw, buf, len, iter); *iovec = NULL; return ret; } return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat); } static inline void aio_rw_done(struct kiocb *req, ssize_t ret) { switch (ret) { case -EIOCBQUEUED: break; case -ERESTARTSYS: case -ERESTARTNOINTR: case -ERESTARTNOHAND: case -ERESTART_RESTARTBLOCK: /* * There's no easy way to restart the syscall since other AIO's * may be already running. Just fail this IO with EINTR. 
*/ ret = -EINTR; fallthrough; default: req->ki_complete(req, ret); } } static int aio_read(struct kiocb *req, const struct iocb *iocb, bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; struct file *file; int ret; ret = aio_prep_rw(req, iocb, READ); if (ret) return ret; file = req->ki_filp; if (unlikely(!(file->f_mode & FMODE_READ))) return -EBADF; if (unlikely(!file->f_op->read_iter)) return -EINVAL; ret = aio_setup_rw(ITER_DEST, iocb, &iovec, vectored, compat, &iter); if (ret < 0) return ret; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) aio_rw_done(req, file->f_op->read_iter(req, &iter)); kfree(iovec); return ret; } static int aio_write(struct kiocb *req, const struct iocb *iocb, bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; struct file *file; int ret; ret = aio_prep_rw(req, iocb, WRITE); if (ret) return ret; file = req->ki_filp; if (unlikely(!(file->f_mode & FMODE_WRITE))) return -EBADF; if (unlikely(!file->f_op->write_iter)) return -EINVAL; ret = aio_setup_rw(ITER_SOURCE, iocb, &iovec, vectored, compat, &iter); if (ret < 0) return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { if (S_ISREG(file_inode(file)->i_mode)) kiocb_start_write(req); req->ki_flags |= IOCB_WRITE; aio_rw_done(req, file->f_op->write_iter(req, &iter)); } kfree(iovec); return ret; } static void aio_fsync_work(struct work_struct *work) { struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); scoped_with_creds(iocb->fsync.creds) iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); put_cred(iocb->fsync.creds); iocb_put(iocb); } static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, bool datasync) { if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)) return -EINVAL; if (unlikely(!req->file->f_op->fsync)) return -EINVAL; req->creds = prepare_creds(); if (!req->creds) return -ENOMEM; req->datasync = datasync; INIT_WORK(&req->work, aio_fsync_work); schedule_work(&req->work); return 0; } static void aio_poll_put_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); iocb_put(iocb); } /* * Safely lock the waitqueue which the request is on, synchronizing with the * case where the ->poll() provider decides to free its waitqueue early. * * Returns true on success, meaning that req->head->lock was locked, req->wait * is on req->head, and an RCU read lock was taken. Returns false if the * request was already removed from its waitqueue (which might no longer exist). */ static bool poll_iocb_lock_wq(struct poll_iocb *req) { wait_queue_head_t *head; /* * While we hold the waitqueue lock and the waitqueue is nonempty, * wake_up_pollfree() will wait for us. However, taking the waitqueue * lock in the first place can race with the waitqueue being freed. * * We solve this as eventpoll does: by taking advantage of the fact that * all users of wake_up_pollfree() will RCU-delay the actual free. If * we enter rcu_read_lock() and see that the pointer to the queue is * non-NULL, we can then lock it without the memory being freed out from * under us, then check whether the request is still on the queue. * * Keep holding rcu_read_lock() as long as we hold the queue lock, in * case the caller deletes the entry from the queue, leaving it empty. 
* In that case, only RCU prevents the queue memory from being freed. */ rcu_read_lock(); head = smp_load_acquire(&req->head); if (head) { spin_lock(&head->lock); if (!list_empty(&req->wait.entry)) return true; spin_unlock(&head->lock); } rcu_read_unlock(); return false; } static void poll_iocb_unlock_wq(struct poll_iocb *req) { spin_unlock(&req->head->lock); rcu_read_unlock(); } static void aio_poll_complete_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); struct poll_table_struct pt = { ._key = req->events }; struct kioctx *ctx = iocb->ki_ctx; __poll_t mask = 0; if (!READ_ONCE(req->cancelled)) mask = vfs_poll(req->file, &pt) & req->events; /* * Note that ->ki_cancel callers also delete iocb from active_reqs after * calling ->ki_cancel. We need the ctx_lock roundtrip here to * synchronize with them. In the cancellation case the list_del_init * itself is not actually needed, but harmless so we keep it in to * avoid further branches in the fast path. */ spin_lock_irq(&ctx->ctx_lock); if (poll_iocb_lock_wq(req)) { if (!mask && !READ_ONCE(req->cancelled)) { /* * The request isn't actually ready to be completed yet. * Reschedule completion if another wakeup came in. */ if (req->work_need_resched) { schedule_work(&req->work); req->work_need_resched = false; } else { req->work_scheduled = false; } poll_iocb_unlock_wq(req); spin_unlock_irq(&ctx->ctx_lock); return; } list_del_init(&req->wait.entry); poll_iocb_unlock_wq(req); } /* else, POLLFREE has freed the waitqueue, so we must complete */ list_del_init(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); spin_unlock_irq(&ctx->ctx_lock); iocb_put(iocb); } /* assumes we are called with irqs disabled */ static int aio_poll_cancel(struct kiocb *iocb) { struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw); struct poll_iocb *req = &aiocb->poll; if (poll_iocb_lock_wq(req)) { WRITE_ONCE(req->cancelled, true); if (!req->work_scheduled) { schedule_work(&aiocb->poll.work); req->work_scheduled = true; } poll_iocb_unlock_wq(req); } /* else, the request was force-cancelled by POLLFREE already */ return 0; } static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct poll_iocb *req = container_of(wait, struct poll_iocb, wait); struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); __poll_t mask = key_to_poll(key); unsigned long flags; /* for instances that support it check for an event match first: */ if (mask && !(mask & req->events)) return 0; /* * Complete the request inline if possible. This requires that three * conditions be met: * 1. An event mask must have been passed. If a plain wakeup was done * instead, then mask == 0 and we have to call vfs_poll() to get * the events, so inline completion isn't possible. * 2. The completion work must not have already been scheduled. * 3. ctx_lock must not be busy. We have to use trylock because we * already hold the waitqueue lock, so this inverts the normal * locking order. Use irqsave/irqrestore because not all * filesystems (e.g. fuse) call this function with IRQs disabled, * yet IRQs have to be disabled before ctx_lock is obtained. 
*/ if (mask && !req->work_scheduled && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { struct kioctx *ctx = iocb->ki_ctx; list_del_init(&req->wait.entry); list_del(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); if (iocb->ki_eventfd && !eventfd_signal_allowed()) { iocb = NULL; INIT_WORK(&req->work, aio_poll_put_work); schedule_work(&req->work); } spin_unlock_irqrestore(&ctx->ctx_lock, flags); if (iocb) iocb_put(iocb); } else { /* * Schedule the completion work if needed. If it was already * scheduled, record that another wakeup came in. * * Don't remove the request from the waitqueue here, as it might * not actually be complete yet (we won't know until vfs_poll() * is called), and we must not miss any wakeups. POLLFREE is an * exception to this; see below. */ if (req->work_scheduled) { req->work_need_resched = true; } else { schedule_work(&req->work); req->work_scheduled = true; } /* * If the waitqueue is being freed early but we can't complete * the request inline, we have to tear down the request as best * we can. That means immediately removing the request from its * waitqueue and preventing all further accesses to the * waitqueue via the request. We also need to schedule the * completion work (done above). Also mark the request as * cancelled, to potentially skip an unneeded call to ->poll(). */ if (mask & POLLFREE) { WRITE_ONCE(req->cancelled, true); list_del_init(&req->wait.entry); /* * Careful: this *must* be the last step, since as soon * as req->head is NULL'ed out, the request can be * completed and freed, since aio_poll_complete_work() * will no longer need to take the waitqueue lock. */ smp_store_release(&req->head, NULL); } } return 1; } struct aio_poll_table { struct poll_table_struct pt; struct aio_kiocb *iocb; bool queued; int error; }; static void aio_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt); /* multiple wait queues per file are not supported */ if (unlikely(pt->queued)) { pt->error = -EINVAL; return; } pt->queued = true; pt->error = 0; pt->iocb->poll.head = head; add_wait_queue(head, &pt->iocb->poll.wait); } static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) { struct kioctx *ctx = aiocb->ki_ctx; struct poll_iocb *req = &aiocb->poll; struct aio_poll_table apt; bool cancel = false; __poll_t mask; /* reject any unknown events outside the normal event mask. */ if ((u16)iocb->aio_buf != iocb->aio_buf) return -EINVAL; /* reject fields that are not defined for poll */ if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags) return -EINVAL; INIT_WORK(&req->work, aio_poll_complete_work); req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; req->head = NULL; req->cancelled = false; req->work_scheduled = false; req->work_need_resched = false; apt.pt._qproc = aio_poll_queue_proc; apt.pt._key = req->events; apt.iocb = aiocb; apt.queued = false; apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ /* initialized the list so that we can do list_empty checks */ INIT_LIST_HEAD(&req->wait.entry); init_waitqueue_func_entry(&req->wait, aio_poll_wake); mask = vfs_poll(req->file, &apt.pt) & req->events; spin_lock_irq(&ctx->ctx_lock); if (likely(apt.queued)) { bool on_queue = poll_iocb_lock_wq(req); if (!on_queue || req->work_scheduled) { /* * aio_poll_wake() already either scheduled the async * completion work, or completed the request inline. 
*/ if (apt.error) /* unsupported case: multiple queues */ cancel = true; apt.error = 0; mask = 0; } if (mask || apt.error) { /* Steal to complete synchronously. */ list_del_init(&req->wait.entry); } else if (cancel) { /* Cancel if possible (may be too late though). */ WRITE_ONCE(req->cancelled, true); } else if (on_queue) { /* * Actually waiting for an event, so add the request to * active_reqs so that it can be cancelled if needed. */ list_add_tail(&aiocb->ki_list, &ctx->active_reqs); aiocb->ki_cancel = aio_poll_cancel; } if (on_queue) poll_iocb_unlock_wq(req); } if (mask) { /* no async, we'd stolen it */ aiocb->ki_res.res = mangle_poll(mask); apt.error = 0; } spin_unlock_irq(&ctx->ctx_lock); if (mask) iocb_put(aiocb); return apt.error; } static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, struct iocb __user *user_iocb, struct aio_kiocb *req, bool compat) { req->ki_filp = fget(iocb->aio_fildes); if (unlikely(!req->ki_filp)) return -EBADF; if (iocb->aio_flags & IOCB_FLAG_RESFD) { struct eventfd_ctx *eventfd; /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an * instance of the file* now. The file descriptor must be * an eventfd() fd, and will be signaled for each completed * event using the eventfd_signal() function. */ eventfd = eventfd_ctx_fdget(iocb->aio_resfd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd); req->ki_eventfd = eventfd; } if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) { pr_debug("EFAULT: aio_key\n"); return -EFAULT; } req->ki_res.obj = (u64)(unsigned long)user_iocb; req->ki_res.data = iocb->aio_data; req->ki_res.res = 0; req->ki_res.res2 = 0; switch (iocb->aio_lio_opcode) { case IOCB_CMD_PREAD: return aio_read(&req->rw, iocb, false, compat); case IOCB_CMD_PWRITE: return aio_write(&req->rw, iocb, false, compat); case IOCB_CMD_PREADV: return aio_read(&req->rw, iocb, true, compat); case IOCB_CMD_PWRITEV: return aio_write(&req->rw, iocb, true, compat); case IOCB_CMD_FSYNC: return aio_fsync(&req->fsync, iocb, false); case IOCB_CMD_FDSYNC: return aio_fsync(&req->fsync, iocb, true); case IOCB_CMD_POLL: return aio_poll(req, iocb); default: pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode); return -EINVAL; } } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, bool compat) { struct aio_kiocb *req; struct iocb iocb; int err; if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) return -EFAULT; /* enforce forwards compatibility on users */ if (unlikely(iocb.aio_reserved2)) { pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } /* prevent overflows */ if (unlikely( (iocb.aio_buf != (unsigned long)iocb.aio_buf) || (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) || ((ssize_t)iocb.aio_nbytes < 0) )) { pr_debug("EINVAL: overflow check\n"); return -EINVAL; } req = aio_get_req(ctx); if (unlikely(!req)) return -EAGAIN; err = __io_submit_one(ctx, &iocb, user_iocb, req, compat); /* Done with the synchronous reference */ iocb_put(req); /* * If err is 0, we'd either done aio_complete() ourselves or have * arranged for that to be done asynchronously. Anything non-zero * means that we need to destroy req ourselves. */ if (unlikely(err)) { iocb_destroy(req); put_reqs_available(ctx, 1); } return err; } /* sys_io_submit: * Queue the nr iocbs pointed to by iocbpp for processing. Returns * the number of iocbs queued. 
May return -EINVAL if the aio_context * specified by ctx_id is invalid, if nr is < 0, if the iocb at * *iocbpp[0] is not properly initialized, if the operation specified * is invalid for the file descriptor in the iocb. May fail with * -EFAULT if any of the data structures point to invalid data. May * fail with -EBADF if the file descriptor specified in the first * iocb is invalid. May fail with -EAGAIN if insufficient resources * are available to queue any iocbs. Will return 0 if nr is 0. Will * fail with -ENOSYS if not implemented. */ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, struct iocb __user * __user *, iocbpp) { struct kioctx *ctx; long ret = 0; int i = 0; struct blk_plug plug; if (unlikely(nr < 0)) return -EINVAL; ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) { pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } if (nr > ctx->nr_events) nr = ctx->nr_events; if (nr > AIO_PLUG_THRESHOLD) blk_start_plug(&plug); for (i = 0; i < nr; i++) { struct iocb __user *user_iocb; if (unlikely(get_user(user_iocb, iocbpp + i))) { ret = -EFAULT; break; } ret = io_submit_one(ctx, user_iocb, false); if (ret) break; } if (nr > AIO_PLUG_THRESHOLD) blk_finish_plug(&plug); percpu_ref_put(&ctx->users); return i ? i : ret; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, int, nr, compat_uptr_t __user *, iocbpp) { struct kioctx *ctx; long ret = 0; int i = 0; struct blk_plug plug; if (unlikely(nr < 0)) return -EINVAL; ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) { pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } if (nr > ctx->nr_events) nr = ctx->nr_events; if (nr > AIO_PLUG_THRESHOLD) blk_start_plug(&plug); for (i = 0; i < nr; i++) { compat_uptr_t user_iocb; if (unlikely(get_user(user_iocb, iocbpp + i))) { ret = -EFAULT; break; } ret = io_submit_one(ctx, compat_ptr(user_iocb), true); if (ret) break; } if (nr > AIO_PLUG_THRESHOLD) blk_finish_plug(&plug); percpu_ref_put(&ctx->users); return i ? i : ret; } #endif /* sys_io_cancel: * Attempts to cancel an iocb previously passed to io_submit. If * the operation is successfully cancelled, the resulting event is * copied into the memory pointed to by result without being placed * into the completion queue and 0 is returned. May fail with * -EFAULT if any of the data structures pointed to are invalid. * May fail with -EINVAL if aio_context specified by ctx_id is * invalid. May fail with -EAGAIN if the iocb specified was not * cancelled. Will fail with -ENOSYS if not implemented. */ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result) { struct kioctx *ctx; struct aio_kiocb *kiocb; int ret = -EINVAL; u32 key; u64 obj = (u64)(unsigned long)iocb; if (unlikely(get_user(key, &iocb->aio_key))) return -EFAULT; if (unlikely(key != KIOCB_KEY)) return -EINVAL; ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) return -EINVAL; spin_lock_irq(&ctx->ctx_lock); list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { if (kiocb->ki_res.obj == obj) { ret = kiocb->ki_cancel(&kiocb->rw); list_del_init(&kiocb->ki_list); break; } } spin_unlock_irq(&ctx->ctx_lock); if (!ret) { /* * The result argument is no longer used - the io_event is * always delivered via the ring buffer. -EINPROGRESS indicates * cancellation is progress: */ ret = -EINPROGRESS; } percpu_ref_put(&ctx->users); return ret; } static long do_io_getevents(aio_context_t ctx_id, long min_nr, long nr, struct io_event __user *events, struct timespec64 *ts) { ktime_t until = ts ? 
timespec64_to_ktime(*ts) : KTIME_MAX; struct kioctx *ioctx = lookup_ioctx(ctx_id); long ret = -EINVAL; if (likely(ioctx)) { if (likely(min_nr <= nr && min_nr >= 0)) ret = read_events(ioctx, min_nr, nr, events, until); percpu_ref_put(&ioctx->users); } return ret; } /* io_getevents: * Attempts to read at least min_nr events and up to nr events from * the completion queue for the aio_context specified by ctx_id. If * it succeeds, the number of read events is returned. May fail with * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is * out of range, if timeout is out of range. May fail with -EFAULT * if any of the memory specified is invalid. May return 0 or * < min_nr if the timeout specified by timeout has elapsed * before sufficient events are available, where timeout == NULL * specifies an infinite timeout. Note that the timeout pointed to by * timeout is relative. Will fail with -ENOSYS if not implemented. */ #ifdef CONFIG_64BIT SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, long, min_nr, long, nr, struct io_event __user *, events, struct __kernel_timespec __user *, timeout) { struct timespec64 ts; int ret; if (timeout && unlikely(get_timespec64(&ts, timeout))) return -EFAULT; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); if (!ret && signal_pending(current)) ret = -EINTR; return ret; } #endif struct __aio_sigset { const sigset_t __user *sigmask; size_t sigsetsize; }; SYSCALL_DEFINE6(io_pgetevents, aio_context_t, ctx_id, long, min_nr, long, nr, struct io_event __user *, events, struct __kernel_timespec __user *, timeout, const struct __aio_sigset __user *, usig) { struct __aio_sigset ksig = { NULL, }; struct timespec64 ts; bool interrupted; int ret; if (timeout && unlikely(get_timespec64(&ts, timeout))) return -EFAULT; if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); interrupted = signal_pending(current); restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE6(io_pgetevents_time32, aio_context_t, ctx_id, long, min_nr, long, nr, struct io_event __user *, events, struct old_timespec32 __user *, timeout, const struct __aio_sigset __user *, usig) { struct __aio_sigset ksig = { NULL, }; struct timespec64 ts; bool interrupted; int ret; if (timeout && unlikely(get_old_timespec32(&ts, timeout))) return -EFAULT; if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); interrupted = signal_pending(current); restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; } #endif #if defined(CONFIG_COMPAT_32BIT_TIME) SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id, __s32, min_nr, __s32, nr, struct io_event __user *, events, struct old_timespec32 __user *, timeout) { struct timespec64 t; int ret; if (timeout && get_old_timespec32(&t, timeout)) return -EFAULT; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? 
&t : NULL); if (!ret && signal_pending(current)) ret = -EINTR; return ret; } #endif #ifdef CONFIG_COMPAT struct __compat_aio_sigset { compat_uptr_t sigmask; compat_size_t sigsetsize; }; #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE6(io_pgetevents, compat_aio_context_t, ctx_id, compat_long_t, min_nr, compat_long_t, nr, struct io_event __user *, events, struct old_timespec32 __user *, timeout, const struct __compat_aio_sigset __user *, usig) { struct __compat_aio_sigset ksig = { 0, }; struct timespec64 t; bool interrupted; int ret; if (timeout && get_old_timespec32(&t, timeout)) return -EFAULT; if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); interrupted = signal_pending(current); restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; } #endif COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64, compat_aio_context_t, ctx_id, compat_long_t, min_nr, compat_long_t, nr, struct io_event __user *, events, struct __kernel_timespec __user *, timeout, const struct __compat_aio_sigset __user *, usig) { struct __compat_aio_sigset ksig = { 0, }; struct timespec64 t; bool interrupted; int ret; if (timeout && get_timespec64(&t, timeout)) return -EFAULT; if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); interrupted = signal_pending(current); restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; } #endif
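The io_submit()/io_getevents() comments above describe the syscall contract. As a hedged illustration (editorial addition, not part of fs/aio.c), the sketch below drives those entry points from user space via the raw syscalls, since glibc provides no wrappers; the file name, buffer size and minimal error handling are assumptions made for the example.

/*
 * Minimal user-space sketch of the io_setup()/io_submit()/io_getevents()
 * flow implemented by the syscall entry points above.  Submits one
 * IOCB_CMD_PREAD against an assumed file and waits for its completion.
 */
#include <fcntl.h>
#include <linux/aio_abi.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	aio_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	char buf[256];
	int fd = open("/etc/hostname", O_RDONLY);	/* assumed example file */

	if (fd < 0 || syscall(SYS_io_setup, 128, &ctx) < 0)
		return 1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_lio_opcode = IOCB_CMD_PREAD;	/* dispatched to aio_read() above */
	cb.aio_fildes	  = fd;
	cb.aio_buf	  = (__u64)(unsigned long)buf;
	cb.aio_nbytes	  = sizeof(buf);
	cb.aio_offset	  = 0;

	/* sys_io_submit() returns the number of iocbs queued (here 0 or 1). */
	if (syscall(SYS_io_submit, ctx, 1, cbs) != 1)
		return 1;

	/* Wait for at least (and at most) one completion event. */
	if (syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL) == 1)
		printf("read %lld bytes\n", (long long)ev.res);

	syscall(SYS_io_destroy, ctx);
	close(fd);
	return 0;
}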
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ #include <linux/ns_common.h> #include <linux/nstree.h> #include <linux/proc_ns.h> #include <linux/user_namespace.h> #include <linux/vfsdebug.h> #ifdef CONFIG_DEBUG_VFS static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) { switch (ns->ns_type) { #ifdef CONFIG_CGROUPS case CLONE_NEWCGROUP: VFS_WARN_ON_ONCE(ops != &cgroupns_operations); break; #endif #ifdef CONFIG_IPC_NS case CLONE_NEWIPC: VFS_WARN_ON_ONCE(ops != &ipcns_operations); break; #endif case CLONE_NEWNS: VFS_WARN_ON_ONCE(ops != &mntns_operations); break; #ifdef CONFIG_NET_NS case CLONE_NEWNET: VFS_WARN_ON_ONCE(ops != &netns_operations); break; #endif #ifdef CONFIG_PID_NS case CLONE_NEWPID: VFS_WARN_ON_ONCE(ops != &pidns_operations); break; #endif #ifdef CONFIG_TIME_NS case CLONE_NEWTIME: VFS_WARN_ON_ONCE(ops != &timens_operations); break; #endif #ifdef CONFIG_USER_NS case CLONE_NEWUSER: VFS_WARN_ON_ONCE(ops != &userns_operations); break; #endif #ifdef CONFIG_UTS_NS case CLONE_NEWUTS: VFS_WARN_ON_ONCE(ops != &utsns_operations); break; #endif } } #endif int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) { int ret = 0; refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; ns->ns_type = ns_type; ns_tree_node_init(&ns->ns_tree_node); ns_tree_node_init(&ns->ns_unified_node); ns_tree_node_init(&ns->ns_owner_node); ns_tree_root_init(&ns->ns_owner_root); #ifdef CONFIG_DEBUG_VFS ns_debug(ns, ops); #endif if (inum) ns->inum = inum; else ret = proc_alloc_inum(&ns->inum); if (ret) return ret; /* * Tree ref starts at 0. It's incremented when namespace enters * active use (installed in nsproxy) and decremented when all * active uses are gone. Initial namespaces are always active.
*/ if (is_ns_init_inum(ns)) atomic_set(&ns->__ns_ref_active, 1); else atomic_set(&ns->__ns_ref_active, 0); return 0; } void __ns_common_free(struct ns_common *ns) { proc_free_inum(ns->inum); } struct ns_common *__must_check ns_owner(struct ns_common *ns) { struct user_namespace *owner; if (unlikely(!ns->ops)) return NULL; VFS_WARN_ON_ONCE(!ns->ops->owner); owner = ns->ops->owner(ns); VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns)); if (!owner) return NULL; /* Skip init_user_ns as it's always active */ if (owner == &init_user_ns) return NULL; return to_ns_common(owner); } /* * The active reference count works by having each namespace that gets * created take a single active reference on its owning user namespace. * That single reference is only released once the child namespace's * active count itself goes down. * * A regular namespace tree might look as follow: * Legend: * + : adding active reference * - : dropping active reference * x : always active (initial namespace) * * * net_ns pid_ns * \ / * + + * user_ns1 (2) * | * ipc_ns | uts_ns * \ | / * + + + * user_ns2 (3) * | * cgroup_ns | mnt_ns * \ | / * x x x * init_user_ns (1) * * If both net_ns and pid_ns put their last active reference on * themselves it will cascade to user_ns1 dropping its own active * reference and dropping one active reference on user_ns2: * * net_ns pid_ns * \ / * - - * user_ns1 (0) * | * ipc_ns | uts_ns * \ | / * + - + * user_ns2 (2) * | * cgroup_ns | mnt_ns * \ | / * x x x * init_user_ns (1) * * The iteration stops once we reach a namespace that still has active * references. */ void __ns_ref_active_put(struct ns_common *ns) { /* Initial namespaces are always active. */ if (is_ns_init_id(ns)) return; if (!atomic_dec_and_test(&ns->__ns_ref_active)) { VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); return; } VFS_WARN_ON_ONCE(is_ns_init_id(ns)); VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); for (;;) { ns = ns_owner(ns); if (!ns) return; VFS_WARN_ON_ONCE(is_ns_init_id(ns)); if (!atomic_dec_and_test(&ns->__ns_ref_active)) { VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); return; } } } /* * The active reference count works by having each namespace that gets * created take a single active reference on its owning user namespace. * That single reference is only released once the child namespace's * active count itself goes down. 
This makes it possible to efficiently * resurrect a namespace tree: * * A regular namespace tree might look as follow: * Legend: * + : adding active reference * - : dropping active reference * x : always active (initial namespace) * * * net_ns pid_ns * \ / * + + * user_ns1 (2) * | * ipc_ns | uts_ns * \ | / * + + + * user_ns2 (3) * | * cgroup_ns | mnt_ns * \ | / * x x x * init_user_ns (1) * * If both net_ns and pid_ns put their last active reference on * themselves it will cascade to user_ns1 dropping its own active * reference and dropping one active reference on user_ns2: * * net_ns pid_ns * \ / * - - * user_ns1 (0) * | * ipc_ns | uts_ns * \ | / * + - + * user_ns2 (2) * | * cgroup_ns | mnt_ns * \ | / * x x x * init_user_ns (1) * * Assume the whole tree is dead but all namespaces are still active: * * net_ns pid_ns * \ / * - - * user_ns1 (0) * | * ipc_ns | uts_ns * \ | / * - - - * user_ns2 (0) * | * cgroup_ns | mnt_ns * \ | / * x x x * init_user_ns (1) * * Now assume the net_ns gets resurrected (.e.g., via the SIOCGSKNS ioctl()): * * net_ns pid_ns * \ / * + - * user_ns1 (0) * | * ipc_ns | uts_ns * \ | / * - + - * user_ns2 (0) * | * cgroup_ns | mnt_ns * \ | / * x x x * init_user_ns (1) * * If net_ns had a zero reference count and we bumped it we also need to * take another reference on its owning user namespace. Similarly, if * pid_ns had a zero reference count it also needs to take another * reference on its owning user namespace. So both net_ns and pid_ns * will each have their own reference on the owning user namespace. * * If the owning user namespace user_ns1 had a zero reference count then * it also needs to take another reference on its owning user namespace * and so on. */ void __ns_ref_active_get(struct ns_common *ns) { int prev; /* Initial namespaces are always active. */ if (is_ns_init_id(ns)) return; /* If we didn't resurrect the namespace we're done. */ prev = atomic_fetch_add(1, &ns->__ns_ref_active); VFS_WARN_ON_ONCE(prev < 0); if (likely(prev)) return; /* * We did resurrect it. Walk the ownership hierarchy upwards * until we found an owning user namespace that is active. */ for (;;) { ns = ns_owner(ns); if (!ns) return; VFS_WARN_ON_ONCE(is_ns_init_id(ns)); prev = atomic_fetch_add(1, &ns->__ns_ref_active); VFS_WARN_ON_ONCE(prev < 0); if (likely(prev)) return; } }
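As a hedged illustration of the cascading active-reference scheme described in the comments above (editorial addition, not kernel code), the toy user-space model below reproduces the put/get cascade on an invented owner chain; all names (toy_ns, toy_put_active, toy_get_active) are made up for the example.

/*
 * Toy model: each namespace holds one active reference on its owning user
 * namespace; dropping the last active ref cascades a put up the ownership
 * chain, and resurrecting a dead namespace cascades a get back up.
 * Single-threaded, plain ints instead of atomics.
 */
#include <stdio.h>

struct toy_ns {
	const char *name;
	struct toy_ns *owner;	/* owning user namespace, NULL for init */
	int active;		/* models ns_common::__ns_ref_active */
};

static void toy_put_active(struct toy_ns *ns)
{
	/* Initial namespaces (no owner) are never decremented. */
	for (; ns && ns->owner; ns = ns->owner)
		if (--ns->active)	/* still active: stop cascading */
			return;
}

static void toy_get_active(struct toy_ns *ns)
{
	/* A 0 -> 1 transition resurrects the namespace and cascades upward. */
	for (; ns && ns->owner; ns = ns->owner)
		if (ns->active++)	/* was already active: stop */
			return;
}

int main(void)
{
	struct toy_ns init_user = { "init_user_ns", NULL, 1 };
	struct toy_ns user2 = { "user_ns2", &init_user, 3 };	/* refs: ipc_ns, uts_ns, user_ns1 */
	struct toy_ns user1 = { "user_ns1", &user2, 2 };	/* refs: net_ns, pid_ns */
	struct toy_ns net = { "net_ns", &user1, 1 };
	struct toy_ns pid = { "pid_ns", &user1, 1 };

	toy_put_active(&net);
	toy_put_active(&pid);	/* cascades: user_ns1 hits 0, user_ns2 drops one ref */
	printf("user_ns1=%d user_ns2=%d\n", user1.active, user2.active);	/* 0 2 */

	toy_get_active(&net);	/* resurrect: re-takes refs up the chain while counts were 0 */
	printf("user_ns1=%d user_ns2=%d\n", user1.active, user2.active);	/* 1 3 */
	return 0;
}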
#undef TRACE_SYSTEM #define TRACE_SYSTEM bridge #if !defined(_TRACE_BRIDGE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BRIDGE_H #include <linux/netdevice.h> #include <linux/tracepoint.h> #include "../../../net/bridge/br_private.h" TRACE_EVENT(br_fdb_add, TP_PROTO(struct ndmsg *ndm, struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags), TP_ARGS(ndm, dev, addr, vid, nlh_flags), TP_STRUCT__entry( __field(u8, ndm_flags) __string(dev, dev->name) __array(unsigned char, addr, ETH_ALEN) __field(u16, vid) __field(u16, nlh_flags) ), TP_fast_assign( __assign_str(dev); memcpy(__entry->addr, addr, ETH_ALEN); __entry->vid = vid; __entry->nlh_flags = nlh_flags; __entry->ndm_flags = ndm->ndm_flags; ), TP_printk("dev %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u nlh_flags %04x ndm_flags %02x", __get_str(dev), __entry->addr[0], __entry->addr[1], __entry->addr[2], __entry->addr[3], __entry->addr[4], __entry->addr[5], __entry->vid, __entry->nlh_flags, __entry->ndm_flags) ); TRACE_EVENT(br_fdb_external_learn_add, TP_PROTO(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid), TP_ARGS(br, p, addr, vid), TP_STRUCT__entry( __string(br_dev, br->dev->name) __string(dev, p ? p->dev->name : "null") __array(unsigned char, addr, ETH_ALEN) __field(u16, vid) ), TP_fast_assign( __assign_str(br_dev); __assign_str(dev); memcpy(__entry->addr, addr, ETH_ALEN); __entry->vid = vid; ), TP_printk("br_dev %s port %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u", __get_str(br_dev), __get_str(dev), __entry->addr[0], __entry->addr[1], __entry->addr[2], __entry->addr[3], __entry->addr[4], __entry->addr[5], __entry->vid) ); TRACE_EVENT(fdb_delete, TP_PROTO(struct net_bridge *br, struct net_bridge_fdb_entry *f), TP_ARGS(br, f), TP_STRUCT__entry( __string(br_dev, br->dev->name) __string(dev, f->dst ?
f->dst->dev->name : "null") __array(unsigned char, addr, ETH_ALEN) __field(u16, vid) ), TP_fast_assign( __assign_str(br_dev); __assign_str(dev); memcpy(__entry->addr, f->key.addr.addr, ETH_ALEN); __entry->vid = f->key.vlan_id; ), TP_printk("br_dev %s dev %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u", __get_str(br_dev), __get_str(dev), __entry->addr[0], __entry->addr[1], __entry->addr[2], __entry->addr[3], __entry->addr[4], __entry->addr[5], __entry->vid) ); TRACE_EVENT(br_fdb_update, TP_PROTO(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, unsigned long flags), TP_ARGS(br, source, addr, vid, flags), TP_STRUCT__entry( __string(br_dev, br->dev->name) __string(dev, source->dev->name) __array(unsigned char, addr, ETH_ALEN) __field(u16, vid) __field(unsigned long, flags) ), TP_fast_assign( __assign_str(br_dev); __assign_str(dev); memcpy(__entry->addr, addr, ETH_ALEN); __entry->vid = vid; __entry->flags = flags; ), TP_printk("br_dev %s source %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u flags 0x%lx", __get_str(br_dev), __get_str(dev), __entry->addr[0], __entry->addr[1], __entry->addr[2], __entry->addr[3], __entry->addr[4], __entry->addr[5], __entry->vid, __entry->flags) ); TRACE_EVENT(br_mdb_full, TP_PROTO(const struct net_device *dev, const struct br_ip *group), TP_ARGS(dev, group), TP_STRUCT__entry( __string(dev, dev->name) __field(int, af) __field(u16, vid) __array(__u8, src, 16) __array(__u8, grp, 16) __array(__u8, grpmac, ETH_ALEN) /* For af == 0. */ ), TP_fast_assign( struct in6_addr *in6; __assign_str(dev); __entry->vid = group->vid; if (!group->proto) { __entry->af = 0; memset(__entry->src, 0, sizeof(__entry->src)); memset(__entry->grp, 0, sizeof(__entry->grp)); memcpy(__entry->grpmac, group->dst.mac_addr, ETH_ALEN); } else if (group->proto == htons(ETH_P_IP)) { __entry->af = AF_INET; in6 = (struct in6_addr *)__entry->src; ipv6_addr_set_v4mapped(group->src.ip4, in6); in6 = (struct in6_addr *)__entry->grp; ipv6_addr_set_v4mapped(group->dst.ip4, in6); memset(__entry->grpmac, 0, ETH_ALEN); #if IS_ENABLED(CONFIG_IPV6) } else { __entry->af = AF_INET6; in6 = (struct in6_addr *)__entry->src; *in6 = group->src.ip6; in6 = (struct in6_addr *)__entry->grp; *in6 = group->dst.ip6; memset(__entry->grpmac, 0, ETH_ALEN); #endif } ), TP_printk("dev %s af %u src %pI6c grp %pI6c/%pM vid %u", __get_str(dev), __entry->af, __entry->src, __entry->grp, __entry->grpmac, __entry->vid) ); #endif /* _TRACE_BRIDGE_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
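For context (editorial addition, not part of this header): each TRACE_EVENT() above expands to a trace_<name>() inline hook that bridge code calls at the instrumented spot. A hedged sketch of such a call site for br_fdb_update is shown below; the wrapper function and its arguments are invented for illustration and are not the actual net/bridge code.

/* Sketch of a caller inside a bridge source file that includes this header. */
#include <trace/events/bridge.h>

static void example_fdb_learn(struct net_bridge *br,
			      struct net_bridge_port *source,
			      const unsigned char *addr, u16 vid,
			      unsigned long flags)
{
	/*
	 * Fires the br_fdb_update tracepoint declared above; compiles to a
	 * near no-op when the tracepoint is disabled, so it is cheap on the
	 * learning fast path.
	 */
	trace_br_fdb_update(br, source, addr, vid, flags);
}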
// SPDX-License-Identifier: GPL-2.0-only /* * CAIF Interface registration. * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland * * Borrowed heavily from file: pn_dev.c.
Thanks to Remi Denis-Courmont * and Sakari Ailus <sakari.ailus@nokia.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/kernel.h> #include <linux/if_arp.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/mutex.h> #include <linux/module.h> #include <linux/spinlock.h> #include <net/netns/generic.h> #include <net/net_namespace.h> #include <net/pkt_sched.h> #include <net/caif/caif_device.h> #include <net/caif/caif_layer.h> #include <net/caif/caif_dev.h> #include <net/caif/cfpkt.h> #include <net/caif/cfcnfg.h> #include <net/caif/cfserl.h> MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol support"); MODULE_LICENSE("GPL"); /* Used for local tracking of the CAIF net devices */ struct caif_device_entry { struct cflayer layer; struct list_head list; struct net_device *netdev; int __percpu *pcpu_refcnt; spinlock_t flow_lock; struct sk_buff *xoff_skb; void (*xoff_skb_dtor)(struct sk_buff *skb); bool xoff; }; struct caif_device_entry_list { struct list_head list; /* Protects simulanous deletes in list */ struct mutex lock; }; struct caif_net { struct cfcnfg *cfg; struct caif_device_entry_list caifdevs; }; static unsigned int caif_net_id; static int q_high = 50; /* Percent */ struct cfcnfg *get_cfcnfg(struct net *net) { struct caif_net *caifn; caifn = net_generic(net, caif_net_id); return caifn->cfg; } EXPORT_SYMBOL(get_cfcnfg); static struct caif_device_entry_list *caif_device_list(struct net *net) { struct caif_net *caifn; caifn = net_generic(net, caif_net_id); return &caifn->caifdevs; } static void caifd_put(struct caif_device_entry *e) { this_cpu_dec(*e->pcpu_refcnt); } static void caifd_hold(struct caif_device_entry *e) { this_cpu_inc(*e->pcpu_refcnt); } static int caifd_refcnt_read(struct caif_device_entry *e) { int i, refcnt = 0; for_each_possible_cpu(i) refcnt += *per_cpu_ptr(e->pcpu_refcnt, i); return refcnt; } /* Allocate new CAIF device. 
*/ static struct caif_device_entry *caif_device_alloc(struct net_device *dev) { struct caif_device_entry *caifd; caifd = kzalloc(sizeof(*caifd), GFP_KERNEL); if (!caifd) return NULL; caifd->pcpu_refcnt = alloc_percpu(int); if (!caifd->pcpu_refcnt) { kfree(caifd); return NULL; } caifd->netdev = dev; dev_hold(dev); return caifd; } static struct caif_device_entry *caif_get(struct net_device *dev) { struct caif_device_entry_list *caifdevs = caif_device_list(dev_net(dev)); struct caif_device_entry *caifd; list_for_each_entry_rcu(caifd, &caifdevs->list, list, lockdep_rtnl_is_held()) { if (caifd->netdev == dev) return caifd; } return NULL; } static void caif_flow_cb(struct sk_buff *skb) { struct caif_device_entry *caifd; void (*dtor)(struct sk_buff *skb) = NULL; bool send_xoff; WARN_ON(skb->dev == NULL); rcu_read_lock(); caifd = caif_get(skb->dev); WARN_ON(caifd == NULL); if (!caifd) { rcu_read_unlock(); return; } caifd_hold(caifd); rcu_read_unlock(); spin_lock_bh(&caifd->flow_lock); send_xoff = caifd->xoff; caifd->xoff = false; dtor = caifd->xoff_skb_dtor; if (WARN_ON(caifd->xoff_skb != skb)) skb = NULL; caifd->xoff_skb = NULL; caifd->xoff_skb_dtor = NULL; spin_unlock_bh(&caifd->flow_lock); if (dtor && skb) dtor(skb); if (send_xoff) caifd->layer.up-> ctrlcmd(caifd->layer.up, _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND, caifd->layer.id); caifd_put(caifd); } static int transmit(struct cflayer *layer, struct cfpkt *pkt) { int err, high = 0, qlen = 0; struct caif_device_entry *caifd = container_of(layer, struct caif_device_entry, layer); struct sk_buff *skb; struct netdev_queue *txq; rcu_read_lock_bh(); skb = cfpkt_tonative(pkt); skb->dev = caifd->netdev; skb_reset_network_header(skb); skb->protocol = htons(ETH_P_CAIF); /* Check if we need to handle xoff */ if (likely(caifd->netdev->priv_flags & IFF_NO_QUEUE)) goto noxoff; if (unlikely(caifd->xoff)) goto noxoff; if (likely(!netif_queue_stopped(caifd->netdev))) { struct Qdisc *sch; /* If we run with a TX queue, check if the queue is too long*/ txq = netdev_get_tx_queue(skb->dev, 0); sch = rcu_dereference_bh(txq->qdisc); if (likely(qdisc_is_empty(sch))) goto noxoff; /* can check for explicit qdisc len value only !NOLOCK, * always set flow off otherwise */ high = (caifd->netdev->tx_queue_len * q_high) / 100; if (!(sch->flags & TCQ_F_NOLOCK) && likely(sch->q.qlen < high)) goto noxoff; } /* Hold lock while accessing xoff */ spin_lock_bh(&caifd->flow_lock); if (caifd->xoff) { spin_unlock_bh(&caifd->flow_lock); goto noxoff; } /* * Handle flow off, we do this by temporary hi-jacking this * skb's destructor function, and replace it with our own * flow-on callback. The callback will set flow-on and call * the original destructor. */ pr_debug("queue has stopped(%d) or is full (%d > %d)\n", netif_queue_stopped(caifd->netdev), qlen, high); caifd->xoff = true; caifd->xoff_skb = skb; caifd->xoff_skb_dtor = skb->destructor; skb->destructor = caif_flow_cb; spin_unlock_bh(&caifd->flow_lock); caifd->layer.up->ctrlcmd(caifd->layer.up, _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND, caifd->layer.id); noxoff: rcu_read_unlock_bh(); err = dev_queue_xmit(skb); if (err > 0) err = -EIO; return err; } /* * Stuff received packets into the CAIF stack. * On error, returns non-zero and releases the skb. 
*/ static int receive(struct sk_buff *skb, struct net_device *dev, struct packet_type *pkttype, struct net_device *orig_dev) { struct cfpkt *pkt; struct caif_device_entry *caifd; int err; pkt = cfpkt_fromnative(CAIF_DIR_IN, skb); rcu_read_lock(); caifd = caif_get(dev); if (!caifd || !caifd->layer.up || !caifd->layer.up->receive || !netif_oper_up(caifd->netdev)) { rcu_read_unlock(); kfree_skb(skb); return NET_RX_DROP; } /* Hold reference to netdevice while using CAIF stack */ caifd_hold(caifd); rcu_read_unlock(); err = caifd->layer.up->receive(caifd->layer.up, pkt); /* For -EILSEQ the packet is not freed so free it now */ if (err == -EILSEQ) cfpkt_destroy(pkt); /* Release reference to stack upwards */ caifd_put(caifd); if (err != 0) err = NET_RX_DROP; return err; } static struct packet_type caif_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_CAIF), .func = receive, }; static void dev_flowctrl(struct net_device *dev, int on) { struct caif_device_entry *caifd; rcu_read_lock(); caifd = caif_get(dev); if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) { rcu_read_unlock(); return; } caifd_hold(caifd); rcu_read_unlock(); caifd->layer.up->ctrlcmd(caifd->layer.up, on ? _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND : _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND, caifd->layer.id); caifd_put(caifd); } int caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, struct cflayer *link_support, int head_room, struct cflayer **layer, int (**rcv_func)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *)) { struct caif_device_entry *caifd; enum cfcnfg_phy_preference pref; struct cfcnfg *cfg = get_cfcnfg(dev_net(dev)); struct caif_device_entry_list *caifdevs; int res; caifdevs = caif_device_list(dev_net(dev)); caifd = caif_device_alloc(dev); if (!caifd) return -ENOMEM; *layer = &caifd->layer; spin_lock_init(&caifd->flow_lock); switch (caifdev->link_select) { case CAIF_LINK_HIGH_BANDW: pref = CFPHYPREF_HIGH_BW; break; case CAIF_LINK_LOW_LATENCY: pref = CFPHYPREF_LOW_LAT; break; default: pref = CFPHYPREF_HIGH_BW; break; } mutex_lock(&caifdevs->lock); list_add_rcu(&caifd->list, &caifdevs->list); strscpy(caifd->layer.name, dev->name, sizeof(caifd->layer.name)); caifd->layer.transmit = transmit; res = cfcnfg_add_phy_layer(cfg, dev, &caifd->layer, pref, link_support, caifdev->use_fcs, head_room); mutex_unlock(&caifdevs->lock); if (rcv_func) *rcv_func = receive; return res; } EXPORT_SYMBOL(caif_enroll_dev); /* notify Caif of device events */ static int caif_device_notify(struct notifier_block *me, unsigned long what, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct caif_device_entry *caifd = NULL; struct caif_dev_common *caifdev; struct cfcnfg *cfg; struct cflayer *layer, *link_support; int head_room = 0; struct caif_device_entry_list *caifdevs; int res; cfg = get_cfcnfg(dev_net(dev)); caifdevs = caif_device_list(dev_net(dev)); caifd = caif_get(dev); if (caifd == NULL && dev->type != ARPHRD_CAIF) return 0; switch (what) { case NETDEV_REGISTER: if (caifd != NULL) break; caifdev = netdev_priv(dev); link_support = NULL; if (caifdev->use_frag) { head_room = 1; link_support = cfserl_create(dev->ifindex, caifdev->use_stx); if (!link_support) { pr_warn("Out of memory\n"); break; } } res = caif_enroll_dev(dev, caifdev, link_support, head_room, &layer, NULL); if (res) cfserl_release(link_support); caifdev->flowctrl = dev_flowctrl; break; case NETDEV_UP: rcu_read_lock(); caifd = caif_get(dev); if (caifd == NULL) { rcu_read_unlock(); break; } caifd->xoff = 
false; cfcnfg_set_phy_state(cfg, &caifd->layer, true); rcu_read_unlock(); break; case NETDEV_DOWN: rcu_read_lock(); caifd = caif_get(dev); if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) { rcu_read_unlock(); return -EINVAL; } cfcnfg_set_phy_state(cfg, &caifd->layer, false); caifd_hold(caifd); rcu_read_unlock(); caifd->layer.up->ctrlcmd(caifd->layer.up, _CAIF_CTRLCMD_PHYIF_DOWN_IND, caifd->layer.id); spin_lock_bh(&caifd->flow_lock); /* * Replace our xoff-destructor with original destructor. * We trust that skb->destructor *always* is called before * the skb reference is invalid. The hijacked SKB destructor * takes the flow_lock so manipulating the skb->destructor here * should be safe. */ if (caifd->xoff_skb_dtor != NULL && caifd->xoff_skb != NULL) caifd->xoff_skb->destructor = caifd->xoff_skb_dtor; caifd->xoff = false; caifd->xoff_skb_dtor = NULL; caifd->xoff_skb = NULL; spin_unlock_bh(&caifd->flow_lock); caifd_put(caifd); break; case NETDEV_UNREGISTER: mutex_lock(&caifdevs->lock); caifd = caif_get(dev); if (caifd == NULL) { mutex_unlock(&caifdevs->lock); break; } list_del_rcu(&caifd->list); /* * NETDEV_UNREGISTER is called repeatedly until all reference * counts for the net-device are released. If references to * caifd is taken, simply ignore NETDEV_UNREGISTER and wait for * the next call to NETDEV_UNREGISTER. * * If any packets are in flight down the CAIF Stack, * cfcnfg_del_phy_layer will return nonzero. * If no packets are in flight, the CAIF Stack associated * with the net-device un-registering is freed. */ if (caifd_refcnt_read(caifd) != 0 || cfcnfg_del_phy_layer(cfg, &caifd->layer) != 0) { pr_info("Wait for device inuse\n"); /* Enrole device if CAIF Stack is still in use */ list_add_rcu(&caifd->list, &caifdevs->list); mutex_unlock(&caifdevs->lock); break; } synchronize_rcu(); dev_put(caifd->netdev); free_percpu(caifd->pcpu_refcnt); kfree(caifd); mutex_unlock(&caifdevs->lock); break; } return 0; } static struct notifier_block caif_device_notifier = { .notifier_call = caif_device_notify, .priority = 0, }; /* Per-namespace Caif devices handling */ static int caif_init_net(struct net *net) { struct caif_net *caifn = net_generic(net, caif_net_id); INIT_LIST_HEAD(&caifn->caifdevs.list); mutex_init(&caifn->caifdevs.lock); caifn->cfg = cfcnfg_create(); if (!caifn->cfg) return -ENOMEM; return 0; } static void caif_exit_net(struct net *net) { struct caif_device_entry *caifd, *tmp; struct caif_device_entry_list *caifdevs = caif_device_list(net); struct cfcnfg *cfg = get_cfcnfg(net); rtnl_lock(); mutex_lock(&caifdevs->lock); list_for_each_entry_safe(caifd, tmp, &caifdevs->list, list) { int i = 0; list_del_rcu(&caifd->list); cfcnfg_set_phy_state(cfg, &caifd->layer, false); while (i < 10 && (caifd_refcnt_read(caifd) != 0 || cfcnfg_del_phy_layer(cfg, &caifd->layer) != 0)) { pr_info("Wait for device inuse\n"); msleep(250); i++; } synchronize_rcu(); dev_put(caifd->netdev); free_percpu(caifd->pcpu_refcnt); kfree(caifd); } cfcnfg_remove(cfg); mutex_unlock(&caifdevs->lock); rtnl_unlock(); } static struct pernet_operations caif_net_ops = { .init = caif_init_net, .exit = caif_exit_net, .id = &caif_net_id, .size = sizeof(struct caif_net), }; /* Initialize Caif devices list */ static int __init caif_device_init(void) { int result; result = register_pernet_subsys(&caif_net_ops); if (result) return result; register_netdevice_notifier(&caif_device_notifier); dev_add_pack(&caif_packet_type); return result; } static void __exit caif_device_exit(void) { 
unregister_netdevice_notifier(&caif_device_notifier); dev_remove_pack(&caif_packet_type); unregister_pernet_subsys(&caif_net_ops); } module_init(caif_device_init); module_exit(caif_device_exit);
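As a hedged aside (editorial addition, not part of caif_dev.c): caifd_hold()/caifd_put()/caifd_refcnt_read() above implement a per-CPU reference counter, where each CPU only touches its own slot and the true count is the sum over all slots. The toy user-space model below illustrates that pattern; NR_TOY_CPUS and the toy_* names are invented for the example.

/*
 * Toy model of a per-CPU refcount: hold/put never contend because each
 * "CPU" updates only its own counter (which may go negative); reading the
 * real count requires summing every slot, as the NETDEV_UNREGISTER path
 * does before freeing the device entry.
 */
#include <stdio.h>

#define NR_TOY_CPUS 4

struct toy_entry {
	long refcnt[NR_TOY_CPUS];	/* models the alloc_percpu(int) counter */
};

static void toy_hold(struct toy_entry *e, int cpu) { e->refcnt[cpu]++; }
static void toy_put(struct toy_entry *e, int cpu)  { e->refcnt[cpu]--; }

static long toy_refcnt_read(const struct toy_entry *e)
{
	long sum = 0;

	for (int cpu = 0; cpu < NR_TOY_CPUS; cpu++)
		sum += e->refcnt[cpu];
	return sum;		/* only the sum across CPUs is meaningful */
}

int main(void)
{
	struct toy_entry e = { { 0 } };

	toy_hold(&e, 0);	/* e.g. receive() pinning the entry on CPU 0 */
	toy_put(&e, 2);		/* ...and releasing it later on CPU 2 */
	printf("refcnt = %ld\n", toy_refcnt_read(&e));	/* prints 0 */
	return 0;
}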
// SPDX-License-Identifier: GPL-2.0-or-later /* AF_RXRPC implementation * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/net.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/random.h> #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/key-type.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/af_rxrpc.h> #define CREATE_TRACE_POINTS #include "ar-internal.h" MODULE_DESCRIPTION("RxRPC network protocol"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_RXRPC); unsigned int rxrpc_debug; // = RXRPC_DEBUG_KPROTO; module_param_named(debug, rxrpc_debug, uint, 0644); MODULE_PARM_DESC(debug, "RxRPC debugging mask"); static struct proto rxrpc_proto; static const struct proto_ops rxrpc_rpc_ops; /* current debugging ID */ atomic_t rxrpc_debug_id; EXPORT_SYMBOL(rxrpc_debug_id); /* count of skbs currently in use */ atomic_t rxrpc_n_rx_skbs; struct workqueue_struct *rxrpc_workqueue; static void rxrpc_sock_destructor(struct sock *); /* * see if an RxRPC socket is currently writable */ static inline int rxrpc_writable(struct sock *sk) { return refcount_read(&sk->sk_wmem_alloc) < (size_t) sk->sk_sndbuf; } /* * wait for write bufferage to become available */ static void rxrpc_write_space(struct sock *sk) { _enter("%p", sk); rcu_read_lock(); if (rxrpc_writable(sk)) { struct socket_wq *wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } /* * validate an RxRPC address */ static int rxrpc_validate_address(struct rxrpc_sock *rx, struct sockaddr_rxrpc *srx, int len) { unsigned int tail; if (len < sizeof(struct sockaddr_rxrpc)) return -EINVAL; if (srx->srx_family != AF_RXRPC) return -EAFNOSUPPORT; if (srx->transport_type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; len -= offsetof(struct sockaddr_rxrpc, transport); if (srx->transport_len < sizeof(sa_family_t) || srx->transport_len > len) return -EINVAL; switch (srx->transport.family) { case AF_INET: if (rx->family != AF_INET && rx->family != AF_INET6) return -EAFNOSUPPORT; if (srx->transport_len <
sizeof(struct sockaddr_in)) return -EINVAL; tail = offsetof(struct sockaddr_rxrpc, transport.sin.__pad); break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: if (rx->family != AF_INET6) return -EAFNOSUPPORT; if (srx->transport_len < sizeof(struct sockaddr_in6)) return -EINVAL; tail = offsetof(struct sockaddr_rxrpc, transport) + sizeof(struct sockaddr_in6); break; #endif default: return -EAFNOSUPPORT; } if (tail < len) memset((void *)srx + tail, 0, len - tail); _debug("INET: %pISp", &srx->transport); return 0; } /* * bind a local address to an RxRPC socket */ static int rxrpc_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) { struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr; struct rxrpc_local *local; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); u16 service_id; int ret; _enter("%p,%p,%d", rx, saddr, len); ret = rxrpc_validate_address(rx, srx, len); if (ret < 0) goto error; service_id = srx->srx_service; lock_sock(&rx->sk); switch (rx->sk.sk_state) { case RXRPC_UNBOUND: rx->srx = *srx; local = rxrpc_lookup_local(sock_net(&rx->sk), &rx->srx); if (IS_ERR(local)) { ret = PTR_ERR(local); goto error_unlock; } if (service_id) { write_lock(&local->services_lock); if (local->service) goto service_in_use; rx->local = local; local->service = rx; write_unlock(&local->services_lock); rx->sk.sk_state = RXRPC_SERVER_BOUND; } else { rx->local = local; rx->sk.sk_state = RXRPC_CLIENT_BOUND; } break; case RXRPC_SERVER_BOUND: ret = -EINVAL; if (service_id == 0) goto error_unlock; ret = -EADDRINUSE; if (service_id == rx->srx.srx_service) goto error_unlock; ret = -EINVAL; srx->srx_service = rx->srx.srx_service; if (memcmp(srx, &rx->srx, sizeof(*srx)) != 0) goto error_unlock; rx->second_service = service_id; rx->sk.sk_state = RXRPC_SERVER_BOUND2; break; default: ret = -EINVAL; goto error_unlock; } release_sock(&rx->sk); _leave(" = 0"); return 0; service_in_use: write_unlock(&local->services_lock); rxrpc_unuse_local(local, rxrpc_local_unuse_bind); rxrpc_put_local(local, rxrpc_local_put_bind); ret = -EADDRINUSE; error_unlock: release_sock(&rx->sk); error: _leave(" = %d", ret); return ret; } /* * set the number of pending calls permitted on a listening socket */ static int rxrpc_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; struct rxrpc_sock *rx = rxrpc_sk(sk); unsigned int max, old; int ret; _enter("%p,%d", rx, backlog); lock_sock(&rx->sk); switch (rx->sk.sk_state) { case RXRPC_UNBOUND: ret = -EADDRNOTAVAIL; break; case RXRPC_SERVER_BOUND: case RXRPC_SERVER_BOUND2: ASSERT(rx->local != NULL); max = READ_ONCE(rxrpc_max_backlog); ret = -EINVAL; if (backlog == INT_MAX) backlog = max; else if (backlog < 0 || backlog > max) break; old = sk->sk_max_ack_backlog; sk->sk_max_ack_backlog = backlog; ret = rxrpc_service_prealloc(rx, GFP_KERNEL); if (ret == 0) rx->sk.sk_state = RXRPC_SERVER_LISTENING; else sk->sk_max_ack_backlog = old; break; case RXRPC_SERVER_LISTENING: if (backlog == 0) { rx->sk.sk_state = RXRPC_SERVER_LISTEN_DISABLED; sk->sk_max_ack_backlog = 0; rxrpc_discard_prealloc(rx); ret = 0; break; } fallthrough; default: ret = -EBUSY; break; } release_sock(&rx->sk); _leave(" = %d", ret); return ret; } /** * rxrpc_kernel_lookup_peer - Obtain remote transport endpoint for an address * @sock: The socket through which it will be accessed * @srx: The network address * @gfp: Allocation flags * * Lookup or create a remote transport endpoint record for the specified * address. 
* * Return: The peer record found with a reference, %NULL if no record is found * or a negative error code if the address is invalid or unsupported. */ struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock, struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); int ret; ret = rxrpc_validate_address(rx, srx, sizeof(*srx)); if (ret < 0) return ERR_PTR(ret); return rxrpc_lookup_peer(rx->local, srx, gfp); } EXPORT_SYMBOL(rxrpc_kernel_lookup_peer); /** * rxrpc_kernel_get_peer - Get a reference on a peer * @peer: The peer to get a reference on (may be NULL). * * Get a reference for a remote peer record (if not NULL). * * Return: The @peer argument. */ struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer) { return peer ? rxrpc_get_peer(peer, rxrpc_peer_get_application) : NULL; } EXPORT_SYMBOL(rxrpc_kernel_get_peer); /** * rxrpc_kernel_put_peer - Allow a kernel app to drop a peer reference * @peer: The peer to drop a ref on * * Drop a reference on a peer record. */ void rxrpc_kernel_put_peer(struct rxrpc_peer *peer) { rxrpc_put_peer(peer, rxrpc_peer_put_application); } EXPORT_SYMBOL(rxrpc_kernel_put_peer); /** * rxrpc_kernel_begin_call - Allow a kernel service to begin a call * @sock: The socket on which to make the call * @peer: The peer to contact * @key: The security context to use (defaults to socket setting) * @user_call_ID: The ID to use * @tx_total_len: Total length of data to transmit during the call (or -1) * @hard_timeout: The maximum lifespan of the call in sec * @gfp: The allocation constraints * @notify_rx: Where to send notifications instead of socket queue * @service_id: The ID of the service to contact * @upgrade: Request service upgrade for call * @interruptibility: The call is interruptible, or can be canceled. * @debug_id: The debug ID for tracing to be assigned to the call * * Allow a kernel service to begin a call on the nominated socket. This just * sets up all the internal tracking structures and allocates connection and * call IDs as appropriate. * * The default socket destination address and security may be overridden by * supplying @srx and @key. * * Return: The new call or an error code. */ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct rxrpc_peer *peer, struct key *key, unsigned long user_call_ID, s64 tx_total_len, u32 hard_timeout, gfp_t gfp, rxrpc_notify_rx_t notify_rx, u16 service_id, bool upgrade, enum rxrpc_interruptibility interruptibility, unsigned int debug_id) { struct rxrpc_conn_parameters cp; struct rxrpc_call_params p; struct rxrpc_call *call; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); _enter(",,%x,%lx", key_serial(key), user_call_ID); if (WARN_ON_ONCE(peer->local != rx->local)) return ERR_PTR(-EIO); lock_sock(&rx->sk); if (!key) key = rx->key; if (key && !key->payload.data[0]) key = NULL; /* a no-security key */ memset(&p, 0, sizeof(p)); p.user_call_ID = user_call_ID; p.tx_total_len = tx_total_len; p.interruptibility = interruptibility; p.kernel = true; p.timeouts.hard = hard_timeout; memset(&cp, 0, sizeof(cp)); cp.local = rx->local; cp.peer = peer; cp.key = key; cp.security_level = rx->min_sec_level; cp.exclusive = false; cp.upgrade = upgrade; cp.service_id = service_id; call = rxrpc_new_client_call(rx, &cp, &p, gfp, debug_id); /* The socket has been unlocked. 
*/ if (!IS_ERR(call)) { call->notify_rx = notify_rx; mutex_unlock(&call->user_mutex); } _leave(" = %p", call); return call; } EXPORT_SYMBOL(rxrpc_kernel_begin_call); /* * Dummy function used to stop the notifier talking to recvmsg(). */ static void rxrpc_dummy_notify_rx(struct sock *sk, struct rxrpc_call *rxcall, unsigned long call_user_ID) { } /** * rxrpc_kernel_shutdown_call - Allow a kernel service to shut down a call it was using * @sock: The socket the call is on * @call: The call to end * * Allow a kernel service to shut down a call it was using. The call must be * complete before this is called (the call should be aborted if necessary). */ void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call) { _enter("%d{%d}", call->debug_id, refcount_read(&call->ref)); mutex_lock(&call->user_mutex); if (!test_bit(RXRPC_CALL_RELEASED, &call->flags)) { rxrpc_release_call(rxrpc_sk(sock->sk), call); /* Make sure we're not going to call back into a kernel service */ if (call->notify_rx) { spin_lock_irq(&call->notify_lock); call->notify_rx = rxrpc_dummy_notify_rx; spin_unlock_irq(&call->notify_lock); } } mutex_unlock(&call->user_mutex); } EXPORT_SYMBOL(rxrpc_kernel_shutdown_call); /** * rxrpc_kernel_put_call - Release a reference to a call * @sock: The socket the call is on * @call: The call to put * * Drop the application's ref on an rxrpc call. */ void rxrpc_kernel_put_call(struct socket *sock, struct rxrpc_call *call) { rxrpc_put_call(call, rxrpc_call_put_kernel); } EXPORT_SYMBOL(rxrpc_kernel_put_call); /** * rxrpc_kernel_check_life - Check to see whether a call is still alive * @sock: The socket the call is on * @call: The call to check * * Allow a kernel service to find out whether a call is still alive - whether * it has completed successfully and all received data has been consumed. * * Return: %true if the call is still ongoing and %false if it has completed. */ bool rxrpc_kernel_check_life(const struct socket *sock, const struct rxrpc_call *call) { if (!rxrpc_call_is_complete(call)) return true; if (call->completion != RXRPC_CALL_SUCCEEDED) return false; return !skb_queue_empty(&call->recvmsg_queue); } EXPORT_SYMBOL(rxrpc_kernel_check_life); /** * rxrpc_kernel_set_notifications - Set table of callback operations * @sock: The socket to install table upon * @app_ops: Callback operation table to set * * Allow a kernel service to set a table of event notifications on a socket. 
*/ void rxrpc_kernel_set_notifications(struct socket *sock, const struct rxrpc_kernel_ops *app_ops) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); rx->app_ops = app_ops; } EXPORT_SYMBOL(rxrpc_kernel_set_notifications); /* * connect an RxRPC socket * - this just targets it at a specific destination; no actual connection * negotiation takes place */ static int rxrpc_connect(struct socket *sock, struct sockaddr_unsized *addr, int addr_len, int flags) { struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)addr; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); int ret; _enter("%p,%p,%d,%d", rx, addr, addr_len, flags); ret = rxrpc_validate_address(rx, srx, addr_len); if (ret < 0) { _leave(" = %d [bad addr]", ret); return ret; } lock_sock(&rx->sk); ret = -EISCONN; if (test_bit(RXRPC_SOCK_CONNECTED, &rx->flags)) goto error; switch (rx->sk.sk_state) { case RXRPC_UNBOUND: rx->sk.sk_state = RXRPC_CLIENT_UNBOUND; break; case RXRPC_CLIENT_UNBOUND: case RXRPC_CLIENT_BOUND: break; default: ret = -EBUSY; goto error; } rx->connect_srx = *srx; set_bit(RXRPC_SOCK_CONNECTED, &rx->flags); ret = 0; error: release_sock(&rx->sk); return ret; } /* * send a message through an RxRPC socket * - in a client this does a number of things: * - finds/sets up a connection for the security specified (if any) * - initiates a call (ID in control data) * - ends the request phase of a call (if MSG_MORE is not set) * - sends a call data packet * - may send an abort (abort code in control data) */ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) { struct rxrpc_local *local; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); int ret; _enter(",{%d},,%zu", rx->sk.sk_state, len); if (m->msg_flags & MSG_OOB) return -EOPNOTSUPP; if (m->msg_name) { ret = rxrpc_validate_address(rx, m->msg_name, m->msg_namelen); if (ret < 0) { _leave(" = %d [bad addr]", ret); return ret; } } lock_sock(&rx->sk); switch (rx->sk.sk_state) { case RXRPC_UNBOUND: case RXRPC_CLIENT_UNBOUND: rx->srx.srx_family = AF_RXRPC; rx->srx.srx_service = 0; rx->srx.transport_type = SOCK_DGRAM; rx->srx.transport.family = rx->family; switch (rx->family) { case AF_INET: rx->srx.transport_len = sizeof(struct sockaddr_in); break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: rx->srx.transport_len = sizeof(struct sockaddr_in6); break; #endif default: ret = -EAFNOSUPPORT; goto error_unlock; } local = rxrpc_lookup_local(sock_net(sock->sk), &rx->srx); if (IS_ERR(local)) { ret = PTR_ERR(local); goto error_unlock; } rx->local = local; rx->sk.sk_state = RXRPC_CLIENT_BOUND; fallthrough; case RXRPC_CLIENT_BOUND: if (!m->msg_name && test_bit(RXRPC_SOCK_CONNECTED, &rx->flags)) { m->msg_name = &rx->connect_srx; m->msg_namelen = sizeof(rx->connect_srx); } fallthrough; case RXRPC_SERVER_BOUND: case RXRPC_SERVER_LISTENING: if (m->msg_flags & MSG_OOB) ret = rxrpc_sendmsg_oob(rx, m, len); else ret = rxrpc_do_sendmsg(rx, m, len); /* The socket has been unlocked */ goto out; default: ret = -EINVAL; goto error_unlock; } error_unlock: release_sock(&rx->sk); out: _leave(" = %d", ret); return ret; } int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val) { if (sk->sk_state != RXRPC_UNBOUND) return -EISCONN; if (val > RXRPC_SECURITY_MAX) return -EINVAL; lock_sock(sk); rxrpc_sk(sk)->min_sec_level = val; release_sock(sk); return 0; } EXPORT_SYMBOL(rxrpc_sock_set_min_security_level); /* * set RxRPC socket options */ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); 
unsigned int min_sec_level, val; u16 service_upgrade[2]; int ret; _enter(",%d,%d,,%d", level, optname, optlen); lock_sock(&rx->sk); ret = -EOPNOTSUPP; if (level == SOL_RXRPC) { switch (optname) { case RXRPC_EXCLUSIVE_CONNECTION: ret = -EINVAL; if (optlen != 0) goto error; ret = -EISCONN; if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; rx->exclusive = true; goto success; case RXRPC_SECURITY_KEY: ret = -EINVAL; if (rx->key) goto error; ret = -EISCONN; if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; ret = rxrpc_request_key(rx, optval, optlen); goto error; case RXRPC_SECURITY_KEYRING: ret = -EINVAL; if (rx->key) goto error; ret = -EISCONN; if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; ret = rxrpc_server_keyring(rx, optval, optlen); goto error; case RXRPC_MIN_SECURITY_LEVEL: ret = -EINVAL; if (optlen != sizeof(unsigned int)) goto error; ret = -EISCONN; if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; ret = copy_safe_from_sockptr(&min_sec_level, sizeof(min_sec_level), optval, optlen); if (ret) goto error; ret = -EINVAL; if (min_sec_level > RXRPC_SECURITY_MAX) goto error; rx->min_sec_level = min_sec_level; goto success; case RXRPC_UPGRADEABLE_SERVICE: ret = -EINVAL; if (optlen != sizeof(service_upgrade) || rx->service_upgrade.from != 0) goto error; ret = -EISCONN; if (rx->sk.sk_state != RXRPC_SERVER_BOUND2) goto error; ret = -EFAULT; if (copy_from_sockptr(service_upgrade, optval, sizeof(service_upgrade)) != 0) goto error; ret = -EINVAL; if ((service_upgrade[0] != rx->srx.srx_service || service_upgrade[1] != rx->second_service) && (service_upgrade[0] != rx->second_service || service_upgrade[1] != rx->srx.srx_service)) goto error; rx->service_upgrade.from = service_upgrade[0]; rx->service_upgrade.to = service_upgrade[1]; goto success; case RXRPC_MANAGE_RESPONSE: ret = -EINVAL; if (optlen != sizeof(unsigned int)) goto error; ret = -EISCONN; if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; ret = copy_safe_from_sockptr(&val, sizeof(val), optval, optlen); if (ret) goto error; ret = -EINVAL; if (val > 1) goto error; if (val) set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); else clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); goto success; default: break; } } success: ret = 0; error: release_sock(&rx->sk); return ret; } /* * Get socket options. 
*/ static int rxrpc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *_optlen) { int optlen; if (level != SOL_RXRPC) return -EOPNOTSUPP; if (get_user(optlen, _optlen)) return -EFAULT; switch (optname) { case RXRPC_SUPPORTED_CMSG: if (optlen < sizeof(int)) return -ETOOSMALL; if (put_user(RXRPC__SUPPORTED - 1, (int __user *)optval) || put_user(sizeof(int), _optlen)) return -EFAULT; return 0; default: return -EOPNOTSUPP; } } /* * permit an RxRPC socket to be polled */ static __poll_t rxrpc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct rxrpc_sock *rx = rxrpc_sk(sk); __poll_t mask; sock_poll_wait(file, sock, wait); mask = 0; /* the socket is readable if there are any messages waiting on the Rx * queue */ if (!list_empty(&rx->recvmsg_q)) mask |= EPOLLIN | EPOLLRDNORM; /* the socket is writable if there is space to add new data to the * socket; there is no guarantee that any particular call in progress * on the socket may have space in the Tx ACK window */ if (rxrpc_writable(sk)) mask |= EPOLLOUT | EPOLLWRNORM; return mask; } /* * create an RxRPC socket */ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, int kern) { struct rxrpc_net *rxnet; struct rxrpc_sock *rx; struct sock *sk; _enter("%p,%d", sock, protocol); /* we support transport protocol UDP/UDP6 only */ if (protocol != PF_INET && IS_ENABLED(CONFIG_AF_RXRPC_IPV6) && protocol != PF_INET6) return -EPROTONOSUPPORT; if (sock->type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; sock->ops = &rxrpc_rpc_ops; sock->state = SS_UNCONNECTED; sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sock_set_flag(sk, SOCK_RCU_FREE); sk->sk_state = RXRPC_UNBOUND; sk->sk_write_space = rxrpc_write_space; sk->sk_max_ack_backlog = 0; sk->sk_destruct = rxrpc_sock_destructor; rx = rxrpc_sk(sk); rx->family = protocol; rx->calls = RB_ROOT; spin_lock_init(&rx->incoming_lock); skb_queue_head_init(&rx->recvmsg_oobq); rx->pending_oobq = RB_ROOT; INIT_LIST_HEAD(&rx->sock_calls); INIT_LIST_HEAD(&rx->to_be_accepted); INIT_LIST_HEAD(&rx->recvmsg_q); spin_lock_init(&rx->recvmsg_lock); rwlock_init(&rx->call_lock); memset(&rx->srx, 0, sizeof(rx->srx)); rxnet = rxrpc_net(sock_net(&rx->sk)); timer_reduce(&rxnet->peer_keepalive_timer, jiffies + 1); _leave(" = 0 [%p]", rx); return 0; } /* * Kill all the calls on a socket and shut it down. */ static int rxrpc_shutdown(struct socket *sock, int flags) { struct sock *sk = sock->sk; struct rxrpc_sock *rx = rxrpc_sk(sk); int ret = 0; _enter("%p,%d", sk, flags); if (flags != SHUT_RDWR) return -EOPNOTSUPP; if (sk->sk_state == RXRPC_CLOSE) return -ESHUTDOWN; lock_sock(sk); if (sk->sk_state < RXRPC_CLOSE) { spin_lock_irq(&rx->recvmsg_lock); sk->sk_state = RXRPC_CLOSE; sk->sk_shutdown = SHUTDOWN_MASK; spin_unlock_irq(&rx->recvmsg_lock); } else { ret = -ESHUTDOWN; } rxrpc_discard_prealloc(rx); release_sock(sk); return ret; } /* * Purge the out-of-band queue. 
*/ static void rxrpc_purge_oob_queue(struct sock *sk) { struct rxrpc_sock *rx = rxrpc_sk(sk); struct sk_buff *skb; while ((skb = skb_dequeue(&rx->recvmsg_oobq))) rxrpc_kernel_free_oob(skb); while (!RB_EMPTY_ROOT(&rx->pending_oobq)) { skb = rb_entry(rx->pending_oobq.rb_node, struct sk_buff, rbnode); rb_erase(&skb->rbnode, &rx->pending_oobq); rxrpc_kernel_free_oob(skb); } } /* * RxRPC socket destructor */ static void rxrpc_sock_destructor(struct sock *sk) { _enter("%p", sk); rxrpc_purge_oob_queue(sk); rxrpc_purge_queue(&sk->sk_receive_queue); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); WARN_ON(!sk_unhashed(sk)); WARN_ON(sk->sk_socket); if (!sock_flag(sk, SOCK_DEAD)) { printk("Attempt to release alive rxrpc socket: %p\n", sk); return; } } /* * release an RxRPC socket */ static int rxrpc_release_sock(struct sock *sk) { struct rxrpc_sock *rx = rxrpc_sk(sk); _enter("%p{%d,%d}", sk, sk->sk_state, refcount_read(&sk->sk_refcnt)); /* declare the socket closed for business */ sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; /* We want to kill off all connections from a service socket * as fast as possible because we can't share these; client * sockets, on the other hand, can share an endpoint. */ switch (sk->sk_state) { case RXRPC_SERVER_BOUND: case RXRPC_SERVER_BOUND2: case RXRPC_SERVER_LISTENING: case RXRPC_SERVER_LISTEN_DISABLED: rx->local->service_closed = true; break; } spin_lock_irq(&rx->recvmsg_lock); sk->sk_state = RXRPC_CLOSE; spin_unlock_irq(&rx->recvmsg_lock); if (rx->local && rx->local->service == rx) { write_lock(&rx->local->services_lock); rx->local->service = NULL; write_unlock(&rx->local->services_lock); } /* try to flush out this socket */ rxrpc_discard_prealloc(rx); rxrpc_release_calls_on_socket(rx); flush_workqueue(rxrpc_workqueue); rxrpc_purge_oob_queue(sk); rxrpc_purge_queue(&sk->sk_receive_queue); rxrpc_unuse_local(rx->local, rxrpc_local_unuse_release_sock); rxrpc_put_local(rx->local, rxrpc_local_put_release_sock); rx->local = NULL; key_put(rx->key); rx->key = NULL; key_put(rx->securities); rx->securities = NULL; sock_put(sk); _leave(" = 0"); return 0; } /* * release an RxRPC BSD socket on close() or equivalent */ static int rxrpc_release(struct socket *sock) { struct sock *sk = sock->sk; _enter("%p{%p}", sock, sk); if (!sk) return 0; sock->sk = NULL; return rxrpc_release_sock(sk); } /* * RxRPC network protocol */ static const struct proto_ops rxrpc_rpc_ops = { .family = PF_RXRPC, .owner = THIS_MODULE, .release = rxrpc_release, .bind = rxrpc_bind, .connect = rxrpc_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = rxrpc_poll, .ioctl = sock_no_ioctl, .listen = rxrpc_listen, .shutdown = rxrpc_shutdown, .setsockopt = rxrpc_setsockopt, .getsockopt = rxrpc_getsockopt, .sendmsg = rxrpc_sendmsg, .recvmsg = rxrpc_recvmsg, .mmap = sock_no_mmap, }; static struct proto rxrpc_proto = { .name = "RXRPC", .owner = THIS_MODULE, .obj_size = sizeof(struct rxrpc_sock), .max_header = sizeof(struct rxrpc_wire_header), }; static const struct net_proto_family rxrpc_family_ops = { .family = PF_RXRPC, .create = rxrpc_create, .owner = THIS_MODULE, }; /* * initialise and register the RxRPC protocol */ static int __init af_rxrpc_init(void) { int ret = -1; BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > sizeof_field(struct sk_buff, cb)); ret = -ENOMEM; rxrpc_gen_version_string(); rxrpc_call_jar = kmem_cache_create( "rxrpc_call_jar", sizeof(struct rxrpc_call), 0, SLAB_HWCACHE_ALIGN, NULL); if (!rxrpc_call_jar) { pr_notice("Failed to allocate call jar\n"); 
goto error_call_jar; } rxrpc_workqueue = alloc_ordered_workqueue("krxrpcd", WQ_HIGHPRI | WQ_MEM_RECLAIM); if (!rxrpc_workqueue) { pr_notice("Failed to allocate work queue\n"); goto error_work_queue; } ret = rxrpc_init_security(); if (ret < 0) { pr_crit("Cannot initialise security\n"); goto error_security; } ret = register_pernet_device(&rxrpc_net_ops); if (ret) goto error_pernet; ret = proto_register(&rxrpc_proto, 1); if (ret < 0) { pr_crit("Cannot register protocol\n"); goto error_proto; } ret = sock_register(&rxrpc_family_ops); if (ret < 0) { pr_crit("Cannot register socket family\n"); goto error_sock; } ret = register_key_type(&key_type_rxrpc); if (ret < 0) { pr_crit("Cannot register client key type\n"); goto error_key_type; } ret = register_key_type(&key_type_rxrpc_s); if (ret < 0) { pr_crit("Cannot register server key type\n"); goto error_key_type_s; } ret = rxrpc_sysctl_init(); if (ret < 0) { pr_crit("Cannot register sysctls\n"); goto error_sysctls; } return 0; error_sysctls: unregister_key_type(&key_type_rxrpc_s); error_key_type_s: unregister_key_type(&key_type_rxrpc); error_key_type: sock_unregister(PF_RXRPC); error_sock: proto_unregister(&rxrpc_proto); error_proto: unregister_pernet_device(&rxrpc_net_ops); error_pernet: rxrpc_exit_security(); error_security: destroy_workqueue(rxrpc_workqueue); error_work_queue: kmem_cache_destroy(rxrpc_call_jar); error_call_jar: return ret; } /* * unregister the RxRPC protocol */ static void __exit af_rxrpc_exit(void) { _enter(""); rxrpc_sysctl_exit(); unregister_key_type(&key_type_rxrpc_s); unregister_key_type(&key_type_rxrpc); sock_unregister(PF_RXRPC); proto_unregister(&rxrpc_proto); unregister_pernet_device(&rxrpc_net_ops); ASSERTCMP(atomic_read(&rxrpc_n_rx_skbs), ==, 0); /* Make sure the local and peer records pinned by any dying connections * are released. */ rcu_barrier(); destroy_workqueue(rxrpc_workqueue); rxrpc_exit_security(); kmem_cache_destroy(rxrpc_call_jar); _leave(""); } module_init(af_rxrpc_init); module_exit(af_rxrpc_exit);
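/*
 * Illustrative userspace sketch (not part of the kernel file above): how a
 * client might exercise the SOL_RXRPC options handled by rxrpc_setsockopt()
 * while the socket is still RXRPC_UNBOUND.  It assumes <linux/rxrpc.h>
 * provides SOL_RXRPC, RXRPC_EXCLUSIVE_CONNECTION, RXRPC_MIN_SECURITY_LEVEL
 * and RXRPC_SECURITY_ENCRYPT, and that the C library defines AF_RXRPC;
 * error handling is abbreviated.
 */
#include <sys/socket.h>
#include <linux/rxrpc.h>
#include <unistd.h>

static int rxrpc_client_socket(void)
{
	unsigned int min_level = RXRPC_SECURITY_ENCRYPT;
	int fd;

	fd = socket(AF_RXRPC, SOCK_DGRAM, PF_INET);
	if (fd < 0)
		return -1;

	/* Both options are only accepted before the socket is bound. */
	if (setsockopt(fd, SOL_RXRPC, RXRPC_EXCLUSIVE_CONNECTION, NULL, 0) < 0 ||
	    setsockopt(fd, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL,
		       &min_level, sizeof(min_level)) < 0) {
		close(fd);
		return -1;
	}

	return fd;
}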
897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 /* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion * * Copyright IBM Corporation, 2001 * * Author: Dipankar Sarma <dipankar@in.ibm.com> * * Based on the original work by Paul McKenney <paulmck@vnet.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) * * For detailed explanation of Read-Copy Update mechanism see - * http://lse.sourceforge.net/locking/rcupdate.html * */ #ifndef __LINUX_RCUPDATE_H #define __LINUX_RCUPDATE_H #include <linux/types.h> #include <linux/compiler.h> #include <linux/atomic.h> #include <linux/irqflags.h> #include <linux/sched.h> #include <linux/bottom_half.h> #include <linux/lockdep.h> #include <linux/cleanup.h> #include <asm/processor.h> #include <linux/context_tracking_irq.h> #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) #define RCU_SEQ_CTR_SHIFT 2 #define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); void synchronize_rcu(void); struct rcu_gp_oldstate; unsigned long get_completed_synchronize_rcu(void); void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); // Maximum number of unsigned long values corresponding to // not-yet-completed RCU grace periods. #define NUM_ACTIVE_RCU_POLL_OLDSTATE 2 /** * same_state_synchronize_rcu - Are two old-state values identical? * @oldstate1: First old-state value. * @oldstate2: Second old-state value. * * The two old-state values must have been obtained from either * get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or * get_completed_synchronize_rcu(). Returns @true if the two values are * identical and @false otherwise. 
This allows structures whose lifetimes * are tracked by old-state values to push these values to a list header, * allowing those structures to be slightly smaller. */ static inline bool same_state_synchronize_rcu(unsigned long oldstate1, unsigned long oldstate2) { return oldstate1 == oldstate2; } #ifdef CONFIG_PREEMPT_RCU void __rcu_read_lock(void); void __rcu_read_unlock(void); /* * Defined as a macro as it is a very low level header included from * areas that don't even know about current. This gives the rcu_read_lock() * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. */ #define rcu_preempt_depth() READ_ONCE(current->rcu_read_lock_nesting) #else /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TINY_RCU #define rcu_read_unlock_strict() do { } while (0) #else void rcu_read_unlock_strict(void); #endif static inline void __rcu_read_lock(void) { preempt_disable(); } static inline void __rcu_read_unlock(void) { if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) rcu_read_unlock_strict(); preempt_enable(); } static inline int rcu_preempt_depth(void) { return 0; } #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_LAZY void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func); #else static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) { call_rcu(head, func); } #endif /* Internal to kernel */ void rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); #else /* #ifdef CONFIG_RCU_STALL_COMMON */ static inline void rcu_sysrq_start(void) { } static inline void rcu_sysrq_end(void) { } #endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */ #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK)) void rcu_irq_work_resched(void); #else static __always_inline void rcu_irq_work_resched(void) { } #endif #ifdef CONFIG_RCU_NOCB_CPU void rcu_init_nohz(void); int rcu_nocb_cpu_offload(int cpu); int rcu_nocb_cpu_deoffload(int cpu); void rcu_nocb_flush_deferred_wakeup(void); #define RCU_NOCB_LOCKDEP_WARN(c, s) RCU_LOCKDEP_WARN(c, s) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static inline void rcu_init_nohz(void) { } static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; } static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; } static inline void rcu_nocb_flush_deferred_wakeup(void) { } #define RCU_NOCB_LOCKDEP_WARN(c, s) #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /* * Note a quasi-voluntary context switch for RCU-tasks's benefit. * This is a macro rather than an inline function to avoid #include hell. */ #ifdef CONFIG_TASKS_RCU_GENERIC # ifdef CONFIG_TASKS_RCU # define rcu_tasks_classic_qs(t, preempt) \ do { \ if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout)) \ WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); void rcu_tasks_torture_stats_print(char *tt, char *tf); # else # define rcu_tasks_classic_qs(t, preempt) do { } while (0) # define call_rcu_tasks call_rcu # define synchronize_rcu_tasks synchronize_rcu # endif # ifdef CONFIG_TASKS_TRACE_RCU // Bits for ->trc_reader_special.b.need_qs field. #define TRC_NEED_QS 0x1 // Task needs a quiescent state. #define TRC_NEED_QS_CHECKED 0x2 // Task has been checked for needing quiescent state. 
u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new); void rcu_tasks_trace_qs_blkd(struct task_struct *t); # define rcu_tasks_trace_qs(t) \ do { \ int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting); \ \ if (unlikely(READ_ONCE((t)->trc_reader_special.b.need_qs) == TRC_NEED_QS) && \ likely(!___rttq_nesting)) { \ rcu_trc_cmpxchg_need_qs((t), TRC_NEED_QS, TRC_NEED_QS_CHECKED); \ } else if (___rttq_nesting && ___rttq_nesting != INT_MIN && \ !READ_ONCE((t)->trc_reader_special.b.blocked)) { \ rcu_tasks_trace_qs_blkd(t); \ } \ } while (0) void rcu_tasks_trace_torture_stats_print(char *tt, char *tf); # else # define rcu_tasks_trace_qs(t) do { } while (0) # endif #define rcu_tasks_qs(t, preempt) \ do { \ rcu_tasks_classic_qs((t), (preempt)); \ rcu_tasks_trace_qs(t); \ } while (0) # ifdef CONFIG_TASKS_RUDE_RCU void synchronize_rcu_tasks_rude(void); void rcu_tasks_rude_torture_stats_print(char *tt, char *tf); # endif #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ #define rcu_tasks_classic_qs(t, preempt) do { } while (0) #define rcu_tasks_qs(t, preempt) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #define call_rcu_tasks call_rcu #define synchronize_rcu_tasks synchronize_rcu static inline void exit_tasks_rcu_start(void) { } static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ /** * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period? * * As an accident of implementation, an RCU Tasks Trace grace period also * acts as an RCU grace period. However, this could change at any time. * Code relying on this accident must call this function to verify that * this accident is still happening. * * You have been warned! */ static inline bool rcu_trace_implies_rcu_gp(void) { return true; } /** * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU * * This macro resembles cond_resched(), except that it is defined to * report potential quiescent states to RCU-tasks even if the cond_resched() * machinery were to be shut off, as some advocate for PREEMPTION kernels. */ #define cond_resched_tasks_rcu_qs() \ do { \ rcu_tasks_qs(current, false); \ cond_resched(); \ } while (0) /** * rcu_softirq_qs_periodic - Report RCU and RCU-Tasks quiescent states * @old_ts: jiffies at start of processing. * * This helper is for long-running softirq handlers, such as NAPI threads in * networking. The caller should initialize the variable passed in as @old_ts * at the beginning of the softirq handler. When invoked frequently, this macro * will invoke rcu_softirq_qs() every 100 milliseconds thereafter, which will * provide both RCU and RCU-Tasks quiescent states. Note that this macro * modifies its old_ts argument. * * Because regions of code that have disabled softirq act as RCU read-side * critical sections, this macro should be invoked with softirq (and * preemption) enabled. * * The macro is not needed when CONFIG_PREEMPT_RT is defined. RT kernels would * have more chance to invoke schedule() calls and provide necessary quiescent * states. As a contrast, calling cond_resched() only won't achieve the same * effect because cond_resched() does not provide RCU-Tasks quiescent states. 
*/ #define rcu_softirq_qs_periodic(old_ts) \ do { \ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && \ time_after(jiffies, (old_ts) + HZ / 10)) { \ preempt_disable(); \ rcu_softirq_qs(); \ preempt_enable(); \ (old_ts) = jiffies; \ } \ } while (0) /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. */ #if defined(CONFIG_TREE_RCU) #include <linux/rcutree.h> #elif defined(CONFIG_TINY_RCU) #include <linux/rcutiny.h> #else #error "Unknown RCU implementation specified to kernel configuration" #endif /* * The init_rcu_head_on_stack() and destroy_rcu_head_on_stack() calls * are needed for dynamic initialization and destruction of rcu_head * on the stack, and init_rcu_head()/destroy_rcu_head() are needed for * dynamic initialization and destruction of statically allocated rcu_head * structures. However, rcu_head structures allocated dynamically in the * heap don't need any initialization. */ #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head); void destroy_rcu_head(struct rcu_head *head); void init_rcu_head_on_stack(struct rcu_head *head); void destroy_rcu_head_on_stack(struct rcu_head *head); #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ static inline void init_rcu_head(struct rcu_head *head) { } static inline void destroy_rcu_head(struct rcu_head *head) { } static inline void init_rcu_head_on_stack(struct rcu_head *head) { } static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) bool rcu_lockdep_current_cpu_online(void); #else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ static inline bool rcu_lockdep_current_cpu_online(void) { return true; } #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ extern struct lockdep_map rcu_lock_map; extern struct lockdep_map rcu_bh_lock_map; extern struct lockdep_map rcu_sched_lock_map; extern struct lockdep_map rcu_callback_map; #ifdef CONFIG_DEBUG_LOCK_ALLOC static inline void rcu_lock_acquire(struct lockdep_map *map) { lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_); } static inline void rcu_try_lock_acquire(struct lockdep_map *map) { lock_acquire(map, 0, 1, 2, 0, NULL, _THIS_IP_); } static inline void rcu_lock_release(struct lockdep_map *map) { lock_release(map, _THIS_IP_); } int debug_lockdep_rcu_enabled(void); int rcu_read_lock_held(void); int rcu_read_lock_bh_held(void); int rcu_read_lock_sched_held(void); int rcu_read_lock_any_held(void); #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ # define rcu_lock_acquire(a) do { } while (0) # define rcu_try_lock_acquire(a) do { } while (0) # define rcu_lock_release(a) do { } while (0) static inline int rcu_read_lock_held(void) { return 1; } static inline int rcu_read_lock_bh_held(void) { return 1; } static inline int rcu_read_lock_sched_held(void) { return !preemptible(); } static inline int rcu_read_lock_any_held(void) { return !preemptible(); } static inline int debug_lockdep_rcu_enabled(void) { return 0; } #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_PROVE_RCU /** * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met * @c: condition to check * @s: informative message * * This checks debug_lockdep_rcu_enabled() before checking (c) to * prevent early boot splats due to lockdep not yet being initialized, * and rechecks it after checking (c) to prevent false-positive splats * due to races with lockdep being disabled. 
See commit 3066820034b5dd * ("rcu: Reject RCU_LOCKDEP_WARN() false positives") for more detail. */ #define RCU_LOCKDEP_WARN(c, s) \ do { \ static bool __section(".data..unlikely") __warned; \ if (debug_lockdep_rcu_enabled() && (c) && \ debug_lockdep_rcu_enabled() && !__warned) { \ __warned = true; \ lockdep_rcu_suspicious(__FILE__, __LINE__, s); \ } \ } while (0) #ifndef CONFIG_PREEMPT_RCU static inline void rcu_preempt_sleep_check(void) { RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), "Illegal context switch in RCU read-side critical section"); } #else // #ifndef CONFIG_PREEMPT_RCU static inline void rcu_preempt_sleep_check(void) { } #endif // #else // #ifndef CONFIG_PREEMPT_RCU #define rcu_sleep_check() \ do { \ rcu_preempt_sleep_check(); \ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \ "Illegal context switch in RCU-bh read-side critical section"); \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), \ "Illegal context switch in RCU-sched read-side critical section"); \ } while (0) // See RCU_LOCKDEP_WARN() for an explanation of the double call to // debug_lockdep_rcu_enabled(). static inline bool lockdep_assert_rcu_helper(bool c) { return debug_lockdep_rcu_enabled() && (c || !rcu_is_watching() || !rcu_lockdep_current_cpu_online()) && debug_lockdep_rcu_enabled(); } /** * lockdep_assert_in_rcu_read_lock - WARN if not protected by rcu_read_lock() * * Splats if lockdep is enabled and there is no rcu_read_lock() in effect. */ #define lockdep_assert_in_rcu_read_lock() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map))) /** * lockdep_assert_in_rcu_read_lock_bh - WARN if not protected by rcu_read_lock_bh() * * Splats if lockdep is enabled and there is no rcu_read_lock_bh() in effect. * Note that local_bh_disable() and friends do not suffice here, instead an * actual rcu_read_lock_bh() is required. */ #define lockdep_assert_in_rcu_read_lock_bh() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_bh_lock_map))) /** * lockdep_assert_in_rcu_read_lock_sched - WARN if not protected by rcu_read_lock_sched() * * Splats if lockdep is enabled and there is no rcu_read_lock_sched() * in effect. Note that preempt_disable() and friends do not suffice here, * instead an actual rcu_read_lock_sched() is required. */ #define lockdep_assert_in_rcu_read_lock_sched() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_sched_lock_map))) /** * lockdep_assert_in_rcu_reader - WARN if not within some type of RCU reader * * Splats if lockdep is enabled and there is no RCU reader of any * type in effect. Note that regions of code protected by things like * preempt_disable, local_bh_disable(), and local_irq_disable() all qualify * as RCU readers. * * Note that this will never trigger in PREEMPT_NONE or PREEMPT_VOLUNTARY * kernels that are not also built with PREEMPT_COUNT. But if you have * lockdep enabled, you might as well also enable PREEMPT_COUNT. 
*/ #define lockdep_assert_in_rcu_reader() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map) && \ !lock_is_held(&rcu_bh_lock_map) && \ !lock_is_held(&rcu_sched_lock_map) && \ preemptible())) #else /* #ifdef CONFIG_PROVE_RCU */ #define RCU_LOCKDEP_WARN(c, s) do { } while (0 && (c)) #define rcu_sleep_check() do { } while (0) #define lockdep_assert_in_rcu_read_lock() do { } while (0) #define lockdep_assert_in_rcu_read_lock_bh() do { } while (0) #define lockdep_assert_in_rcu_read_lock_sched() do { } while (0) #define lockdep_assert_in_rcu_reader() do { } while (0) #endif /* #else #ifdef CONFIG_PROVE_RCU */ /* * Helper functions for rcu_dereference_check(), rcu_dereference_protected() * and rcu_assign_pointer(). Some of these could be folded into their * callers, but they are left separate in order to ease introduction of * multiple pointers markings to match different RCU implementations * (e.g., __srcu), should this make sense in the future. */ #ifdef __CHECKER__ #define rcu_check_sparse(p, space) \ ((void)(((typeof(*p) space *)p) == p)) #else /* #ifdef __CHECKER__ */ #define rcu_check_sparse(p, space) #endif /* #else #ifdef __CHECKER__ */ #define __unrcu_pointer(p, local) \ ({ \ typeof(*p) *local = (typeof(*p) *__force)(p); \ rcu_check_sparse(p, __rcu); \ ((typeof(*p) __force __kernel *)(local)); \ }) /** * unrcu_pointer - mark a pointer as not being RCU protected * @p: pointer needing to lose its __rcu property * * Converts @p from an __rcu pointer to a __kernel pointer. * This allows an __rcu pointer to be used with xchg() and friends. */ #define unrcu_pointer(p) __unrcu_pointer(p, __UNIQUE_ID(rcu)) #define __rcu_access_pointer(p, local, space) \ ({ \ typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(local)); \ }) #define __rcu_dereference_check(p, local, c, space) \ ({ \ /* Dependency order vs. p above. */ \ typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(local)); \ }) #define __rcu_dereference_protected(p, local, c, space) \ ({ \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(p)); \ }) #define __rcu_dereference_raw(p, local) \ ({ \ /* Dependency order vs. p above. */ \ typeof(p) local = READ_ONCE(p); \ ((typeof(*p) __force __kernel *)(local)); \ }) #define rcu_dereference_raw(p) __rcu_dereference_raw(p, __UNIQUE_ID(rcu)) /** * RCU_INITIALIZER() - statically initialize an RCU-protected global variable * @v: The value to statically initialize with. */ #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v) /** * rcu_assign_pointer() - assign to RCU-protected pointer * @p: pointer to assign to * @v: value to assign (publish) * * Assigns the specified value to the specified RCU-protected * pointer, ensuring that any concurrent RCU readers will see * any prior initialization. * * Inserts memory barriers on architectures that require them * (which is most of them), and also prevents the compiler from * reordering the code that initializes the structure after the pointer * assignment. More importantly, this call documents which pointers * will be dereferenced by RCU read-side code. * * In some special cases, you may use RCU_INIT_POINTER() instead * of rcu_assign_pointer(). 
RCU_INIT_POINTER() is a bit faster due * to the fact that it does not constrain either the CPU or the compiler. * That said, using RCU_INIT_POINTER() when you should have used * rcu_assign_pointer() is a very bad thing that results in * impossible-to-diagnose memory corruption. So please be careful. * See the RCU_INIT_POINTER() comment header for details. * * Note that rcu_assign_pointer() evaluates each of its arguments only * once, appearances notwithstanding. One of the "extra" evaluations * is in typeof() and the other visible only to sparse (__CHECKER__), * neither of which actually execute the argument. As with most cpp * macros, this execute-arguments-only-once property is important, so * please be careful when making changes to rcu_assign_pointer() and the * other macros that it invokes. */ #define rcu_assign_pointer(p, v) \ do { \ uintptr_t _r_a_p__v = (uintptr_t)(v); \ rcu_check_sparse(p, __rcu); \ \ if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \ WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \ else \ smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \ } while (0) /** * rcu_replace_pointer() - replace an RCU pointer, returning its old value * @rcu_ptr: RCU pointer, whose old value is returned * @ptr: regular pointer * @c: the lockdep conditions under which the dereference will take place * * Perform a replacement, where @rcu_ptr is an RCU-annotated * pointer and @c is the lockdep argument that is passed to the * rcu_dereference_protected() call used to read that pointer. The old * value of @rcu_ptr is returned, and @rcu_ptr is set to @ptr. */ #define rcu_replace_pointer(rcu_ptr, ptr, c) \ ({ \ typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c)); \ rcu_assign_pointer((rcu_ptr), (ptr)); \ __tmp; \ }) /** * rcu_access_pointer() - fetch RCU pointer with no dereferencing * @p: The pointer to read * * Return the value of the specified RCU-protected pointer, but omit the * lockdep checks for being in an RCU read-side critical section. This is * useful when the value of this pointer is accessed, but the pointer is * not dereferenced, for example, when testing an RCU-protected pointer * against NULL. Although rcu_access_pointer() may also be used in cases * where update-side locks prevent the value of the pointer from changing, * you should instead use rcu_dereference_protected() for this use case. * Within an RCU read-side critical section, there is little reason to * use rcu_access_pointer(). * * It is usually best to test the rcu_access_pointer() return value * directly in order to avoid accidental dereferences being introduced * by later inattentive changes. In other words, assigning the * rcu_access_pointer() return value to a local variable results in an * accident waiting to happen. * * It is also permissible to use rcu_access_pointer() when read-side * access to the pointer was removed at least one grace period ago, as is * the case in the context of the RCU callback that is freeing up the data, * or after a synchronize_rcu() returns. This can be useful when tearing * down multi-linked structures after a grace period has elapsed. However, * rcu_dereference_protected() is normally preferred for this use case. 
*/ #define rcu_access_pointer(p) __rcu_access_pointer((p), __UNIQUE_ID(rcu), __rcu) /** * rcu_dereference_check() - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Do an rcu_dereference(), but check that the conditions under which the * dereference will take place are correct. Typically the conditions * indicate the various locking conditions that should be held at that * point. The check should return true if the conditions are satisfied. * An implicit check for being in an RCU read-side critical section * (rcu_read_lock()) is included. * * For example: * * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock)); * * could be used to indicate to lockdep that foo->bar may only be dereferenced * if either rcu_read_lock() is held, or that the lock required to replace * the bar struct at foo->bar is held. * * Note that the list of conditions may also include indications of when a lock * need not be held, for example during initialisation or destruction of the * target struct: * * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) || * atomic_read(&foo->usage) == 0); * * Inserts memory barriers on architectures that require them * (currently only the Alpha), prevents the compiler from refetching * (and from merging fetches), and, more importantly, documents exactly * which pointers are protected by RCU and checks that the pointer is * annotated as __rcu. */ #define rcu_dereference_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_held(), __rcu) /** * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is the RCU-bh counterpart to rcu_dereference_check(). However, * please note that starting in v5.0 kernels, vanilla RCU grace periods * wait for local_bh_disable() regions of code in addition to regions of * code demarked by rcu_read_lock() and rcu_read_unlock(). This means * that synchronize_rcu(), call_rcu, and friends all take not only * rcu_read_lock() but also rcu_read_lock_bh() into account. */ #define rcu_dereference_bh_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_bh_held(), __rcu) /** * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is the RCU-sched counterpart to rcu_dereference_check(). * However, please note that starting in v5.0 kernels, vanilla RCU grace * periods wait for preempt_disable() regions of code in addition to * regions of code demarked by rcu_read_lock() and rcu_read_unlock(). * This means that synchronize_rcu(), call_rcu, and friends all take not * only rcu_read_lock() but also rcu_read_lock_sched() into account. */ #define rcu_dereference_sched_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_sched_held(), \ __rcu) /** * rcu_dereference_all_check() - rcu_dereference_all with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is similar to rcu_dereference_check(), but allows protection * by all forms of vanilla RCU readers, including preemption disabled, * bh-disabled, and interrupt-disabled regions of code. 
Note that "vanilla * RCU" excludes SRCU and the various Tasks RCU flavors. Please note * that this macro should not be backported to any Linux-kernel version * preceding v5.0 due to changes in synchronize_rcu() semantics prior * to that version. */ #define rcu_dereference_all_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_any_held(), \ __rcu) /* * The tracing infrastructure traces RCU (we want that), but unfortunately * some of the RCU checks causes tracing to lock up the system. * * The no-tracing version of rcu_dereference_raw() must not call * rcu_read_lock_held(). */ #define rcu_dereference_raw_check(p) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), 1, __rcu) /** * rcu_dereference_protected() - fetch RCU pointer when updates prevented * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Return the value of the specified RCU-protected pointer, but omit * the READ_ONCE(). This is useful in cases where update-side locks * prevent the value of the pointer from changing. Please note that this * primitive does *not* prevent the compiler from repeating this reference * or combining it with other references, so it should not be used without * protection of appropriate locks. * * This function is only for update-side use. Using this function * when protected only by rcu_read_lock() will result in infrequent * but very ugly failures. */ #define rcu_dereference_protected(p, c) \ __rcu_dereference_protected((p), __UNIQUE_ID(rcu), (c), __rcu) /** * rcu_dereference() - fetch RCU-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * This is a simple wrapper around rcu_dereference_check(). */ #define rcu_dereference(p) rcu_dereference_check(p, 0) /** * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0) /** * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0) /** * rcu_dereference_all() - fetch RCU-all-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_all(p) rcu_dereference_all_check(p, 0) /** * rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism * @p: The pointer to hand off * * This is simply an identity function, but it documents where a pointer * is handed off from RCU to some other synchronization mechanism, for * example, reference counting or locking. In C11, it would map to * kill_dependency(). It could be used as follows:: * * rcu_read_lock(); * p = rcu_dereference(gp); * long_lived = is_long_lived(p); * if (long_lived) { * if (!atomic_inc_not_zero(p->refcnt)) * long_lived = false; * else * p = rcu_pointer_handoff(p); * } * rcu_read_unlock(); */ #define rcu_pointer_handoff(p) (p) /** * rcu_read_lock() - mark the beginning of an RCU read-side critical section * * When synchronize_rcu() is invoked on one CPU while other CPUs * are within RCU read-side critical sections, then the * synchronize_rcu() is guaranteed to block until after all the other * CPUs exit their critical sections. 
Similarly, if call_rcu() is invoked * on one CPU while other CPUs are within RCU read-side critical * sections, invocation of the corresponding RCU callback is deferred * until after the all the other CPUs exit their critical sections. * * Both synchronize_rcu() and call_rcu() also wait for regions of code * with preemption disabled, including regions of code with interrupts or * softirqs disabled. * * Note, however, that RCU callbacks are permitted to run concurrently * with new RCU read-side critical sections. One way that this can happen * is via the following sequence of events: (1) CPU 0 enters an RCU * read-side critical section, (2) CPU 1 invokes call_rcu() to register * an RCU callback, (3) CPU 0 exits the RCU read-side critical section, * (4) CPU 2 enters a RCU read-side critical section, (5) the RCU * callback is invoked. This is legal, because the RCU read-side critical * section that was running concurrently with the call_rcu() (and which * therefore might be referencing something that the corresponding RCU * callback would free up) has completed before the corresponding * RCU callback is invoked. * * RCU read-side critical sections may be nested. Any deferred actions * will be deferred until the outermost RCU read-side critical section * completes. * * You can avoid reading and understanding the next paragraph by * following this rule: don't put anything in an rcu_read_lock() RCU * read-side critical section that would block in a !PREEMPTION kernel. * But if you want the full story, read on! * * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU), * it is illegal to block while in an RCU read-side critical section. * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION * kernel builds, RCU read-side critical sections may be preempted, * but explicit blocking is illegal. Finally, in preemptible RCU * implementations in real-time (with -rt patchset) kernel builds, RCU * read-side critical sections may be preempted and they may also block, but * only when acquiring spinlocks that are subject to priority inheritance. */ static __always_inline void rcu_read_lock(void) { __rcu_read_lock(); __acquire(RCU); rcu_lock_acquire(&rcu_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock() used illegally while idle"); } /* * So where is rcu_write_lock()? It does not exist, as there is no * way for writers to lock out RCU readers. This is a feature, not * a bug -- this property is what provides RCU's performance benefits. * Of course, writers must coordinate with each other. The normal * spinlock primitives work well for this, but any other technique may be * used as well. RCU does not care how the writers keep out of each * others' way, as long as they do so. */ /** * rcu_read_unlock() - marks the end of an RCU read-side critical section. * * In almost all situations, rcu_read_unlock() is immune from deadlock. * This deadlock immunity also extends to the scheduler's runqueue * and priority-inheritance spinlocks, courtesy of the quiescent-state * deferral that is carried out when rcu_read_unlock() is invoked with * interrupts disabled. * * See rcu_read_lock() for more information. */ static inline void rcu_read_unlock(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock() used illegally while idle"); rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. 
*/ __release(RCU); __rcu_read_unlock(); } /** * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section * * This is equivalent to rcu_read_lock(), but also disables softirqs. * Note that anything else that disables softirqs can also serve as an RCU * read-side critical section. However, please note that this equivalence * applies only to v5.0 and later. Before v5.0, rcu_read_lock() and * rcu_read_lock_bh() were unrelated. * * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh() * must occur in the same context, for example, it is illegal to invoke * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh() * was invoked from some other task. */ static inline void rcu_read_lock_bh(void) { local_bh_disable(); __acquire(RCU_BH); rcu_lock_acquire(&rcu_bh_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock_bh() used illegally while idle"); } /** * rcu_read_unlock_bh() - marks the end of a softirq-only RCU critical section * * See rcu_read_lock_bh() for more information. */ static inline void rcu_read_unlock_bh(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock_bh() used illegally while idle"); rcu_lock_release(&rcu_bh_lock_map); __release(RCU_BH); local_bh_enable(); } /** * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section * * This is equivalent to rcu_read_lock(), but also disables preemption. * Read-side critical sections can also be introduced by anything else that * disables preemption, including local_irq_disable() and friends. However, * please note that the equivalence to rcu_read_lock() applies only to * v5.0 and later. Before v5.0, rcu_read_lock() and rcu_read_lock_sched() * were unrelated. * * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched() * must occur in the same context, for example, it is illegal to invoke * rcu_read_unlock_sched() from process context if the matching * rcu_read_lock_sched() was invoked from an NMI handler. */ static inline void rcu_read_lock_sched(void) { preempt_disable(); __acquire(RCU_SCHED); rcu_lock_acquire(&rcu_sched_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock_sched() used illegally while idle"); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ static inline notrace void rcu_read_lock_sched_notrace(void) { preempt_disable_notrace(); __acquire(RCU_SCHED); } /** * rcu_read_unlock_sched() - marks the end of a RCU-classic critical section * * See rcu_read_lock_sched() for more information. */ static inline void rcu_read_unlock_sched(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock_sched() used illegally while idle"); rcu_lock_release(&rcu_sched_lock_map); __release(RCU_SCHED); preempt_enable(); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ static inline notrace void rcu_read_unlock_sched_notrace(void) { __release(RCU_SCHED); preempt_enable_notrace(); } static __always_inline void rcu_read_lock_dont_migrate(void) { if (IS_ENABLED(CONFIG_PREEMPT_RCU)) migrate_disable(); rcu_read_lock(); } static inline void rcu_read_unlock_migrate(void) { rcu_read_unlock(); if (IS_ENABLED(CONFIG_PREEMPT_RCU)) migrate_enable(); } /** * RCU_INIT_POINTER() - initialize an RCU protected pointer * @p: The pointer to be initialized. * @v: The value to initialized the pointer to. * * Initialize an RCU-protected pointer in special cases where readers * do not need ordering constraints on the CPU or the compiler. These * special cases are: * * 1. 
This use of RCU_INIT_POINTER() is NULLing out the pointer *or* * 2. The caller has taken whatever steps are required to prevent * RCU readers from concurrently accessing this pointer *or* * 3. The referenced data structure has already been exposed to * readers either at compile time or via rcu_assign_pointer() *and* * * a. You have not made *any* reader-visible changes to * this structure since then *or* * b. It is OK for readers accessing this structure from its * new location to see the old state of the structure. (For * example, the changes were to statistical counters or to * other state where exact synchronization is not required.) * * Failure to follow these rules governing use of RCU_INIT_POINTER() will * result in impossible-to-diagnose memory corruption. As in the structures * will look OK in crash dumps, but any concurrent RCU readers might * see pre-initialized values of the referenced data structure. So * please be very careful how you use RCU_INIT_POINTER()!!! * * If you are creating an RCU-protected linked structure that is accessed * by a single external-to-structure RCU-protected pointer, then you may * use RCU_INIT_POINTER() to initialize the internal RCU-protected * pointers, but you must use rcu_assign_pointer() to initialize the * external-to-structure pointer *after* you have completely initialized * the reader-accessible portions of the linked structure. * * Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no * ordering guarantees for either the CPU or the compiler. */ #define RCU_INIT_POINTER(p, v) \ do { \ rcu_check_sparse(p, __rcu); \ WRITE_ONCE(p, RCU_INITIALIZER(v)); \ } while (0) /** * RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer * @p: The pointer to be initialized. * @v: The value to initialized the pointer to. * * GCC-style initialization for an RCU-protected pointer in a structure field. */ #define RCU_POINTER_INITIALIZER(p, v) \ .p = RCU_INITIALIZER(v) /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree for double-argument invocations. * @rhf: the name of the struct rcu_head within the type of @ptr. * * Many rcu callbacks functions just call kfree() on the base structure. * These functions are trivial, but their size adds up, and furthermore * when they are used in a kernel module, that module must invoke the * high-latency rcu_barrier() function at module-unload time. * * The kfree_rcu() function handles this issue. In order to have a universal * callback function handling different offsets of rcu_head, the callback needs * to determine the starting address of the freed object, which can be a large * kmalloc or vmalloc allocation. To allow simply aligning the pointer down to * page boundary for those, only offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to * position the rcu_head structure into the first 4096 bytes. * * The object to be freed can be allocated either by kmalloc() or * kmem_cache_alloc(). * * Note that the allowable offset might decrease in the future. * * The BUILD_BUG_ON check must not involve any function calls, hence the * checks are done in macros here. */ #define kfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf) #define kvfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf) /** * kfree_rcu_mightsleep() - kfree an object after a grace period. 
* @ptr: pointer to kfree for single-argument invocations. * * When it comes to head-less variant, only one argument * is passed and that is just a pointer which has to be * freed after a grace period. Therefore the semantic is * * kfree_rcu_mightsleep(ptr); * * where @ptr is the pointer to be freed by kvfree(). * * Please note, head-less way of freeing is permitted to * use from a context that has to follow might_sleep() * annotation. Otherwise, please switch and embed the * rcu_head structure within the type of @ptr. */ #define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) #define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) /* * In mm/slab_common.c, no suitable header to include here. */ void kvfree_call_rcu(struct rcu_head *head, void *ptr); /* * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the * comment of kfree_rcu() for details. */ #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ \ if (___p) { \ BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096); \ kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ } \ } while (0) #define kvfree_rcu_arg_1(ptr) \ do { \ typeof(ptr) ___p = (ptr); \ \ if (___p) \ kvfree_call_rcu(NULL, (void *) (___p)); \ } while (0) /* * Place this after a lock-acquisition primitive to guarantee that * an UNLOCK+LOCK pair acts as a full barrier. This guarantee applies * if the UNLOCK and LOCK are executed by the same CPU or if the * UNLOCK and LOCK operate on the same lock variable. */ #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ #else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #define smp_mb__after_unlock_lock() do { } while (0) #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ /* Has the specified rcu_head structure been handed to call_rcu()? */ /** * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu() * @rhp: The rcu_head structure to initialize. * * If you intend to invoke rcu_head_after_call_rcu() to test whether a * given rcu_head structure has already been passed to call_rcu(), then * you must also invoke this rcu_head_init() function on it just after * allocating that structure. Calls to this function must not race with * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation. */ static inline void rcu_head_init(struct rcu_head *rhp) { rhp->func = (rcu_callback_t)~0L; } /** * rcu_head_after_call_rcu() - Has this rcu_head been passed to call_rcu()? * @rhp: The rcu_head structure to test. * @f: The function passed to call_rcu() along with @rhp. * * Returns @true if the @rhp has been passed to call_rcu() with @func, * and @false otherwise. Emits a warning in any other case, including * the case where @rhp has already been invoked after a grace period. * Calls to this function must not race with callback invocation. One way * to avoid such races is to enclose the call to rcu_head_after_call_rcu() * in an RCU read-side critical section that includes a read-side fetch * of the pointer to the structure containing @rhp. */ static inline bool rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f) { rcu_callback_t func = READ_ONCE(rhp->func); if (func == f) return true; WARN_ON_ONCE(func != (rcu_callback_t)~0L); return false; } /* kernel/ksysfs.c definitions */ extern int rcu_expedited; extern int rcu_normal; DEFINE_LOCK_GUARD_0(rcu, do { rcu_read_lock(); /* * sparse doesn't call the cleanup function, * so just release immediately and don't track * the context. 
We don't need to anyway, since * the whole point of the guard is to not need * the explicit unlock. */ __release(RCU); } while (0), rcu_read_unlock()) #endif /* __LINUX_RCUPDATE_H */
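/*
 * Illustrative sketch (not part of rcupdate.h): a minimal updater/reader
 * pair showing how the APIs documented above are typically combined.  The
 * structure, the global pointer and the lock are hypothetical, and the
 * snippet assumes <linux/slab.h>, <linux/spinlock.h> and <linux/rcupdate.h>
 * are available.
 */
struct my_data {
	int a;
	struct rcu_head rcu;
};

static struct my_data __rcu *my_ptr;
static DEFINE_SPINLOCK(my_lock);

static void my_update(int a)
{
	struct my_data *newp = kmalloc(sizeof(*newp), GFP_KERNEL);
	struct my_data *oldp;

	if (!newp)
		return;
	newp->a = a;

	spin_lock(&my_lock);
	oldp = rcu_dereference_protected(my_ptr, lockdep_is_held(&my_lock));
	rcu_assign_pointer(my_ptr, newp);	/* publish to concurrent readers */
	spin_unlock(&my_lock);

	if (oldp)
		kfree_rcu(oldp, rcu);		/* reclaim after a grace period */
}

static int my_read(void)
{
	struct my_data *p;
	int a = -1;

	rcu_read_lock();
	p = rcu_dereference(my_ptr);		/* paired with rcu_assign_pointer() */
	if (p)
		a = p->a;
	rcu_read_unlock();

	return a;
}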
8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "ratelimiter.h" #include <linux/siphash.h> #include <linux/mm.h> #include <linux/slab.h> #include <net/ip.h> static struct kmem_cache *entry_cache; static hsiphash_key_t key; static spinlock_t table_lock = __SPIN_LOCK_UNLOCKED("ratelimiter_table_lock"); static DEFINE_MUTEX(init_lock); static u64 init_refcnt; /* Protected by init_lock, hence not atomic. */ static atomic_t total_entries = ATOMIC_INIT(0); static unsigned int max_entries, table_size; static void wg_ratelimiter_gc_entries(struct work_struct *); static DECLARE_DEFERRABLE_WORK(gc_work, wg_ratelimiter_gc_entries); static struct hlist_head *table_v4; #if IS_ENABLED(CONFIG_IPV6) static struct hlist_head *table_v6; #endif struct ratelimiter_entry { u64 last_time_ns, tokens, ip; void *net; spinlock_t lock; struct hlist_node hash; struct rcu_head rcu; }; enum { PACKETS_PER_SECOND = 20, PACKETS_BURSTABLE = 5, PACKET_COST = NSEC_PER_SEC / PACKETS_PER_SECOND, TOKEN_MAX = PACKET_COST * PACKETS_BURSTABLE }; static void entry_free(struct rcu_head *rcu) { kmem_cache_free(entry_cache, container_of(rcu, struct ratelimiter_entry, rcu)); atomic_dec(&total_entries); } static void entry_uninit(struct ratelimiter_entry *entry) { hlist_del_rcu(&entry->hash); call_rcu(&entry->rcu, entry_free); } /* Calling this function with a NULL work uninits all entries. */ static void wg_ratelimiter_gc_entries(struct work_struct *work) { const u64 now = ktime_get_coarse_boottime_ns(); struct ratelimiter_entry *entry; struct hlist_node *temp; unsigned int i; for (i = 0; i < table_size; ++i) { spin_lock(&table_lock); hlist_for_each_entry_safe(entry, temp, &table_v4[i], hash) { if (unlikely(!work) || now - entry->last_time_ns > NSEC_PER_SEC) entry_uninit(entry); } #if IS_ENABLED(CONFIG_IPV6) hlist_for_each_entry_safe(entry, temp, &table_v6[i], hash) { if (unlikely(!work) || now - entry->last_time_ns > NSEC_PER_SEC) entry_uninit(entry); } #endif spin_unlock(&table_lock); if (likely(work)) cond_resched(); } if (likely(work)) queue_delayed_work(system_power_efficient_wq, &gc_work, HZ); } bool wg_ratelimiter_allow(struct sk_buff *skb, struct net *net) { /* We only take the bottom half of the net pointer, so that we can hash * 3 words in the end. This way, siphash's len param fits into the final * u32, and we don't incur an extra round. 
*/ const u32 net_word = (unsigned long)net; struct ratelimiter_entry *entry; struct hlist_head *bucket; u64 ip; if (skb->protocol == htons(ETH_P_IP)) { ip = (u64 __force)ip_hdr(skb)->saddr; bucket = &table_v4[hsiphash_2u32(net_word, ip, &key) & (table_size - 1)]; } #if IS_ENABLED(CONFIG_IPV6) else if (skb->protocol == htons(ETH_P_IPV6)) { /* Only use 64 bits, so as to ratelimit the whole /64. */ memcpy(&ip, &ipv6_hdr(skb)->saddr, sizeof(ip)); bucket = &table_v6[hsiphash_3u32(net_word, ip >> 32, ip, &key) & (table_size - 1)]; } #endif else return false; rcu_read_lock(); hlist_for_each_entry_rcu(entry, bucket, hash) { if (entry->net == net && entry->ip == ip) { u64 now, tokens; bool ret; /* Quasi-inspired by nft_limit.c, but this is actually a * slightly different algorithm. Namely, we incorporate * the burst as part of the maximum tokens, rather than * as part of the rate. */ spin_lock(&entry->lock); now = ktime_get_coarse_boottime_ns(); tokens = min_t(u64, TOKEN_MAX, entry->tokens + now - entry->last_time_ns); entry->last_time_ns = now; ret = tokens >= PACKET_COST; entry->tokens = ret ? tokens - PACKET_COST : tokens; spin_unlock(&entry->lock); rcu_read_unlock(); return ret; } } rcu_read_unlock(); if (atomic_inc_return(&total_entries) > max_entries) goto err_oom; entry = kmem_cache_alloc(entry_cache, GFP_KERNEL); if (unlikely(!entry)) goto err_oom; entry->net = net; entry->ip = ip; INIT_HLIST_NODE(&entry->hash); spin_lock_init(&entry->lock); entry->last_time_ns = ktime_get_coarse_boottime_ns(); entry->tokens = TOKEN_MAX - PACKET_COST; spin_lock(&table_lock); hlist_add_head_rcu(&entry->hash, bucket); spin_unlock(&table_lock); return true; err_oom: atomic_dec(&total_entries); return false; } int wg_ratelimiter_init(void) { mutex_lock(&init_lock); if (++init_refcnt != 1) goto out; entry_cache = KMEM_CACHE(ratelimiter_entry, 0); if (!entry_cache) goto err; /* xt_hashlimit.c uses a slightly different algorithm for ratelimiting, * but what it shares in common is that it uses a massive hashtable. So, * we borrow their wisdom about good table sizes on different systems * dependent on RAM. This calculation here comes from there. */ table_size = (totalram_pages() > (1U << 30) / PAGE_SIZE) ? 8192 : max_t(unsigned long, 16, roundup_pow_of_two( (totalram_pages() << PAGE_SHIFT) / (1U << 14) / sizeof(struct hlist_head))); max_entries = table_size * 8; table_v4 = kvcalloc(table_size, sizeof(*table_v4), GFP_KERNEL); if (unlikely(!table_v4)) goto err_kmemcache; #if IS_ENABLED(CONFIG_IPV6) table_v6 = kvcalloc(table_size, sizeof(*table_v6), GFP_KERNEL); if (unlikely(!table_v6)) { kvfree(table_v4); goto err_kmemcache; } #endif queue_delayed_work(system_power_efficient_wq, &gc_work, HZ); get_random_bytes(&key, sizeof(key)); out: mutex_unlock(&init_lock); return 0; err_kmemcache: kmem_cache_destroy(entry_cache); err: --init_refcnt; mutex_unlock(&init_lock); return -ENOMEM; } void wg_ratelimiter_uninit(void) { mutex_lock(&init_lock); if (!init_refcnt || --init_refcnt) goto out; cancel_delayed_work_sync(&gc_work); wg_ratelimiter_gc_entries(NULL); rcu_barrier(); kvfree(table_v4); #if IS_ENABLED(CONFIG_IPV6) kvfree(table_v6); #endif kmem_cache_destroy(entry_cache); out: mutex_unlock(&init_lock); } #include "selftest/ratelimiter.c"
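/*
 * Illustrative sketch (not part of ratelimiter.c): the call pattern this
 * API expects from the rest of the driver.  The functions below are
 * hypothetical stand-ins for the real callers; wg_ratelimiter_init() and
 * wg_ratelimiter_uninit() are refcounted under init_lock, so every
 * successful init must be balanced by exactly one uninit.
 */
static int example_device_setup(void)
{
	return wg_ratelimiter_init();	/* may fail with -ENOMEM */
}

static bool example_handshake_rx(struct sk_buff *skb, struct net *net)
{
	/*
	 * Under load, only spend CPU on a handshake packet if its source
	 * address still has tokens in the per-IP bucket.
	 */
	return wg_ratelimiter_allow(skb, net);
}

static void example_device_teardown(void)
{
	wg_ratelimiter_uninit();
}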
/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ #ifndef _NET_BATMAN_ADV_HASH_H_ #define _NET_BATMAN_ADV_HASH_H_ #include "main.h" #include <linux/atomic.h> #include <linux/compiler.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/rculist.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/types.h> /* callback to a compare function. should compare 2 element data for their * keys * * Return: true if same and false if not same */ typedef bool (*batadv_hashdata_compare_cb)(const struct hlist_node *, const void *); /* the hashfunction * * Return: an index based on the key in the data of the first argument and the * size the second */ typedef u32 (*batadv_hashdata_choose_cb)(const void *, u32); typedef void (*batadv_hashdata_free_cb)(struct hlist_node *, void *); /** * struct batadv_hashtable - Wrapper of simple hlist based hashtable */ struct batadv_hashtable { /** @table: the hashtable itself with the buckets */ struct hlist_head *table; /** @list_locks: spinlock for each hash list entry */ spinlock_t *list_locks; /** @size: size of hashtable */ u32 size; /** @generation: current (generation) sequence number */ atomic_t generation; }; /* allocates and clears the hash */ struct batadv_hashtable *batadv_hash_new(u32 size); /* set class key for all locks */ void batadv_hash_set_lock_class(struct batadv_hashtable *hash, struct lock_class_key *key); /* free only the hashtable and the hash itself. */ void batadv_hash_destroy(struct batadv_hashtable *hash); /** * batadv_hash_add() - adds data to the hashtable * @hash: storage hash table * @compare: callback to determine if 2 hash elements are identical * @choose: callback calculating the hash index * @data: data passed to the aforementioned callbacks as argument * @data_node: to be added element * * Return: 0 on success, 1 if the element already is in the hash * and -1 on error.
*/ static inline int batadv_hash_add(struct batadv_hashtable *hash, batadv_hashdata_compare_cb compare, batadv_hashdata_choose_cb choose, const void *data, struct hlist_node *data_node) { u32 index; int ret = -1; struct hlist_head *head; struct hlist_node *node; spinlock_t *list_lock; /* spinlock to protect write access */ if (!hash) goto out; index = choose(data, hash->size); head = &hash->table[index]; list_lock = &hash->list_locks[index]; spin_lock_bh(list_lock); hlist_for_each(node, head) { if (!compare(node, data)) continue; ret = 1; goto unlock; } /* no duplicate found in list, add new element */ hlist_add_head_rcu(data_node, head); atomic_inc(&hash->generation); ret = 0; unlock: spin_unlock_bh(list_lock); out: return ret; } /** * batadv_hash_remove() - Removes data from hash, if found * @hash: hash table * @compare: callback to determine if 2 hash elements are identical * @choose: callback calculating the hash index * @data: data passed to the aforementioned callbacks as argument * * ata could be the structure you use with just the key filled, we just need * the key for comparing. * * Return: returns pointer do data on success, so you can remove the used * structure yourself, or NULL on error */ static inline void *batadv_hash_remove(struct batadv_hashtable *hash, batadv_hashdata_compare_cb compare, batadv_hashdata_choose_cb choose, void *data) { u32 index; struct hlist_node *node; struct hlist_head *head; void *data_save = NULL; index = choose(data, hash->size); head = &hash->table[index]; spin_lock_bh(&hash->list_locks[index]); hlist_for_each(node, head) { if (!compare(node, data)) continue; data_save = node; hlist_del_rcu(node); atomic_inc(&hash->generation); break; } spin_unlock_bh(&hash->list_locks[index]); return data_save; } #endif /* _NET_BATMAN_ADV_HASH_H_ */
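batadv_hash_add() delegates both bucket selection and duplicate detection to caller-supplied choose/compare callbacks. The following is a minimal user-space analogue of that contract, only meant to show how the callbacks and the 0/1 return convention fit together; the real code stores struct hlist_node members, takes a per-bucket spinlock_bh, and returns -1 when the hash is missing, none of which is modelled here.

#include <stdio.h>
#include <stdint.h>

#define TABLE_SIZE 16

struct entry {
	uint32_t key;
	struct entry *next;	/* stand-in for struct hlist_node */
};

typedef int      (*compare_cb)(const struct entry *, const void *);
typedef uint32_t (*choose_cb)(const void *, uint32_t);

static struct entry *table[TABLE_SIZE];

static uint32_t choose_key(const void *data, uint32_t size)
{
	/* real callbacks typically hash the key; modulo is enough here */
	return *(const uint32_t *)data % size;
}

static int compare_key(const struct entry *e, const void *data)
{
	return e->key == *(const uint32_t *)data;
}

/* 0 = added, 1 = already present (mirrors batadv_hash_add's convention) */
static int hash_add(compare_cb compare, choose_cb choose,
		    const void *data, struct entry *e)
{
	uint32_t index = choose(data, TABLE_SIZE);
	struct entry *cur;

	for (cur = table[index]; cur; cur = cur->next)
		if (compare(cur, data))
			return 1;
	e->next = table[index];
	table[index] = e;
	return 0;
}

int main(void)
{
	struct entry a = { .key = 42 }, b = { .key = 42 };

	printf("%d %d\n", hash_add(compare_key, choose_key, &a.key, &a),
			  hash_add(compare_key, choose_key, &b.key, &b)); /* prints "0 1" */
	return 0;
}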
// SPDX-License-Identifier: GPL-2.0 /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Basic Transport Functions exploiting Infiniband API * * Copyright IBM Corp.
2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #include <linux/socket.h> #include <linux/if_vlan.h> #include <linux/random.h> #include <linux/workqueue.h> #include <linux/wait.h> #include <linux/reboot.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/smc.h> #include <net/tcp.h> #include <net/sock.h> #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> #include "smc.h" #include "smc_clc.h" #include "smc_core.h" #include "smc_ib.h" #include "smc_wr.h" #include "smc_llc.h" #include "smc_cdc.h" #include "smc_close.h" #include "smc_ism.h" #include "smc_netlink.h" #include "smc_stats.h" #include "smc_tracepoint.h" #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) #define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ) struct smc_lgr_list smc_lgr_list = { /* established link groups */ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), .list = LIST_HEAD_INIT(smc_lgr_list.list), .num = 0, }; static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */ static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted); static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc); static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft); static void smc_link_down_work(struct work_struct *work); /* return head of link group list and its lock for a given link group */ static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, spinlock_t **lgr_lock) { if (lgr->is_smcd) { *lgr_lock = &lgr->smcd->lgr_lock; return &lgr->smcd->lgr_list; } *lgr_lock = &smc_lgr_list.lock; return &smc_lgr_list.list; } static void smc_ibdev_cnt_inc(struct smc_link *lnk) { atomic_inc(&lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1]); } static void smc_ibdev_cnt_dec(struct smc_link *lnk) { atomic_dec(&lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1]); } static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) { /* client link group creation always follows the server link group * creation. For client use a somewhat higher removal delay time, * otherwise there is a risk of out-of-sync link groups. */ if (!lgr->freeing) { mod_delayed_work(system_percpu_wq, &lgr->free_work, (!lgr->is_smcd && lgr->role == SMC_CLNT) ? SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); } } /* Register connection's alert token in our lookup structure. * To use rbtrees we have to implement our own insert core. * Requires @conns_lock * @smc connection to register * Returns 0 on success, != otherwise. */ static void smc_lgr_add_alert_token(struct smc_connection *conn) { struct rb_node **link, *parent = NULL; u32 token = conn->alert_token_local; link = &conn->lgr->conns_all.rb_node; while (*link) { struct smc_connection *cur = rb_entry(*link, struct smc_connection, alert_node); parent = *link; if (cur->alert_token_local > token) link = &parent->rb_left; else link = &parent->rb_right; } /* Put the new node there */ rb_link_node(&conn->alert_node, parent, link); rb_insert_color(&conn->alert_node, &conn->lgr->conns_all); } /* assign an SMC-R link to the connection */ static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) { enum smc_link_state expected = first ? 
SMC_LNK_ACTIVATING : SMC_LNK_ACTIVE; int i, j; /* do link balancing */ conn->lnk = NULL; /* reset conn->lnk first */ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &conn->lgr->lnk[i]; if (lnk->state != expected || lnk->link_is_asym) continue; if (conn->lgr->role == SMC_CLNT) { conn->lnk = lnk; /* temporary, SMC server assigns link*/ break; } if (conn->lgr->conns_num % 2) { for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) { struct smc_link *lnk2; lnk2 = &conn->lgr->lnk[j]; if (lnk2->state == expected && !lnk2->link_is_asym) { conn->lnk = lnk2; break; } } } if (!conn->lnk) conn->lnk = lnk; break; } if (!conn->lnk) return SMC_CLC_DECL_NOACTLINK; atomic_inc(&conn->lnk->conn_cnt); return 0; } /* Register connection in link group by assigning an alert token * registered in a search tree. * Requires @conns_lock * Note that '0' is a reserved value and not assigned. */ static int smc_lgr_register_conn(struct smc_connection *conn, bool first) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); static atomic_t nexttoken = ATOMIC_INIT(0); int rc; if (!conn->lgr->is_smcd) { rc = smcr_lgr_conn_assign_link(conn, first); if (rc) { conn->lgr = NULL; return rc; } } /* find a new alert_token_local value not yet used by some connection * in this link group */ sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */ while (!conn->alert_token_local) { conn->alert_token_local = atomic_inc_return(&nexttoken); if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr)) conn->alert_token_local = 0; } smc_lgr_add_alert_token(conn); conn->lgr->conns_num++; return 0; } /* Unregister connection and reset the alert token of the given connection< */ static void __smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); struct smc_link_group *lgr = conn->lgr; rb_erase(&conn->alert_node, &lgr->conns_all); if (conn->lnk) atomic_dec(&conn->lnk->conn_cnt); lgr->conns_num--; conn->alert_token_local = 0; sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */ } /* Unregister connection from lgr */ static void smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; if (!smc_conn_lgr_valid(conn)) return; write_lock_bh(&lgr->conns_lock); if (conn->alert_token_local) { __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); } static void smc_lgr_buf_list_add(struct smc_link_group *lgr, bool is_rmb, struct list_head *buf_list, struct smc_buf_desc *buf_desc) { list_add(&buf_desc->list, buf_list); if (is_rmb) { lgr->alloc_rmbs += buf_desc->len; lgr->alloc_rmbs += lgr->is_smcd ? sizeof(struct smcd_cdc_msg) : 0; } else { lgr->alloc_sndbufs += buf_desc->len; } } static void smc_lgr_buf_list_del(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { list_del(&buf_desc->list); if (is_rmb) { lgr->alloc_rmbs -= buf_desc->len; lgr->alloc_rmbs -= lgr->is_smcd ? 
sizeof(struct smcd_cdc_msg) : 0; } else { lgr->alloc_sndbufs -= buf_desc->len; } } int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); char hostname[SMC_MAX_HOSTNAME_LEN + 1]; char smc_seid[SMC_MAX_EID_LEN + 1]; struct nlattr *attrs; u8 *seid = NULL; u8 *host = NULL; void *nlh; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_SYS_INFO); if (!nlh) goto errmsg; if (cb_ctx->pos[0]) goto errout; attrs = nla_nest_start(skb, SMC_GEN_SYS_INFO); if (!attrs) goto errout; if (nla_put_u8(skb, SMC_NLA_SYS_VER, SMC_V2)) goto errattr; if (nla_put_u8(skb, SMC_NLA_SYS_REL, SMC_RELEASE)) goto errattr; if (nla_put_u8(skb, SMC_NLA_SYS_IS_ISM_V2, smc_ism_is_v2_capable())) goto errattr; if (nla_put_u8(skb, SMC_NLA_SYS_IS_SMCR_V2, true)) goto errattr; smc_clc_get_hostname(&host); if (host) { memcpy(hostname, host, SMC_MAX_HOSTNAME_LEN); hostname[SMC_MAX_HOSTNAME_LEN] = 0; if (nla_put_string(skb, SMC_NLA_SYS_LOCAL_HOST, hostname)) goto errattr; } if (smc_ism_is_v2_capable()) { smc_ism_get_system_eid(&seid); memcpy(smc_seid, seid, SMC_MAX_EID_LEN); smc_seid[SMC_MAX_EID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_SYS_SEID, smc_seid)) goto errattr; } nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); cb_ctx->pos[0] = 1; return skb->len; errattr: nla_nest_cancel(skb, attrs); errout: genlmsg_cancel(skb, nlh); errmsg: return skb->len; } /* Fill SMC_NLA_LGR_D_V2_COMMON/SMC_NLA_LGR_R_V2_COMMON nested attributes */ static int smc_nl_fill_lgr_v2_common(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb, struct nlattr *v2_attrs) { char smc_host[SMC_MAX_HOSTNAME_LEN + 1]; char smc_eid[SMC_MAX_EID_LEN + 1]; if (nla_put_u8(skb, SMC_NLA_LGR_V2_VER, lgr->smc_version)) goto errv2attr; if (nla_put_u8(skb, SMC_NLA_LGR_V2_REL, lgr->peer_smc_release)) goto errv2attr; if (nla_put_u8(skb, SMC_NLA_LGR_V2_OS, lgr->peer_os)) goto errv2attr; memcpy(smc_host, lgr->peer_hostname, SMC_MAX_HOSTNAME_LEN); smc_host[SMC_MAX_HOSTNAME_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_V2_PEER_HOST, smc_host)) goto errv2attr; memcpy(smc_eid, lgr->negotiated_eid, SMC_MAX_EID_LEN); smc_eid[SMC_MAX_EID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_V2_NEG_EID, smc_eid)) goto errv2attr; nla_nest_end(skb, v2_attrs); return 0; errv2attr: nla_nest_cancel(skb, v2_attrs); return -EMSGSIZE; } static int smc_nl_fill_smcr_lgr_v2(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *v2_attrs; v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2); if (!v2_attrs) goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_DIRECT, !lgr->uses_gateway)) goto errv2attr; if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_MAX_CONNS, lgr->max_conns)) goto errv2attr; if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_MAX_LINKS, lgr->max_links)) goto errv2attr; nla_nest_end(skb, v2_attrs); return 0; errv2attr: nla_nest_cancel(skb, v2_attrs); errattr: return -EMSGSIZE; } static int smc_nl_fill_lgr(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb) { char smc_target[SMC_MAX_PNETID_LEN + 1]; struct nlattr *attrs, *v2_attrs; attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCR); if (!attrs) goto errout; if (nla_put_u32(skb, SMC_NLA_LGR_R_ID, *((u32 *)&lgr->id))) goto errattr; if (nla_put_u32(skb, SMC_NLA_LGR_R_CONNS_NUM, lgr->conns_num)) goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_ROLE, lgr->role)) goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_TYPE, lgr->type)) goto errattr; if 
(nla_put_u8(skb, SMC_NLA_LGR_R_BUF_TYPE, lgr->buf_type)) goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_VLAN_ID, lgr->vlan_id)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_LGR_R_NET_COOKIE, lgr->net->net_cookie, SMC_NLA_LGR_R_PAD)) goto errattr; memcpy(smc_target, lgr->pnet_id, SMC_MAX_PNETID_LEN); smc_target[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target)) goto errattr; if (nla_put_uint(skb, SMC_NLA_LGR_R_SNDBUF_ALLOC, lgr->alloc_sndbufs)) goto errattr; if (nla_put_uint(skb, SMC_NLA_LGR_R_RMB_ALLOC, lgr->alloc_rmbs)) goto errattr; if (lgr->smc_version > SMC_V1) { v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2_COMMON); if (!v2_attrs) goto errattr; if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs)) goto errattr; if (smc_nl_fill_smcr_lgr_v2(lgr, skb, cb)) goto errattr; } nla_nest_end(skb, attrs); return 0; errattr: nla_nest_cancel(skb, attrs); errout: return -EMSGSIZE; } static int smc_nl_fill_lgr_link(struct smc_link_group *lgr, struct smc_link *link, struct sk_buff *skb, struct netlink_callback *cb) { char smc_ibname[IB_DEVICE_NAME_MAX]; u8 smc_gid_target[41]; struct nlattr *attrs; u32 link_uid = 0; void *nlh; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_LINK_SMCR); if (!nlh) goto errmsg; attrs = nla_nest_start(skb, SMC_GEN_LINK_SMCR); if (!attrs) goto errout; if (nla_put_u8(skb, SMC_NLA_LINK_ID, link->link_id)) goto errattr; if (nla_put_u32(skb, SMC_NLA_LINK_STATE, link->state)) goto errattr; if (nla_put_u32(skb, SMC_NLA_LINK_CONN_CNT, atomic_read(&link->conn_cnt))) goto errattr; if (nla_put_u8(skb, SMC_NLA_LINK_IB_PORT, link->ibport)) goto errattr; if (nla_put_u32(skb, SMC_NLA_LINK_NET_DEV, link->ndev_ifidx)) goto errattr; snprintf(smc_ibname, sizeof(smc_ibname), "%s", link->ibname); if (nla_put_string(skb, SMC_NLA_LINK_IB_DEV, smc_ibname)) goto errattr; memcpy(&link_uid, link->link_uid, sizeof(link_uid)); if (nla_put_u32(skb, SMC_NLA_LINK_UID, link_uid)) goto errattr; memcpy(&link_uid, link->peer_link_uid, sizeof(link_uid)); if (nla_put_u32(skb, SMC_NLA_LINK_PEER_UID, link_uid)) goto errattr; memset(smc_gid_target, 0, sizeof(smc_gid_target)); smc_gid_be16_convert(smc_gid_target, link->gid); if (nla_put_string(skb, SMC_NLA_LINK_GID, smc_gid_target)) goto errattr; memset(smc_gid_target, 0, sizeof(smc_gid_target)); smc_gid_be16_convert(smc_gid_target, link->peer_gid); if (nla_put_string(skb, SMC_NLA_LINK_PEER_GID, smc_gid_target)) goto errattr; nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); return 0; errattr: nla_nest_cancel(skb, attrs); errout: genlmsg_cancel(skb, nlh); errmsg: return -EMSGSIZE; } static int smc_nl_handle_lgr(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb, bool list_links) { void *nlh; int i; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_LGR_SMCR); if (!nlh) goto errmsg; if (smc_nl_fill_lgr(lgr, skb, cb)) goto errout; genlmsg_end(skb, nlh); if (!list_links) goto out; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_usable(&lgr->lnk[i])) continue; if (smc_nl_fill_lgr_link(lgr, &lgr->lnk[i], skb, cb)) goto errout; } out: return 0; errout: genlmsg_cancel(skb, nlh); errmsg: return -EMSGSIZE; } static void smc_nl_fill_lgr_list(struct smc_lgr_list *smc_lgr, struct sk_buff *skb, struct netlink_callback *cb, bool list_links) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct smc_link_group *lgr; int snum = cb_ctx->pos[0]; int num = 0; 
spin_lock_bh(&smc_lgr->lock); list_for_each_entry(lgr, &smc_lgr->list, list) { if (num < snum) goto next; if (smc_nl_handle_lgr(lgr, skb, cb, list_links)) goto errout; next: num++; } errout: spin_unlock_bh(&smc_lgr->lock); cb_ctx->pos[0] = num; } static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb) { char smc_pnet[SMC_MAX_PNETID_LEN + 1]; struct smcd_dev *smcd = lgr->smcd; struct smcd_gid smcd_gid; struct nlattr *attrs; void *nlh; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_LGR_SMCD); if (!nlh) goto errmsg; attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCD); if (!attrs) goto errout; if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id))) goto errattr; copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, smcd_gid.gid, SMC_NLA_LGR_D_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_EXT_GID, smcd_gid.gid_ext, SMC_NLA_LGR_D_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_GID, lgr->peer_gid.gid, SMC_NLA_LGR_D_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_EXT_GID, lgr->peer_gid.gid_ext, SMC_NLA_LGR_D_PAD)) goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_D_VLAN_ID, lgr->vlan_id)) goto errattr; if (nla_put_u32(skb, SMC_NLA_LGR_D_CONNS_NUM, lgr->conns_num)) goto errattr; if (nla_put_u32(skb, SMC_NLA_LGR_D_CHID, smc_ism_get_chid(lgr->smcd))) goto errattr; if (nla_put_uint(skb, SMC_NLA_LGR_D_SNDBUF_ALLOC, lgr->alloc_sndbufs)) goto errattr; if (nla_put_uint(skb, SMC_NLA_LGR_D_DMB_ALLOC, lgr->alloc_rmbs)) goto errattr; memcpy(smc_pnet, lgr->smcd->pnetid, SMC_MAX_PNETID_LEN); smc_pnet[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_D_PNETID, smc_pnet)) goto errattr; if (lgr->smc_version > SMC_V1) { struct nlattr *v2_attrs; v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_D_V2_COMMON); if (!v2_attrs) goto errattr; if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs)) goto errattr; } nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); return 0; errattr: nla_nest_cancel(skb, attrs); errout: genlmsg_cancel(skb, nlh); errmsg: return -EMSGSIZE; } static int smc_nl_handle_smcd_lgr(struct smcd_dev *dev, struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct smc_link_group *lgr; int snum = cb_ctx->pos[1]; int rc = 0, num = 0; spin_lock_bh(&dev->lgr_lock); list_for_each_entry(lgr, &dev->lgr_list, list) { if (!lgr->is_smcd) continue; if (num < snum) goto next; rc = smc_nl_fill_smcd_lgr(lgr, skb, cb); if (rc) goto errout; next: num++; } errout: spin_unlock_bh(&dev->lgr_lock); cb_ctx->pos[1] = num; return rc; } static int smc_nl_fill_smcd_dev(struct smcd_dev_list *dev_list, struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct smcd_dev *smcd_dev; int snum = cb_ctx->pos[0]; int rc = 0, num = 0; mutex_lock(&dev_list->mutex); list_for_each_entry(smcd_dev, &dev_list->list, list) { if (list_empty(&smcd_dev->lgr_list)) continue; if (num < snum) goto next; rc = smc_nl_handle_smcd_lgr(smcd_dev, skb, cb); if (rc) goto errout; next: num++; } errout: mutex_unlock(&dev_list->mutex); cb_ctx->pos[0] = num; return rc; } int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) { bool list_links = false; smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links); return skb->len; } int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb) { bool list_links = true; 
smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links); return skb->len; } int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) { smc_nl_fill_smcd_dev(&smcd_dev_list, skb, cb); return skb->len; } void smc_lgr_cleanup_early(struct smc_link_group *lgr) { spinlock_t *lgr_lock; if (!lgr) return; smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); /* do not use this link group for new connections */ if (!list_empty(&lgr->list)) list_del_init(&lgr->list); spin_unlock_bh(lgr_lock); __smc_lgr_terminate(lgr, true); } static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr) { int i; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; if (smc_link_sendable(lnk)) lnk->state = SMC_LNK_INACTIVE; } wake_up_all(&lgr->llc_msg_waiter); wake_up_all(&lgr->llc_flow_waiter); } static void smc_lgr_free(struct smc_link_group *lgr); static void smc_lgr_free_work(struct work_struct *work) { struct smc_link_group *lgr = container_of(to_delayed_work(work), struct smc_link_group, free_work); spinlock_t *lgr_lock; bool conns; smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); if (lgr->freeing) { spin_unlock_bh(lgr_lock); return; } read_lock_bh(&lgr->conns_lock); conns = RB_EMPTY_ROOT(&lgr->conns_all); read_unlock_bh(&lgr->conns_lock); if (!conns) { /* number of lgr connections is no longer zero */ spin_unlock_bh(lgr_lock); return; } list_del_init(&lgr->list); /* remove from smc_lgr_list */ lgr->freeing = 1; /* this instance does the freeing, no new schedule */ spin_unlock_bh(lgr_lock); cancel_delayed_work(&lgr->free_work); if (!lgr->is_smcd && !lgr->terminating) smc_llc_send_link_delete_all(lgr, true, SMC_LLC_DEL_PROG_INIT_TERM); if (lgr->is_smcd && !lgr->terminating) smc_ism_signal_shutdown(lgr); if (!lgr->is_smcd) smcr_lgr_link_deactivate_all(lgr); smc_lgr_free(lgr); } static void smc_lgr_terminate_work(struct work_struct *work) { struct smc_link_group *lgr = container_of(work, struct smc_link_group, terminate_work); __smc_lgr_terminate(lgr, true); } /* return next unique link id for the lgr */ static u8 smcr_next_link_id(struct smc_link_group *lgr) { u8 link_id; int i; while (1) { again: link_id = ++lgr->next_link_id; if (!link_id) /* skip zero as link_id */ link_id = ++lgr->next_link_id; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (smc_link_usable(&lgr->lnk[i]) && lgr->lnk[i].link_id == link_id) goto again; } break; } return link_id; } static void smcr_copy_dev_info_to_link(struct smc_link *link) { struct smc_ib_device *smcibdev = link->smcibdev; snprintf(link->ibname, sizeof(link->ibname), "%s", smcibdev->ibdev->name); link->ndev_ifidx = smcibdev->ndev_ifidx[link->ibport - 1]; } int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini) { struct smc_ib_device *smcibdev; u8 rndvec[3]; int rc; if (lgr->smc_version == SMC_V2) { lnk->smcibdev = ini->smcrv2.ib_dev_v2; lnk->ibport = ini->smcrv2.ib_port_v2; lnk->wr_rx_sge_cnt = lnk->smcibdev->ibdev->attrs.max_recv_sge < 2 ? 1 : 2; lnk->wr_rx_buflen = smc_link_shared_v2_rxbuf(lnk) ? 
SMC_WR_BUF_SIZE : SMC_WR_BUF_V2_SIZE; } else { lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; lnk->wr_rx_sge_cnt = 1; lnk->wr_rx_buflen = SMC_WR_BUF_SIZE; } get_device(&lnk->smcibdev->ibdev->dev); atomic_inc(&lnk->smcibdev->lnk_cnt); refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */ lnk->clearing = 0; lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; lnk->link_id = smcr_next_link_id(lgr); lnk->max_send_wr = lgr->max_send_wr; lnk->max_recv_wr = lgr->max_recv_wr; lnk->lgr = lgr; smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ lnk->link_idx = link_idx; lnk->wr_rx_id_compl = 0; smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); atomic_set(&lnk->conn_cnt, 0); smc_llc_link_set_uid(lnk); INIT_WORK(&lnk->link_down_wrk, smc_link_down_work); if (!lnk->smcibdev->initialized) { rc = (int)smc_ib_setup_per_ibdev(lnk->smcibdev); if (rc) goto out; } get_random_bytes(rndvec, sizeof(rndvec)); lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, ini->vlan_id, lnk->gid, &lnk->sgid_index, lgr->smc_version == SMC_V2 ? &ini->smcrv2 : NULL); if (rc) goto out; rc = smc_llc_link_init(lnk); if (rc) goto out; rc = smc_ib_create_protection_domain(lnk); if (rc) goto clear_llc_lnk; do { rc = smc_ib_create_queue_pair(lnk); if (rc) goto dealloc_pd; rc = smc_wr_alloc_link_mem(lnk); if (!rc) break; else if (rc != -ENOMEM) /* give up */ goto destroy_qp; /* retry with smaller ... */ lnk->max_send_wr /= 2; lnk->max_recv_wr /= 2; /* ... unless droping below old SMC_WR_BUF_SIZE */ if (lnk->max_send_wr < 16 || lnk->max_recv_wr < 48) goto destroy_qp; smc_ib_destroy_queue_pair(lnk); } while (1); rc = smc_wr_create_link(lnk); if (rc) goto free_link_mem; lnk->state = SMC_LNK_ACTIVATING; return 0; free_link_mem: smc_wr_free_link_mem(lnk); destroy_qp: smc_ib_destroy_queue_pair(lnk); dealloc_pd: smc_ib_dealloc_protection_domain(lnk); clear_llc_lnk: smc_llc_link_clear(lnk, false); out: smc_ibdev_cnt_dec(lnk); put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); smc_lgr_put(lgr); /* lgr_hold above */ return rc; } /* create a new SMC link group */ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_link_group *lgr; struct list_head *lgr_list; struct smcd_dev *smcd; struct smc_link *lnk; spinlock_t *lgr_lock; u8 link_idx; int rc = 0; int i; if (ini->is_smcd && ini->vlan_id) { if (smc_ism_get_vlan(ini->ism_dev[ini->ism_selected], ini->vlan_id)) { rc = SMC_CLC_DECL_ISMVLANERR; goto out; } } lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); if (!lgr) { rc = SMC_CLC_DECL_MEM; goto ism_put_vlan; } lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", WQ_PERCPU, 0, SMC_LGR_ID_SIZE, &lgr->id); if (!lgr->tx_wq) { rc = -ENOMEM; goto free_lgr; } lgr->is_smcd = ini->is_smcd; lgr->sync_err = 0; lgr->terminating = 0; lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; refcount_set(&lgr->refcnt, 1); /* set lgr refcnt to 1 */ init_rwsem(&lgr->sndbufs_lock); init_rwsem(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); } lgr->next_link_id = 0; smc_lgr_list.num += SMC_LGR_NUM_INCR; memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); INIT_WORK(&lgr->terminate_work, smc_lgr_terminate_work); lgr->conns_all = 
RB_ROOT; if (ini->is_smcd) { /* SMC-D specific settings */ smcd = ini->ism_dev[ini->ism_selected]; get_device(&smcd->dibs->dev); lgr->peer_gid.gid = ini->ism_peer_gid[ini->ism_selected].gid; lgr->peer_gid.gid_ext = ini->ism_peer_gid[ini->ism_selected].gid_ext; lgr->smcd = ini->ism_dev[ini->ism_selected]; lgr_list = &ini->ism_dev[ini->ism_selected]->lgr_list; lgr_lock = &lgr->smcd->lgr_lock; lgr->smc_version = ini->smcd_version; lgr->peer_shutdown = 0; atomic_inc(&ini->ism_dev[ini->ism_selected]->lgr_cnt); } else { /* SMC-R specific settings */ struct smc_ib_device *ibdev; int ibport; lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; lgr->smc_version = ini->smcr_version; memcpy(lgr->peer_systemid, ini->peer_systemid, SMC_SYSTEMID_LEN); if (lgr->smc_version == SMC_V2) { ibdev = ini->smcrv2.ib_dev_v2; ibport = ini->smcrv2.ib_port_v2; lgr->saddr = ini->smcrv2.saddr; lgr->uses_gateway = ini->smcrv2.uses_gateway; memcpy(lgr->nexthop_mac, ini->smcrv2.nexthop_mac, ETH_ALEN); lgr->max_conns = ini->max_conns; lgr->max_links = ini->max_links; } else { ibdev = ini->ib_dev; ibport = ini->ib_port; lgr->max_conns = SMC_CONN_PER_LGR_MAX; lgr->max_links = SMC_LINKS_ADD_LNK_MAX; } memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1], SMC_MAX_PNETID_LEN); rc = smc_wr_alloc_lgr_mem(lgr); if (rc) goto free_wq; smc_llc_lgr_init(lgr, smc); link_idx = SMC_SINGLE_LINK; lnk = &lgr->lnk[link_idx]; rc = smcr_link_init(lgr, lnk, link_idx, ini); if (rc) { smc_wr_free_lgr_mem(lgr); goto free_wq; } lgr->net = smc_ib_net(lnk->smcibdev); lgr_list = &smc_lgr_list.list; lgr_lock = &smc_lgr_list.lock; lgr->buf_type = lgr->net->smc.sysctl_smcr_buf_type; atomic_inc(&lgr_cnt); } smc->conn.lgr = lgr; spin_lock_bh(lgr_lock); list_add_tail(&lgr->list, lgr_list); spin_unlock_bh(lgr_lock); return 0; free_wq: destroy_workqueue(lgr->tx_wq); free_lgr: kfree(lgr); ism_put_vlan: if (ini->is_smcd && ini->vlan_id) smc_ism_put_vlan(ini->ism_dev[ini->ism_selected], ini->vlan_id); out: if (rc < 0) { if (rc == -ENOMEM) rc = SMC_CLC_DECL_MEM; else rc = SMC_CLC_DECL_INTERR; } return rc; } static int smc_write_space(struct smc_connection *conn) { int buffer_len = conn->peer_rmbe_size; union smc_host_cursor prod; union smc_host_cursor cons; int space; smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); /* determine rx_buf space */ space = buffer_len - smc_curs_diff(buffer_len, &cons, &prod); return space; } static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend, struct smc_wr_buf *wr_buf) { struct smc_connection *conn = &smc->conn; union smc_host_cursor cons, fin; int rc = 0; int diff; smc_curs_copy(&conn->tx_curs_sent, &conn->tx_curs_fin, conn); smc_curs_copy(&fin, &conn->local_tx_ctrl_fin, conn); /* set prod cursor to old state, enforce tx_rdma_writes() */ smc_curs_copy(&conn->local_tx_ctrl.prod, &fin, conn); smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); if (smc_curs_comp(conn->peer_rmbe_size, &cons, &fin) < 0) { /* cons cursor advanced more than fin, and prod was set * fin above, so now prod is smaller than cons. Fix that. 
*/ diff = smc_curs_diff(conn->peer_rmbe_size, &fin, &cons); smc_curs_add(conn->sndbuf_desc->len, &conn->tx_curs_sent, diff); smc_curs_add(conn->sndbuf_desc->len, &conn->tx_curs_fin, diff); smp_mb__before_atomic(); atomic_add(diff, &conn->sndbuf_space); smp_mb__after_atomic(); smc_curs_add(conn->peer_rmbe_size, &conn->local_tx_ctrl.prod, diff); smc_curs_add(conn->peer_rmbe_size, &conn->local_tx_ctrl_fin, diff); } /* recalculate, value is used by tx_rdma_writes() */ atomic_set(&smc->conn.peer_rmbe_space, smc_write_space(conn)); if (smc->sk.sk_state != SMC_INIT && smc->sk.sk_state != SMC_CLOSED) { rc = smcr_cdc_msg_send_validation(conn, pend, wr_buf); if (!rc) { queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, 0); smc->sk.sk_data_ready(&smc->sk); } } else { smc_wr_tx_put_slot(conn->lnk, (struct smc_wr_tx_pend_priv *)pend); } return rc; } void smc_switch_link_and_count(struct smc_connection *conn, struct smc_link *to_lnk) { atomic_dec(&conn->lnk->conn_cnt); /* link_hold in smc_conn_create() */ smcr_link_put(conn->lnk); conn->lnk = to_lnk; atomic_inc(&conn->lnk->conn_cnt); /* link_put in smc_conn_free() */ smcr_link_hold(conn->lnk); } struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err) { struct smc_link *to_lnk = NULL; struct smc_cdc_tx_pend *pend; struct smc_connection *conn; struct smc_wr_buf *wr_buf; struct smc_sock *smc; struct rb_node *node; int i, rc = 0; /* link is inactive, wake up tx waiters */ smc_wr_wakeup_tx_wait(from_lnk); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i]) || i == from_lnk->link_idx) continue; if (is_dev_err && from_lnk->smcibdev == lgr->lnk[i].smcibdev && from_lnk->ibport == lgr->lnk[i].ibport) { continue; } to_lnk = &lgr->lnk[i]; break; } if (!to_lnk || !smc_wr_tx_link_hold(to_lnk)) { smc_lgr_terminate_sched(lgr); return NULL; } again: read_lock_bh(&lgr->conns_lock); for (node = rb_first(&lgr->conns_all); node; node = rb_next(node)) { conn = rb_entry(node, struct smc_connection, alert_node); if (conn->lnk != from_lnk) continue; smc = container_of(conn, struct smc_sock, conn); /* conn->lnk not yet set in SMC_INIT state */ if (smc->sk.sk_state == SMC_INIT) continue; if (smc->sk.sk_state == SMC_CLOSED || smc->sk.sk_state == SMC_PEERCLOSEWAIT1 || smc->sk.sk_state == SMC_PEERCLOSEWAIT2 || smc->sk.sk_state == SMC_APPFINCLOSEWAIT || smc->sk.sk_state == SMC_APPCLOSEWAIT1 || smc->sk.sk_state == SMC_APPCLOSEWAIT2 || smc->sk.sk_state == SMC_PEERFINCLOSEWAIT || smc->sk.sk_state == SMC_PEERABORTWAIT || smc->sk.sk_state == SMC_PROCESSABORT) { spin_lock_bh(&conn->send_lock); smc_switch_link_and_count(conn, to_lnk); spin_unlock_bh(&conn->send_lock); continue; } sock_hold(&smc->sk); read_unlock_bh(&lgr->conns_lock); /* pre-fetch buffer outside of send_lock, might sleep */ rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend); if (rc) goto err_out; /* avoid race with smcr_tx_sndbuf_nonempty() */ spin_lock_bh(&conn->send_lock); smc_switch_link_and_count(conn, to_lnk); rc = smc_switch_cursor(smc, pend, wr_buf); spin_unlock_bh(&conn->send_lock); sock_put(&smc->sk); if (rc) goto err_out; goto again; } read_unlock_bh(&lgr->conns_lock); smc_wr_tx_link_put(to_lnk); return to_lnk; err_out: smcr_link_down_cond_sched(to_lnk); smc_wr_tx_link_put(to_lnk); return NULL; } static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link_group *lgr) { struct rw_semaphore *lock; /* lock buffer list */ int rc; if (is_rmb && buf_desc->is_conf_rkey && !list_empty(&lgr->list)) { /* 
unregister rmb with peer */ rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (!rc) { /* protect against smc_llc_cli_rkey_exchange() */ down_read(&lgr->llc_conf_mutex); smc_llc_do_delete_rkey(lgr, buf_desc); buf_desc->is_conf_rkey = false; up_read(&lgr->llc_conf_mutex); smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } } if (buf_desc->is_reg_err) { /* buf registration failed, reuse not possible */ lock = is_rmb ? &lgr->rmbs_lock : &lgr->sndbufs_lock; down_write(lock); smc_lgr_buf_list_del(lgr, is_rmb, buf_desc); up_write(lock); smc_buf_free(lgr, is_rmb, buf_desc); } else { /* memzero_explicit provides potential memory barrier semantics */ memzero_explicit(buf_desc->cpu_addr, buf_desc->len); WRITE_ONCE(buf_desc->used, 0); } } static void smcd_buf_detach(struct smc_connection *conn) { struct smcd_dev *smcd = conn->lgr->smcd; u64 peer_token = conn->peer_token; if (!conn->sndbuf_desc) return; smc_ism_detach_dmb(smcd, peer_token); kfree(conn->sndbuf_desc); conn->sndbuf_desc = NULL; } static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); bool is_smcd = lgr->is_smcd; int bufsize; if (conn->sndbuf_desc) { bufsize = conn->sndbuf_desc->len; if (!is_smcd && conn->sndbuf_desc->is_vm) { smcr_buf_unuse(conn->sndbuf_desc, false, lgr); } else { memzero_explicit(conn->sndbuf_desc->cpu_addr, bufsize); WRITE_ONCE(conn->sndbuf_desc->used, 0); } SMC_STAT_RMB_SIZE(smc, is_smcd, false, false, bufsize); } if (conn->rmb_desc) { bufsize = conn->rmb_desc->len; if (!is_smcd) { smcr_buf_unuse(conn->rmb_desc, true, lgr); } else { bufsize += sizeof(struct smcd_cdc_msg); memzero_explicit(conn->rmb_desc->cpu_addr, bufsize); WRITE_ONCE(conn->rmb_desc->used, 0); } SMC_STAT_RMB_SIZE(smc, is_smcd, true, false, bufsize); } } /* remove a finished connection from its link group */ void smc_conn_free(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; if (!lgr || conn->freed) /* Connection has never been registered in a * link group, or has already been freed. */ return; conn->freed = 1; if (!smc_conn_lgr_valid(conn)) /* Connection has already unregistered from * link group. 
*/ goto lgr_put; if (lgr->is_smcd) { if (!list_empty(&lgr->list)) smc_ism_unset_conn(conn); if (smc_ism_support_dmb_nocopy(lgr->smcd)) smcd_buf_detach(conn); tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_wait_pend_tx_wr(conn); if (current_work() != &conn->abort_work) cancel_work_sync(&conn->abort_work); } if (!list_empty(&lgr->list)) { smc_buf_unuse(conn, lgr); /* allow buffer reuse */ smc_lgr_unregister_conn(conn); } if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); lgr_put: if (!lgr->is_smcd) smcr_link_put(conn->lnk); /* link_hold in smc_conn_create() */ smc_lgr_put(lgr); /* lgr_hold in smc_conn_create() */ } /* unregister a link from a buf_desc */ static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link *lnk) { if (is_rmb || buf_desc->is_vm) buf_desc->is_reg_mr[lnk->link_idx] = false; if (!buf_desc->is_map_ib[lnk->link_idx]) return; if ((is_rmb || buf_desc->is_vm) && buf_desc->mr[lnk->link_idx]) { smc_ib_put_memory_region(buf_desc->mr[lnk->link_idx]); buf_desc->mr[lnk->link_idx] = NULL; } if (is_rmb) smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); else smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); sg_free_table(&buf_desc->sgt[lnk->link_idx]); buf_desc->is_map_ib[lnk->link_idx] = false; } /* unmap all buffers of lgr for a deleted link */ static void smcr_buf_unmap_lgr(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; struct smc_buf_desc *buf_desc, *bf; int i; for (i = 0; i < SMC_RMBE_SIZES; i++) { down_write(&lgr->rmbs_lock); list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) smcr_buf_unmap_link(buf_desc, true, lnk); up_write(&lgr->rmbs_lock); down_write(&lgr->sndbufs_lock); list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) smcr_buf_unmap_link(buf_desc, false, lnk); up_write(&lgr->sndbufs_lock); } } static void smcr_rtoken_clear_link(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { lgr->rtokens[i][lnk->link_idx].rkey = 0; lgr->rtokens[i][lnk->link_idx].dma_addr = 0; } } static void __smcr_link_clear(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; struct smc_ib_device *smcibdev; smc_wr_free_link_mem(lnk); smc_ibdev_cnt_dec(lnk); put_device(&lnk->smcibdev->ibdev->dev); smcibdev = lnk->smcibdev; memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */ } /* must be called under lgr->llc_conf_mutex lock */ void smcr_link_clear(struct smc_link *lnk, bool log) { if (!lnk->lgr || lnk->clearing || lnk->state == SMC_LNK_UNUSED) return; lnk->clearing = 1; lnk->peer_qpn = 0; smc_llc_link_clear(lnk, log); smcr_buf_unmap_lgr(lnk); smcr_rtoken_clear_link(lnk); smc_ib_modify_qp_error(lnk); smc_wr_free_link(lnk); smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smcr_link_put(lnk); /* theoretically last link_put */ } void smcr_link_hold(struct smc_link *lnk) { refcount_inc(&lnk->refcnt); } void smcr_link_put(struct smc_link *lnk) { if (refcount_dec_and_test(&lnk->refcnt)) __smcr_link_clear(lnk); } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { int i; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); if (!buf_desc->is_vm && buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); else if (buf_desc->is_vm && buf_desc->cpu_addr) vfree(buf_desc->cpu_addr); kfree(buf_desc); } 
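As an aside on the smcr_link_hold()/smcr_link_put() and smc_lgr_hold()/smc_lgr_put() pairs used throughout this file: the object is created holding one reference, every additional user takes a hold, and whichever put drops the count to zero performs the teardown. The sketch below illustrates that pattern with plain C11 atomics in user space; struct obj and its helpers are hypothetical and the kernel's refcount_t API (with its saturation semantics) is not reproduced.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int refcnt;
	/* payload ... */
};

static struct obj *obj_new(void)
{
	struct obj *o = calloc(1, sizeof(*o));

	if (o)
		atomic_store(&o->refcnt, 1);	/* creation owns one reference */
	return o;
}

static void obj_hold(struct obj *o)
{
	atomic_fetch_add(&o->refcnt, 1);
}

static void obj_put(struct obj *o)
{
	/* fetch_sub returns the previous value; 1 means this was the last ref */
	if (atomic_fetch_sub(&o->refcnt, 1) == 1) {
		printf("last reference dropped, freeing\n");
		free(o);
	}
}

int main(void)
{
	struct obj *o = obj_new();

	obj_hold(o);	/* e.g. a connection starts using the link */
	obj_put(o);	/* connection done */
	obj_put(o);	/* creator done: freed here */
	return 0;
}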
static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb, struct smc_buf_desc *buf_desc) { if (is_dmb) { /* restore original buf len */ buf_desc->len += sizeof(struct smcd_cdc_msg); smc_ism_unregister_dmb(lgr->smcd, buf_desc); } else { kfree(buf_desc->cpu_addr); } kfree(buf_desc); } static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { if (lgr->is_smcd) smcd_buf_free(lgr, is_rmb, buf_desc); else smcr_buf_free(lgr, is_rmb, buf_desc); } static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) { struct smc_buf_desc *buf_desc, *bf_desc; struct list_head *buf_list; int i; for (i = 0; i < SMC_RMBE_SIZES; i++) { if (is_rmb) buf_list = &lgr->rmbs[i]; else buf_list = &lgr->sndbufs[i]; list_for_each_entry_safe(buf_desc, bf_desc, buf_list, list) { smc_lgr_buf_list_del(lgr, is_rmb, buf_desc); smc_buf_free(lgr, is_rmb, buf_desc); } } } static void smc_lgr_free_bufs(struct smc_link_group *lgr) { /* free send buffers */ __smc_lgr_free_bufs(lgr, false); /* free rmbs */ __smc_lgr_free_bufs(lgr, true); } /* won't be freed until no one accesses to lgr anymore */ static void __smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); if (lgr->is_smcd) { if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) wake_up(&lgr->smcd->lgrs_deleted); } else { smc_wr_free_lgr_mem(lgr); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } kfree(lgr); } /* remove a link group */ static void smc_lgr_free(struct smc_link_group *lgr) { int i; if (!lgr->is_smcd) { down_write(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].state != SMC_LNK_UNUSED) smcr_link_clear(&lgr->lnk[i], false); } up_write(&lgr->llc_conf_mutex); smc_llc_lgr_clear(lgr); } destroy_workqueue(lgr->tx_wq); if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); put_device(&lgr->smcd->dibs->dev); } smc_lgr_put(lgr); /* theoretically last lgr_put */ } void smc_lgr_hold(struct smc_link_group *lgr) { refcount_inc(&lgr->refcnt); } void smc_lgr_put(struct smc_link_group *lgr) { if (refcount_dec_and_test(&lgr->refcnt)) __smc_lgr_free(lgr); } static void smc_sk_wake_ups(struct smc_sock *smc) { smc->sk.sk_write_space(&smc->sk); smc->sk.sk_data_ready(&smc->sk); smc->sk.sk_state_change(&smc->sk); } /* kill a connection */ static void smc_conn_kill(struct smc_connection *conn, bool soft) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); if (conn->lgr->is_smcd && conn->lgr->peer_shutdown) conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; else smc_close_abort(conn); conn->killed = 1; smc->sk.sk_err = ECONNABORTED; smc_sk_wake_ups(smc); if (conn->lgr->is_smcd) { smc_ism_unset_conn(conn); if (smc_ism_support_dmb_nocopy(conn->lgr->smcd)) smcd_buf_detach(conn); if (soft) tasklet_kill(&conn->rx_tsklet); else tasklet_unlock_wait(&conn->rx_tsklet); } else { smc_cdc_wait_pend_tx_wr(conn); } smc_lgr_unregister_conn(conn); smc_close_active_abort(smc); } static void smc_lgr_cleanup(struct smc_link_group *lgr) { if (lgr->is_smcd) { smc_ism_signal_shutdown(lgr); } else { u32 rsn = lgr->llc_termination_rsn; if (!rsn) rsn = SMC_LLC_DEL_PROG_INIT_TERM; smc_llc_send_link_delete_all(lgr, false, rsn); smcr_lgr_link_deactivate_all(lgr); } } /* terminate link group * @soft: true if link group shutdown can take its time * false if immediate link group shutdown is required */ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) { struct smc_connection *conn; struct smc_sock *smc; struct rb_node *node; if (lgr->terminating) return; /* 
lgr already terminating */ /* cancel free_work sync, will terminate when lgr->freeing is set */ cancel_delayed_work(&lgr->free_work); lgr->terminating = 1; /* kill remaining link group connections */ read_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); while (node) { read_unlock_bh(&lgr->conns_lock); conn = rb_entry(node, struct smc_connection, alert_node); smc = container_of(conn, struct smc_sock, conn); sock_hold(&smc->sk); /* sock_put below */ lock_sock(&smc->sk); smc_conn_kill(conn, soft); release_sock(&smc->sk); sock_put(&smc->sk); /* sock_hold above */ read_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); } read_unlock_bh(&lgr->conns_lock); smc_lgr_cleanup(lgr); smc_lgr_free(lgr); } /* unlink link group and schedule termination */ void smc_lgr_terminate_sched(struct smc_link_group *lgr) { spinlock_t *lgr_lock; smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); if (list_empty(&lgr->list) || lgr->terminating || lgr->freeing) { spin_unlock_bh(lgr_lock); return; /* lgr already terminating */ } list_del_init(&lgr->list); lgr->freeing = 1; spin_unlock_bh(lgr_lock); schedule_work(&lgr->terminate_work); } /* Called when peer lgr shutdown (regularly or abnormally) is received */ void smc_smcd_terminate(struct smcd_dev *dev, struct smcd_gid *peer_gid, unsigned short vlan) { struct smc_link_group *lgr, *l; LIST_HEAD(lgr_free_list); /* run common cleanup function and build free list */ spin_lock_bh(&dev->lgr_lock); list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) { if ((!peer_gid->gid || (lgr->peer_gid.gid == peer_gid->gid && !smc_ism_is_emulated(dev) ? 1 : lgr->peer_gid.gid_ext == peer_gid->gid_ext)) && (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) { if (peer_gid->gid) /* peer triggered termination */ lgr->peer_shutdown = 1; list_move(&lgr->list, &lgr_free_list); lgr->freeing = 1; } } spin_unlock_bh(&dev->lgr_lock); /* cancel the regular free workers and actually free lgrs */ list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { list_del_init(&lgr->list); schedule_work(&lgr->terminate_work); } } /* Called when an SMCD device is removed or the smc module is unloaded */ void smc_smcd_terminate_all(struct smcd_dev *smcd) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); spin_lock_bh(&smcd->lgr_lock); list_splice_init(&smcd->lgr_list, &lgr_free_list); list_for_each_entry(lgr, &lgr_free_list, list) lgr->freeing = 1; spin_unlock_bh(&smcd->lgr_lock); list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { list_del_init(&lgr->list); __smc_lgr_terminate(lgr, false); } if (atomic_read(&smcd->lgr_cnt)) wait_event(smcd->lgrs_deleted, !atomic_read(&smcd->lgr_cnt)); } /* Called when an SMCR device is removed or the smc module is unloaded. * If smcibdev is given, all SMCR link groups using this device are terminated. * If smcibdev is NULL, all SMCR link groups are terminated. 
*/ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); int i; spin_lock_bh(&smc_lgr_list.lock); if (!smcibdev) { list_splice_init(&smc_lgr_list.list, &lgr_free_list); list_for_each_entry(lgr, &lgr_free_list, list) lgr->freeing = 1; } else { list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].smcibdev == smcibdev) smcr_link_down_cond_sched(&lgr->lnk[i]); } } } spin_unlock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { list_del_init(&lgr->list); smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_OP_INIT_TERM); __smc_lgr_terminate(lgr, false); } if (smcibdev) { if (atomic_read(&smcibdev->lnk_cnt)) wait_event(smcibdev->lnks_deleted, !atomic_read(&smcibdev->lnk_cnt)); } else { if (atomic_read(&lgr_cnt)) wait_event(lgrs_deleted, !atomic_read(&lgr_cnt)); } } /* set new lgr type and clear all asymmetric link tagging */ void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type) { char *lgr_type = ""; int i; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) if (smc_link_usable(&lgr->lnk[i])) lgr->lnk[i].link_is_asym = false; if (lgr->type == new_type) return; lgr->type = new_type; switch (lgr->type) { case SMC_LGR_NONE: lgr_type = "NONE"; break; case SMC_LGR_SINGLE: lgr_type = "SINGLE"; break; case SMC_LGR_SYMMETRIC: lgr_type = "SYMMETRIC"; break; case SMC_LGR_ASYMMETRIC_PEER: lgr_type = "ASYMMETRIC_PEER"; break; case SMC_LGR_ASYMMETRIC_LOCAL: lgr_type = "ASYMMETRIC_LOCAL"; break; } pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu state changed: " "%s, pnetid %.16s\n", SMC_LGR_ID_SIZE, &lgr->id, lgr->net->net_cookie, lgr_type, lgr->pnet_id); } /* set new lgr type and tag a link as asymmetric */ void smcr_lgr_set_type_asym(struct smc_link_group *lgr, enum smc_lgr_type new_type, int asym_lnk_idx) { smcr_lgr_set_type(lgr, new_type); lgr->lnk[asym_lnk_idx].link_is_asym = true; } /* abort connection, abort_work scheduled from tasklet context */ static void smc_conn_abort_work(struct work_struct *work) { struct smc_connection *conn = container_of(work, struct smc_connection, abort_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); lock_sock(&smc->sk); smc_conn_kill(conn, true); release_sock(&smc->sk); sock_put(&smc->sk); /* sock_hold done by schedulers of abort_work */ } void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *n; spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { struct smc_link *link; if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, SMC_MAX_PNETID_LEN) || lgr->type == SMC_LGR_SYMMETRIC || lgr->type == SMC_LGR_ASYMMETRIC_PEER || !rdma_dev_access_netns(smcibdev->ibdev, lgr->net)) continue; if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) continue; /* trigger local add link processing */ link = smc_llc_usable_link(lgr); if (link) smc_llc_add_link_local(link); } spin_unlock_bh(&smc_lgr_list.lock); } /* link is down - switch connections to alternate link, * must be called under lgr->llc_conf_mutex lock */ static void smcr_link_down(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; struct smc_link *to_lnk; int del_link_id; if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list)) return; to_lnk = smc_switch_conns(lgr, lnk, true); if (!to_lnk) { /* no backup link available */ smcr_link_clear(lnk, true); return; } smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); del_link_id = 
lnk->link_id; if (lgr->role == SMC_SERV) { /* trigger local delete link processing */ smc_llc_srv_delete_link_local(to_lnk, del_link_id); } else { if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { /* another llc task is ongoing */ up_write(&lgr->llc_conf_mutex); wait_event_timeout(lgr->llc_flow_waiter, (list_empty(&lgr->list) || lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), SMC_LLC_WAIT_TIME); down_write(&lgr->llc_conf_mutex); } if (!list_empty(&lgr->list)) { smc_llc_send_delete_link(to_lnk, del_link_id, SMC_LLC_REQ, true, SMC_LLC_DEL_LOST_PATH); smcr_link_clear(lnk, true); } wake_up(&lgr->llc_flow_waiter); /* wake up next waiter */ } } /* must be called under lgr->llc_conf_mutex lock */ void smcr_link_down_cond(struct smc_link *lnk) { if (smc_link_downing(&lnk->state)) { trace_smcr_link_down(lnk, __builtin_return_address(0)); smcr_link_down(lnk); } } /* will get the lgr->llc_conf_mutex lock */ void smcr_link_down_cond_sched(struct smc_link *lnk) { if (smc_link_downing(&lnk->state)) { trace_smcr_link_down(lnk, __builtin_return_address(0)); smcr_link_hold(lnk); /* smcr_link_put in link_down_wrk */ if (!schedule_work(&lnk->link_down_wrk)) smcr_link_put(lnk); } } void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *n; int i; list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, SMC_MAX_PNETID_LEN)) continue; /* lgr is not affected */ if (list_empty(&lgr->list)) continue; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; if (smc_link_usable(lnk) && lnk->smcibdev == smcibdev && lnk->ibport == ibport) smcr_link_down_cond_sched(lnk); } } } static void smc_link_down_work(struct work_struct *work) { struct smc_link *link = container_of(work, struct smc_link, link_down_wrk); struct smc_link_group *lgr = link->lgr; if (list_empty(&lgr->list)) goto out; wake_up_all(&lgr->llc_msg_waiter); down_write(&lgr->llc_conf_mutex); smcr_link_down(link); up_write(&lgr->llc_conf_mutex); out: smcr_link_put(link); /* smcr_link_hold by schedulers of link_down_work */ } static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev, struct netdev_nested_priv *priv) { unsigned short *vlan_id = (unsigned short *)priv->data; if (is_vlan_dev(lower_dev)) { *vlan_id = vlan_dev_vlan_id(lower_dev); return 1; } return 0; } /* Determine vlan of internal TCP socket. */ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini) { struct netdev_nested_priv priv; struct net_device *ndev; struct dst_entry *dst; int rc = 0; ini->vlan_id = 0; rcu_read_lock(); dst = __sk_dst_get(clcsock->sk); ndev = dst ? 
dst_dev_rcu(dst) : NULL; if (!ndev) { rc = -ENODEV; goto out; } if (is_vlan_dev(ndev)) { ini->vlan_id = vlan_dev_vlan_id(ndev); goto out; } priv.data = (void *)&ini->vlan_id; netdev_walk_all_lower_dev_rcu(ndev, smc_vlan_by_tcpsk_walk, &priv); out: rcu_read_unlock(); return rc; } static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version, u8 peer_systemid[], u8 peer_gid[], u8 peer_mac_v1[], enum smc_lgr_role role, u32 clcqpn, struct net *net) { struct smc_link *lnk; int i; if (memcmp(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN) || lgr->role != role) return false; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { lnk = &lgr->lnk[i]; if (!smc_link_active(lnk)) continue; /* use verbs API to check netns, instead of lgr->net */ if (!rdma_dev_access_netns(lnk->smcibdev->ibdev, net)) return false; if ((lgr->role == SMC_SERV || lnk->peer_qpn == clcqpn) && !memcmp(lnk->peer_gid, peer_gid, SMC_GID_SIZE) && (smcr_version == SMC_V2 || !memcmp(lnk->peer_mac, peer_mac_v1, ETH_ALEN))) return true; } return false; } static bool smcd_lgr_match(struct smc_link_group *lgr, struct smcd_dev *smcismdev, struct smcd_gid *peer_gid) { if (lgr->peer_gid.gid != peer_gid->gid || lgr->smcd != smcismdev) return false; if (smc_ism_is_emulated(smcismdev) && lgr->peer_gid.gid_ext != peer_gid->gid_ext) return false; return true; } /* create a new SMC connection (and a new link group if necessary) */ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; struct net *net = sock_net(&smc->sk); struct list_head *lgr_list; struct smc_link_group *lgr; enum smc_lgr_role role; spinlock_t *lgr_lock; int rc = 0; lgr_list = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_list : &smc_lgr_list.list; lgr_lock = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_lock : &smc_lgr_list.lock; ini->first_contact_local = 1; role = smc->listen_smc ? SMC_SERV : SMC_CLNT; if (role == SMC_CLNT && ini->first_contact_peer) /* create new link group as well */ goto create; /* determine if an existing link group can be reused */ spin_lock_bh(lgr_lock); list_for_each_entry(lgr, lgr_list, list) { write_lock_bh(&lgr->conns_lock); if ((ini->is_smcd ? smcd_lgr_match(lgr, ini->ism_dev[ini->ism_selected], &ini->ism_peer_gid[ini->ism_selected]) : smcr_lgr_match(lgr, ini->smcr_version, ini->peer_systemid, ini->peer_gid, ini->peer_mac, role, ini->ib_clcqpn, net)) && !lgr->sync_err && (ini->smcd_version == SMC_V2 || lgr->vlan_id == ini->vlan_id) && (role == SMC_CLNT || ini->is_smcd || (lgr->conns_num < lgr->max_conns && !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) { /* link group found */ ini->first_contact_local = 0; conn->lgr = lgr; rc = smc_lgr_register_conn(conn, false); write_unlock_bh(&lgr->conns_lock); if (!rc && delayed_work_pending(&lgr->free_work)) cancel_delayed_work(&lgr->free_work); break; } write_unlock_bh(&lgr->conns_lock); } spin_unlock_bh(lgr_lock); if (rc) return rc; if (role == SMC_CLNT && !ini->first_contact_peer && ini->first_contact_local) { /* Server reuses a link group, but Client wants to start * a new one * send out_of_sync decline, reason synchr. 
error */ return SMC_CLC_DECL_SYNCERR; } create: if (ini->first_contact_local) { rc = smc_lgr_create(smc, ini); if (rc) goto out; lgr = conn->lgr; write_lock_bh(&lgr->conns_lock); rc = smc_lgr_register_conn(conn, true); write_unlock_bh(&lgr->conns_lock); if (rc) { smc_lgr_cleanup_early(lgr); goto out; } } smc_lgr_hold(conn->lgr); /* lgr_put in smc_conn_free() */ if (!conn->lgr->is_smcd) smcr_link_hold(conn->lnk); /* link_put in smc_conn_free() */ conn->freed = 0; conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; init_waitqueue_head(&conn->cdc_pend_tx_wq); INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); smcd_cdc_rx_init(conn); /* init tasklet for this conn */ } else { conn->rx_off = 0; } #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&conn->acurs_lock); #endif out: return rc; } #define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ #define SMCR_RMBE_SIZES 15 /* 0 -> 16KB, 1 -> 32KB, .. 15 -> 512MB */ /* convert the RMB size into the compressed notation (minimum 16K, see * SMCD/R_DMBE_SIZES. * In contrast to plain ilog2, this rounds towards the next power of 2, * so the socket application gets at least its desired sndbuf / rcvbuf size. */ static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) { u8 compressed; if (size <= SMC_BUF_MIN_SIZE) return 0; size = (size - 1) >> 14; /* convert to 16K multiple */ compressed = min_t(u8, ilog2(size) + 1, is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); #ifdef CONFIG_ARCH_NO_SG_CHAIN if (!is_smcd && is_rmb) /* RMBs are backed by & limited to max size of scatterlists */ compressed = min_t(u8, compressed, ilog2((SG_MAX_SINGLE_ALLOC * PAGE_SIZE) >> 14)); #endif return compressed; } /* convert the RMB size from compressed notation into integer */ int smc_uncompress_bufsize(u8 compressed) { u32 size; size = 0x00000001 << (((int)compressed) + 14); return (int)size; } /* try to reuse a sndbuf or rmb description slot for a certain * buffer size; if not available, return NULL */ static struct smc_buf_desc *smc_buf_get_slot(struct rw_semaphore *lock, struct list_head *buf_list) { struct smc_buf_desc *buf_slot; down_read(lock); list_for_each_entry(buf_slot, buf_list, list) { if (cmpxchg(&buf_slot->used, 0, 1) == 0) { up_read(lock); return buf_slot; } } up_read(lock); return NULL; } /* one of the conditions for announcing a receiver's current window size is * that it "results in a minimum increase in the window size of 10% of the * receive buffer space" [RFC7609] */ static inline int smc_rmb_wnd_update_limit(int rmbe_size) { return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } /* map an buf to a link */ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link *lnk) { int rc, i, nents, offset, buf_size, size, access_flags; struct scatterlist *sg; void *buf; if (buf_desc->is_map_ib[lnk->link_idx]) return 0; if (buf_desc->is_vm) { buf = buf_desc->cpu_addr; buf_size = buf_desc->len; offset = offset_in_page(buf_desc->cpu_addr); nents = PAGE_ALIGN(buf_size + offset) / PAGE_SIZE; } else { nents = 1; } rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], nents, GFP_KERNEL); if (rc) return rc; if (buf_desc->is_vm) { /* virtually contiguous buffer */ for_each_sg(buf_desc->sgt[lnk->link_idx].sgl, sg, nents, i) { size = min_t(int, PAGE_SIZE - offset, buf_size); sg_set_page(sg, vmalloc_to_page(buf), size, offset); buf += size; buf_size -= size; offset = 0; } } else { /* physically 
contiguous buffer */ sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, buf_desc->cpu_addr, buf_desc->len); } /* map sg table to DMA address */ rc = smc_ib_buf_map_sg(lnk, buf_desc, is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); /* SMC protocol depends on mapping to one DMA address only */ if (rc != nents) { rc = -EAGAIN; goto free_table; } buf_desc->is_dma_need_sync |= smc_ib_is_sg_need_sync(lnk, buf_desc) << lnk->link_idx; if (is_rmb || buf_desc->is_vm) { /* create a new memory region for the RMB or vzalloced sndbuf */ access_flags = is_rmb ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_LOCAL_WRITE; rc = smc_ib_get_memory_region(lnk->roce_pd, access_flags, buf_desc, lnk->link_idx); if (rc) goto buf_unmap; smc_ib_sync_sg_for_device(lnk, buf_desc, is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } buf_desc->is_map_ib[lnk->link_idx] = true; return 0; buf_unmap: smc_ib_buf_unmap_sg(lnk, buf_desc, is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); free_table: sg_free_table(&buf_desc->sgt[lnk->link_idx]); return rc; } /* register a new buf on IB device, rmb or vzalloced sndbuf * must be called under lgr->llc_conf_mutex lock */ int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *buf_desc) { if (list_empty(&link->lgr->list)) return -ENOLINK; if (!buf_desc->is_reg_mr[link->link_idx]) { /* register memory region for new buf */ if (buf_desc->is_vm) buf_desc->mr[link->link_idx]->iova = (uintptr_t)buf_desc->cpu_addr; if (smc_wr_reg_send(link, buf_desc->mr[link->link_idx])) { buf_desc->is_reg_err = true; return -EFAULT; } buf_desc->is_reg_mr[link->link_idx] = true; } return 0; } static int _smcr_buf_map_lgr(struct smc_link *lnk, struct rw_semaphore *lock, struct list_head *lst, bool is_rmb) { struct smc_buf_desc *buf_desc, *bf; int rc = 0; down_write(lock); list_for_each_entry_safe(buf_desc, bf, lst, list) { if (!buf_desc->used) continue; rc = smcr_buf_map_link(buf_desc, is_rmb, lnk); if (rc) goto out; } out: up_write(lock); return rc; } /* map all used buffers of lgr for a new link */ int smcr_buf_map_lgr(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; int i, rc = 0; for (i = 0; i < SMC_RMBE_SIZES; i++) { rc = _smcr_buf_map_lgr(lnk, &lgr->rmbs_lock, &lgr->rmbs[i], true); if (rc) return rc; rc = _smcr_buf_map_lgr(lnk, &lgr->sndbufs_lock, &lgr->sndbufs[i], false); if (rc) return rc; } return 0; } /* register all used buffers of lgr for a new link, * must be called under lgr->llc_conf_mutex lock */ int smcr_buf_reg_lgr(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; struct smc_buf_desc *buf_desc, *bf; int i, rc = 0; /* reg all RMBs for a new link */ down_write(&lgr->rmbs_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) { if (!buf_desc->used) continue; rc = smcr_link_reg_buf(lnk, buf_desc); if (rc) { up_write(&lgr->rmbs_lock); return rc; } } } up_write(&lgr->rmbs_lock); if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) return rc; /* reg all vzalloced sndbufs for a new link */ down_write(&lgr->sndbufs_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) { if (!buf_desc->used || !buf_desc->is_vm) continue; rc = smcr_link_reg_buf(lnk, buf_desc); if (rc) { up_write(&lgr->sndbufs_lock); return rc; } } } up_write(&lgr->sndbufs_lock); return rc; } static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, int bufsize) { struct smc_buf_desc *buf_desc; /* try to alloc a new buffer */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); if (!buf_desc) return 
ERR_PTR(-ENOMEM); switch (lgr->buf_type) { case SMCR_PHYS_CONT_BUFS: case SMCR_MIXED_BUFS: buf_desc->order = get_order(bufsize); buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC | __GFP_COMP | __GFP_NORETRY | __GFP_ZERO, buf_desc->order); if (buf_desc->pages) { buf_desc->cpu_addr = (void *)page_address(buf_desc->pages); buf_desc->len = bufsize; buf_desc->is_vm = false; break; } if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) goto out; fallthrough; // try virtually contiguous buf case SMCR_VIRT_CONT_BUFS: buf_desc->order = get_order(bufsize); buf_desc->cpu_addr = vzalloc(PAGE_SIZE << buf_desc->order); if (!buf_desc->cpu_addr) goto out; buf_desc->pages = NULL; buf_desc->len = bufsize; buf_desc->is_vm = true; break; } return buf_desc; out: kfree(buf_desc); return ERR_PTR(-EAGAIN); } /* map buf_desc on all usable links, * unused buffers stay mapped as long as the link is up */ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, struct smc_buf_desc *buf_desc, bool is_rmb) { int i, rc = 0, cnt = 0; /* protect against parallel link reconfiguration */ down_read(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; if (!smc_link_usable(lnk)) continue; if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) { rc = -ENOMEM; goto out; } cnt++; } out: up_read(&lgr->llc_conf_mutex); if (!rc && !cnt) rc = -EINVAL; return rc; } static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, bool is_dmb, int bufsize) { struct smc_buf_desc *buf_desc; int rc; /* try to alloc a new DMB */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); if (!buf_desc) return ERR_PTR(-ENOMEM); if (is_dmb) { rc = smc_ism_register_dmb(lgr, bufsize, buf_desc); if (rc) { kfree(buf_desc); if (rc == -ENOMEM) return ERR_PTR(-EAGAIN); if (rc == -ENOSPC) return ERR_PTR(-ENOSPC); return ERR_PTR(-EIO); } buf_desc->pages = virt_to_page(buf_desc->cpu_addr); /* CDC header stored in buf. 
So, pretend it was smaller */ buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg); } else { buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC); if (!buf_desc->cpu_addr) { kfree(buf_desc); return ERR_PTR(-EAGAIN); } buf_desc->len = bufsize; } return buf_desc; } static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) { struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; int bufsize, bufsize_comp; struct rw_semaphore *lock; /* lock buffer list */ bool is_dgraded = false; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ bufsize = smc->sk.sk_rcvbuf / 2; else /* use socket send buffer size (w/o overhead) as start value */ bufsize = smc->sk.sk_sndbuf / 2; for (bufsize_comp = smc_compress_bufsize(bufsize, is_smcd, is_rmb); bufsize_comp >= 0; bufsize_comp--) { if (is_rmb) { lock = &lgr->rmbs_lock; buf_list = &lgr->rmbs[bufsize_comp]; } else { lock = &lgr->sndbufs_lock; buf_list = &lgr->sndbufs[bufsize_comp]; } bufsize = smc_uncompress_bufsize(bufsize_comp); /* check for reusable slot in the link group */ buf_desc = smc_buf_get_slot(lock, buf_list); if (buf_desc) { buf_desc->is_dma_need_sync = 0; SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, true, bufsize); SMC_STAT_BUF_REUSE(smc, is_smcd, is_rmb); break; /* found reusable slot */ } if (is_smcd) buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize); else buf_desc = smcr_new_buf_create(lgr, bufsize); if (PTR_ERR(buf_desc) == -ENOMEM) break; if (IS_ERR(buf_desc)) { if (!is_dgraded) { is_dgraded = true; SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rmb); } continue; } SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb); SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, true, bufsize); buf_desc->used = 1; down_write(lock); smc_lgr_buf_list_add(lgr, is_rmb, buf_list, buf_desc); up_write(lock); break; /* found */ } if (IS_ERR(buf_desc)) return PTR_ERR(buf_desc); if (!is_smcd) { if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { smcr_buf_unuse(buf_desc, is_rmb, lgr); return -ENOMEM; } } if (is_rmb) { conn->rmb_desc = buf_desc; conn->rmbe_size_comp = bufsize_comp; smc->sk.sk_rcvbuf = bufsize * 2; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(buf_desc->len); if (is_smcd) smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { conn->sndbuf_desc = buf_desc; smc->sk.sk_sndbuf = bufsize * 2; atomic_set(&conn->sndbuf_space, bufsize); } return 0; } void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { if (!conn->sndbuf_desc->is_dma_need_sync) return; if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd || !smc_link_active(conn->lnk)) return; smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { int i; if (!conn->rmb_desc->is_dma_need_sync) return; if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&conn->lgr->lnk[i])) continue; smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc, DMA_FROM_DEVICE); } } /* create the send and receive buffer for an SMC socket; * receive buffers are called RMBs; * (even though the SMC protocol allows more than one RMB-element per RMB, * the Linux implementation uses just one RMB-element per RMB, i.e. 
uses an * extra RMB for every connection in a link group */ int smc_buf_create(struct smc_sock *smc, bool is_smcd) { int rc; /* create send buffer */ if (is_smcd && smc_ism_support_dmb_nocopy(smc->conn.lgr->smcd)) goto create_rmb; rc = __smc_buf_create(smc, is_smcd, false); if (rc) return rc; create_rmb: /* create rmb */ rc = __smc_buf_create(smc, is_smcd, true); if (rc && smc->conn.sndbuf_desc) { down_write(&smc->conn.lgr->sndbufs_lock); smc_lgr_buf_list_del(smc->conn.lgr, false, smc->conn.sndbuf_desc); up_write(&smc->conn.lgr->sndbufs_lock); smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc); smc->conn.sndbuf_desc = NULL; } return rc; } int smcd_buf_attach(struct smc_sock *smc) { struct smc_connection *conn = &smc->conn; struct smcd_dev *smcd = conn->lgr->smcd; u64 peer_token = conn->peer_token; struct smc_buf_desc *buf_desc; int rc; buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); if (!buf_desc) return -ENOMEM; /* The ghost sndbuf_desc describes the same memory region as * peer RMB. Its lifecycle is consistent with the connection's * and it will be freed with the connections instead of the * link group. */ rc = smc_ism_attach_dmb(smcd, peer_token, buf_desc); if (rc) goto free; smc->sk.sk_sndbuf = buf_desc->len; buf_desc->cpu_addr = (u8 *)buf_desc->cpu_addr + sizeof(struct smcd_cdc_msg); buf_desc->len -= sizeof(struct smcd_cdc_msg); conn->sndbuf_desc = buf_desc; conn->sndbuf_desc->used = 1; atomic_set(&conn->sndbuf_space, conn->sndbuf_desc->len); return 0; free: kfree(buf_desc); return rc; } static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) { int i; for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) { if (!test_and_set_bit(i, lgr->rtokens_used_mask)) return i; } return -ENOSPC; } static int smc_rtoken_find_by_link(struct smc_link_group *lgr, int lnk_idx, u32 rkey) { int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if (test_bit(i, lgr->rtokens_used_mask) && lgr->rtokens[i][lnk_idx].rkey == rkey) return i; } return -ENOENT; } /* set rtoken for a new link to an existing rmb */ void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey) { int rtok_idx; rtok_idx = smc_rtoken_find_by_link(lgr, link_idx, ntohl(nw_rkey_known)); if (rtok_idx == -ENOENT) return; lgr->rtokens[rtok_idx][link_idx_new].rkey = ntohl(nw_rkey); lgr->rtokens[rtok_idx][link_idx_new].dma_addr = be64_to_cpu(nw_vaddr); } /* set rtoken for a new link whose link_id is given */ void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, __be64 nw_vaddr, __be32 nw_rkey) { u64 dma_addr = be64_to_cpu(nw_vaddr); u32 rkey = ntohl(nw_rkey); bool found = false; int link_idx; for (link_idx = 0; link_idx < SMC_LINKS_PER_LGR_MAX; link_idx++) { if (lgr->lnk[link_idx].link_id == link_id) { found = true; break; } } if (!found) return; lgr->rtokens[rtok_idx][link_idx].rkey = rkey; lgr->rtokens[rtok_idx][link_idx].dma_addr = dma_addr; } /* add a new rtoken from peer */ int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) { struct smc_link_group *lgr = smc_get_lgr(lnk); u64 dma_addr = be64_to_cpu(nw_vaddr); u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && lgr->rtokens[i][lnk->link_idx].dma_addr == dma_addr && test_bit(i, lgr->rtokens_used_mask)) { /* already in list */ return i; } } i = smc_rmb_reserve_rtoken_idx(lgr); if (i < 0) return i; lgr->rtokens[i][lnk->link_idx].rkey = rkey; 
lgr->rtokens[i][lnk->link_idx].dma_addr = dma_addr; return i; } /* delete an rtoken from all links */ int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) { struct smc_link_group *lgr = smc_get_lgr(lnk); u32 rkey = ntohl(nw_rkey); int i, j; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && test_bit(i, lgr->rtokens_used_mask)) { for (j = 0; j < SMC_LINKS_PER_LGR_MAX; j++) { lgr->rtokens[i][j].rkey = 0; lgr->rtokens[i][j].dma_addr = 0; } clear_bit(i, lgr->rtokens_used_mask); return 0; } } return -ENOENT; } /* save rkey and dma_addr received from peer during clc handshake */ int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *lnk, struct smc_clc_msg_accept_confirm *clc) { conn->rtoken_idx = smc_rtoken_add(lnk, clc->r0.rmb_dma_addr, clc->r0.rmb_rkey); if (conn->rtoken_idx < 0) return conn->rtoken_idx; return 0; } static void smc_core_going_away(void) { struct smc_ib_device *smcibdev; struct smcd_dev *smcd; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(smcibdev, &smc_ib_devices.list, list) { int i; for (i = 0; i < SMC_MAX_PORTS; i++) set_bit(i, smcibdev->ports_going_away); } mutex_unlock(&smc_ib_devices.mutex); mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd, &smcd_dev_list.list, list) { smcd->going_away = 1; } mutex_unlock(&smcd_dev_list.mutex); } /* Clean up all SMC link groups */ static void smc_lgrs_shutdown(void) { struct smcd_dev *smcd; smc_core_going_away(); smc_smcr_terminate_all(NULL); mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd, &smcd_dev_list.list, list) smc_smcd_terminate_all(smcd); mutex_unlock(&smcd_dev_list.mutex); } static int smc_core_reboot_event(struct notifier_block *this, unsigned long event, void *ptr) { smc_lgrs_shutdown(); smc_ib_unregister_client(); smc_ism_exit(); return 0; } static struct notifier_block smc_reboot_notifier = { .notifier_call = smc_core_reboot_event, }; int __init smc_core_init(void) { return register_reboot_notifier(&smc_reboot_notifier); } /* Called (from smc_exit) when module is removed */ void smc_core_exit(void) { unregister_reboot_notifier(&smc_reboot_notifier); smc_lgrs_shutdown(); }
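The RMB/DMB sizing above uses a compressed power-of-two notation (smc_compress_bufsize()/smc_uncompress_bufsize()): a size is expressed as a 16 KB multiple rounded up to the next power of two, so the socket gets at least the sndbuf/rcvbuf it asked for. A minimal, self-contained userspace sketch of that arithmetic follows; the constants mirror the kernel values, but the helpers are illustrative, not the kernel implementation.

/* illustrative round-trip of the compressed SMC buffer-size notation */
#include <stdio.h>
#include <stdint.h>

#define BUF_MIN_SIZE	(16 * 1024)	/* 16 KB minimum, as in SMC_BUF_MIN_SIZE */
#define RMBE_SIZES	15		/* 0 -> 16KB .. 15 -> 512MB (SMC-R limit) */

static uint8_t compress_bufsize(int size)
{
	uint8_t compressed = 0;

	if (size <= BUF_MIN_SIZE)
		return 0;
	size = (size - 1) >> 14;	/* convert to 16K multiples */
	while (size >>= 1)		/* floor(log2(size)) */
		compressed++;
	compressed++;			/* round up to next power of two */
	return compressed > RMBE_SIZES ? RMBE_SIZES : compressed;
}

static int uncompress_bufsize(uint8_t compressed)
{
	return 1 << (compressed + 14);
}

int main(void)
{
	int requested[] = { 8192, 16384, 20000, 65536, 100000 };

	for (unsigned int i = 0; i < sizeof(requested) / sizeof(requested[0]); i++) {
		uint8_t code = compress_bufsize(requested[i]);

		printf("requested %7d -> code %2u -> granted %d\n",
		       requested[i], (unsigned int)code, uncompress_bufsize(code));
	}
	return 0;
}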
219 215 554 100 26 26 8 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_NULLS_H #define _LINUX_RCULIST_NULLS_H #ifdef __KERNEL__ /* * RCU-protected list version */ #include <linux/list_nulls.h> #include <linux/rcupdate.h> /** * hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization * @n: the element to delete from the hash list. * * Note: hlist_nulls_unhashed() on the node return true after this. It is * useful for RCU based read lockfree traversal if the writer side * must know if the list entry is still hashed or already unhashed. * * In particular, it means that we can not poison the forward pointers * that may still be used for walking the hash list and we can only * zero the pprev pointer so list_unhashed() will return true after * this. * * The caller must take whatever precautions are necessary (such as * holding appropriate locks) to avoid racing with another * list-mutation primitive, such as hlist_nulls_add_head_rcu() or * hlist_nulls_del_rcu(), running on this same list. However, it is * perfectly legal to run concurrently with the _rcu list-traversal * primitives, such as hlist_nulls_for_each_entry_rcu(). */ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n) { if (!hlist_nulls_unhashed(n)) { __hlist_nulls_del(n); WRITE_ONCE(n->pprev, NULL); } } /** * hlist_nulls_first_rcu - returns the first element of the hash list. * @head: the head of the list. */ #define hlist_nulls_first_rcu(head) \ (*((struct hlist_nulls_node __rcu __force **)&(head)->first)) /** * hlist_nulls_next_rcu - returns the element of the list after @node. * @node: element of the list. */ #define hlist_nulls_next_rcu(node) \ (*((struct hlist_nulls_node __rcu __force **)&(node)->next)) /** * hlist_nulls_pprev_rcu - returns the dereferenced pprev of @node. * @node: element of the list. */ #define hlist_nulls_pprev_rcu(node) \ (*((struct hlist_nulls_node __rcu __force **)(node)->pprev)) /** * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: hlist_nulls_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() * or hlist_nulls_del_rcu(), running on this same list. 
* However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_nulls_for_each_entry(). */ static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n) { __hlist_nulls_del(n); WRITE_ONCE(n->pprev, LIST_POISON2); } /** * hlist_nulls_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_nulls, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() * or hlist_nulls_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, struct hlist_nulls_head *h) { struct hlist_nulls_node *first = h->first; WRITE_ONCE(n->next, first); WRITE_ONCE(n->pprev, &h->first); rcu_assign_pointer(hlist_nulls_first_rcu(h), n); if (!is_a_nulls(first)) WRITE_ONCE(first->pprev, &n->next); } /** * hlist_nulls_add_tail_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_nulls, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() * or hlist_nulls_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, struct hlist_nulls_head *h) { struct hlist_nulls_node *i, *last = NULL; /* Note: write side code, so rcu accessors are not needed. */ for (i = h->first; !is_a_nulls(i); i = i->next) last = i; if (last) { WRITE_ONCE(n->next, last->next); WRITE_ONCE(n->pprev, &last->next); rcu_assign_pointer(hlist_nulls_next_rcu(last), n); } else { hlist_nulls_add_head_rcu(n, h); } } /* after that hlist_nulls_del will work */ static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n) { WRITE_ONCE(n->pprev, &n->next); WRITE_ONCE(n->next, (struct hlist_nulls_node *)NULLS_MARKER(NULL)); } /** * hlist_nulls_replace_rcu - replace an old entry by a new one * @old: the element to be replaced * @new: the new element to insert * * Description: * Replace the old entry with the new one in a RCU-protected hlist_nulls, while * permitting racing traversals. * * The caller must take whatever precautions are necessary (such as holding * appropriate locks) to avoid racing with another list-mutation primitive, such * as hlist_nulls_add_head_rcu() or hlist_nulls_del_rcu(), running on this same * list. However, it is perfectly legal to run concurrently with the _rcu * list-traversal primitives, such as hlist_nulls_for_each_entry_rcu(). 
*/ static inline void hlist_nulls_replace_rcu(struct hlist_nulls_node *old, struct hlist_nulls_node *new) { struct hlist_nulls_node *next = old->next; WRITE_ONCE(new->next, next); WRITE_ONCE(new->pprev, old->pprev); rcu_assign_pointer(hlist_nulls_pprev_rcu(new), new); if (!is_a_nulls(next)) WRITE_ONCE(next->pprev, &new->next); } /** * hlist_nulls_replace_init_rcu - replace an old entry by a new one and * initialize the old * @old: the element to be replaced * @new: the new element to insert * * Description: * Replace the old entry with the new one in a RCU-protected hlist_nulls, while * permitting racing traversals, and reinitialize the old entry. * * Note: @old must be hashed. * * The caller must take whatever precautions are necessary (such as holding * appropriate locks) to avoid racing with another list-mutation primitive, such * as hlist_nulls_add_head_rcu() or hlist_nulls_del_rcu(), running on this same * list. However, it is perfectly legal to run concurrently with the _rcu * list-traversal primitives, such as hlist_nulls_for_each_entry_rcu(). */ static inline void hlist_nulls_replace_init_rcu(struct hlist_nulls_node *old, struct hlist_nulls_node *new) { hlist_nulls_replace_rcu(old, new); WRITE_ONCE(old->pprev, NULL); } /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_nulls_node to use as a loop cursor. * @head: the head of the list. * @member: the name of the hlist_nulls_node within the struct. * * The barrier() is needed to make sure compiler doesn't cache first element [1], * as this loop can be restarted [2] * [1] Documentation/memory-barriers.txt around line 1533 * [2] Documentation/RCU/rculist_nulls.rst around line 146 */ #define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ for (({barrier();}), \ pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos))) /** * hlist_nulls_for_each_entry_safe - * iterate over list of given type safe against removal of list entry * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_nulls_node to use as a loop cursor. * @head: the head of the list. * @member: the name of the hlist_nulls_node within the struct. */ #define hlist_nulls_for_each_entry_safe(tpos, pos, head, member) \ for (({barrier();}), \ pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); \ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)); 1; });) #endif #endif
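The hlist_nulls helpers above terminate a chain not with NULL but with a "nulls" marker: an odd pointer value whose payload typically encodes the hash bucket number, so a lockless reader that reaches an end marker for a different bucket than it started in knows the entry was moved (for example under SLAB_TYPESAFE_BY_RCU reuse) and restarts the lookup. A small userspace sketch of that encoding, mirroring NULLS_MARKER()/is_a_nulls(); the helper names are illustrative.

#include <stdio.h>
#include <stdint.h>

/* same encoding as NULLS_MARKER(): lowest bit set, payload in the upper bits */
static inline void *nulls_marker(unsigned long bucket)
{
	return (void *)(uintptr_t)((bucket << 1) | 1UL);
}

static inline int is_a_nulls_ptr(const void *ptr)
{
	return (uintptr_t)ptr & 1UL;
}

static inline unsigned long nulls_value_of(const void *ptr)
{
	return (uintptr_t)ptr >> 1;
}

int main(void)
{
	void *end = nulls_marker(7);	/* terminator of hash bucket 7 */

	/* a traversal stops when it hits a nulls pointer; the reader then
	 * compares the encoded bucket with the one it started in and
	 * restarts the lookup if they differ
	 */
	printf("is_a_nulls=%d bucket=%lu\n",
	       is_a_nulls_ptr(end), nulls_value_of(end));
	return 0;
}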
177 55 177 12 3227 18 24 1450 250 1668 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 /** * css_get - obtain a reference on the specified css * @css: target css * * The caller must already have a reference. */ CGROUP_REF_FN_ATTRS void css_get(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) percpu_ref_get(&css->refcnt); } CGROUP_REF_EXPORT(css_get) /** * css_get_many - obtain references on the specified css * @css: target css * @n: number of references to get * * The caller must already have a reference. */ CGROUP_REF_FN_ATTRS void css_get_many(struct cgroup_subsys_state *css, unsigned int n) { if (!(css->flags & CSS_NO_REF)) percpu_ref_get_many(&css->refcnt, n); } CGROUP_REF_EXPORT(css_get_many) /** * css_tryget - try to obtain a reference on the specified css * @css: target css * * Obtain a reference on @css unless it already has reached zero and is * being released. This function doesn't care whether @css is on or * offline. The caller naturally needs to ensure that @css is accessible * but doesn't have to be holding a reference on it - IOW, RCU protected * access is good enough for this function. Returns %true if a reference * count was successfully obtained; %false otherwise. */ CGROUP_REF_FN_ATTRS bool css_tryget(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) return percpu_ref_tryget(&css->refcnt); return true; } CGROUP_REF_EXPORT(css_tryget) /** * css_tryget_online - try to obtain a reference on the specified css if online * @css: target css * * Obtain a reference on @css if it's online. The caller naturally needs * to ensure that @css is accessible but doesn't have to be holding a * reference on it - IOW, RCU protected access is good enough for this * function. Returns %true if a reference count was successfully obtained; * %false otherwise. */ CGROUP_REF_FN_ATTRS bool css_tryget_online(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) return percpu_ref_tryget_live(&css->refcnt); return true; } CGROUP_REF_EXPORT(css_tryget_online) /** * css_put - put a css reference * @css: target css * * Put a reference obtained via css_get() and css_tryget_online(). */ CGROUP_REF_FN_ATTRS void css_put(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) percpu_ref_put(&css->refcnt); } CGROUP_REF_EXPORT(css_put) /** * css_put_many - put css references * @css: target css * @n: number of references to put * * Put references obtained via css_get() and css_tryget_online(). */ CGROUP_REF_FN_ATTRS void css_put_many(struct cgroup_subsys_state *css, unsigned int n) { if (!(css->flags & CSS_NO_REF)) percpu_ref_put_many(&css->refcnt, n); } CGROUP_REF_EXPORT(css_put_many)
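A hedged kernel-context sketch (not part of this file) of how the reference helpers above are commonly used: a css found under RCU is only safe to use beyond the read-side critical section once css_tryget_online() has succeeded, and the reference is later dropped with css_put(). The wrapper name is illustrative.

/* illustrative only: pin a css looked up under RCU, or return NULL */
static struct cgroup_subsys_state *
pin_css_if_online(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys_state *pinned = NULL;

	rcu_read_lock();
	if (css && css_tryget_online(css))
		pinned = css;	/* caller now owns a reference; release with css_put() */
	rcu_read_unlock();

	return pinned;
}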
30 396 225 10558 52 14208 1144 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM sock #if !defined(_TRACE_SOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SOCK_H #include <net/sock.h> #include <net/ipv6.h> #include <linux/tracepoint.h> #include <linux/ipv6.h> #include <linux/tcp.h> #include <trace/events/net_probe_common.h> #define family_names \ EM(AF_INET) \ EMe(AF_INET6) /* The protocol traced by inet_sock_set_state */ #define inet_protocol_names \ EM(IPPROTO_TCP) \ EM(IPPROTO_SCTP) \ EMe(IPPROTO_MPTCP) #define tcp_state_names \ EM(TCP_ESTABLISHED) \ EM(TCP_SYN_SENT) \ EM(TCP_SYN_RECV) \ EM(TCP_FIN_WAIT1) \ EM(TCP_FIN_WAIT2) \ EM(TCP_TIME_WAIT) \ EM(TCP_CLOSE) \ EM(TCP_CLOSE_WAIT) \ EM(TCP_LAST_ACK) \ EM(TCP_LISTEN) \ EM(TCP_CLOSING) \ EMe(TCP_NEW_SYN_RECV) #define skmem_kind_names \ EM(SK_MEM_SEND) \ EMe(SK_MEM_RECV) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a) TRACE_DEFINE_ENUM(a); #define EMe(a) TRACE_DEFINE_ENUM(a); family_names inet_protocol_names tcp_state_names skmem_kind_names #undef EM #undef EMe #define EM(a) { a, #a }, #define EMe(a) { a, #a } #define show_family_name(val) \ __print_symbolic(val, family_names) #define show_inet_protocol_name(val) \ __print_symbolic(val, inet_protocol_names) #define show_tcp_state_name(val) \ __print_symbolic(val, tcp_state_names) #define show_skmem_kind_names(val) \ __print_symbolic(val, skmem_kind_names) TRACE_EVENT(sock_rcvqueue_full, TP_PROTO(struct sock *sk, struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(int, rmem_alloc) __field(unsigned int, truesize) __field(int, sk_rcvbuf) ), TP_fast_assign( __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->truesize = skb->truesize; __entry->sk_rcvbuf = READ_ONCE(sk->sk_rcvbuf); ), TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d", __entry->rmem_alloc, __entry->truesize, __entry->sk_rcvbuf) ); TRACE_EVENT(sock_exceed_buf_limit, TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind), TP_ARGS(sk, prot, allocated, kind), TP_STRUCT__entry( __array(char, name, 32) __array(long, sysctl_mem, 3) __field(long, allocated) __field(int, sysctl_rmem) __field(int, rmem_alloc) __field(int, sysctl_wmem) __field(int, wmem_alloc) __field(int, wmem_queued) __field(int, kind) ), TP_fast_assign( strscpy(__entry->name, prot->name, 32); __entry->sysctl_mem[0] = 
READ_ONCE(prot->sysctl_mem[0]); __entry->sysctl_mem[1] = READ_ONCE(prot->sysctl_mem[1]); __entry->sysctl_mem[2] = READ_ONCE(prot->sysctl_mem[2]); __entry->allocated = allocated; __entry->sysctl_rmem = sk_get_rmem0(sk, prot); __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->sysctl_wmem = sk_get_wmem0(sk, prot); __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc); __entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued); __entry->kind = kind; ), TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s", __entry->name, __entry->sysctl_mem[0], __entry->sysctl_mem[1], __entry->sysctl_mem[2], __entry->allocated, __entry->sysctl_rmem, __entry->rmem_alloc, __entry->sysctl_wmem, __entry->wmem_alloc, __entry->wmem_queued, show_skmem_kind_names(__entry->kind) ) ); TRACE_EVENT(inet_sock_set_state, TP_PROTO(const struct sock *sk, const int oldstate, const int newstate), TP_ARGS(sk, oldstate, newstate), TP_STRUCT__entry( __field(const void *, skaddr) __field(int, oldstate) __field(int, newstate) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skaddr = sk; __entry->oldstate = oldstate; __entry->newstate = newstate; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->oldstate), show_tcp_state_name(__entry->newstate)) ); TRACE_EVENT(inet_sk_error_report, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(int, error) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->error = sk->sk_err; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c error=%d", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->error) ); TRACE_EVENT(sk_data_ready, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, family) __field(__u16, protocol) __field(unsigned long, ip) ), TP_fast_assign( __entry->skaddr = sk; 
__entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->ip = _RET_IP_; ), TP_printk("family=%u protocol=%u func=%ps", __entry->family, __entry->protocol, (void *)__entry->ip) ); /* * sock send/recv msg length */ DECLARE_EVENT_CLASS(sock_msg_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags), TP_STRUCT__entry( __field(void *, sk) __field(__u16, family) __field(__u16, protocol) __field(int, ret) __field(int, flags) ), TP_fast_assign( __entry->sk = sk; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->ret = ret; __entry->flags = flags; ), TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x", __entry->sk, show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), !(__entry->flags & MSG_PEEK) ? (__entry->ret > 0 ? __entry->ret : 0) : 0, __entry->ret < 0 ? __entry->ret : 0, __entry->flags) ); DEFINE_EVENT(sock_msg_length, sock_send_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags) ); DEFINE_EVENT(sock_msg_length, sock_recv_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags) ); #endif /* _TRACE_SOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
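Each TRACE_EVENT() above generates a trace_<name>() inline that the networking code calls at the instrumented spot; the event fires only when the tracepoint is enabled. A hedged sketch of such a call site for sock_rcvqueue_full (the surrounding function is illustrative, not kernel code):

/* illustrative call site: report a socket whose receive queue overflowed */
static void report_rcvqueue_full(struct sock *sk, struct sk_buff *skb)
{
	if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
		trace_sock_rcvqueue_full(sk, skb);	/* no-op unless the tracepoint is enabled */
}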
8 2754 333 2449 389 84 329 152 151 5 2216 2207 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 // SPDX-License-Identifier: GPL-2.0 /* * Out-of-line refcount functions. */ #include <linux/mutex.h> #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/bug.h> #define REFCOUNT_WARN(str) WARN_ONCE(1, "refcount_t: " str ".\n") void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) { refcount_set(r, REFCOUNT_SATURATED); switch (t) { case REFCOUNT_ADD_NOT_ZERO_OVF: REFCOUNT_WARN("saturated; leaking memory"); break; case REFCOUNT_ADD_OVF: REFCOUNT_WARN("saturated; leaking memory"); break; case REFCOUNT_ADD_UAF: REFCOUNT_WARN("addition on 0; use-after-free"); break; case REFCOUNT_SUB_UAF: REFCOUNT_WARN("underflow; use-after-free"); break; case REFCOUNT_DEC_LEAK: REFCOUNT_WARN("decrement hit 0; leaking memory"); break; default: REFCOUNT_WARN("unknown saturation event!?"); } } EXPORT_SYMBOL(refcount_warn_saturate); /** * refcount_dec_if_one - decrement a refcount if it is 1 * @r: the refcount * * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the * success thereof. * * Like all decrement operations, it provides release memory order and provides * a control dependency. * * It can be used like a try-delete operator; this explicit case is provided * and not cmpxchg in generic, because that would allow implementing unsafe * operations. * * Return: true if the resulting refcount is 0, false otherwise */ bool refcount_dec_if_one(refcount_t *r) { int val = 1; return atomic_try_cmpxchg_release(&r->refs, &val, 0); } EXPORT_SYMBOL(refcount_dec_if_one); /** * refcount_dec_not_one - decrement a refcount if it is not 1 * @r: the refcount * * No atomic_t counterpart, it decrements unless the value is 1, in which case * it will return false. * * Was often done like: atomic_add_unless(&var, -1, 1) * * Return: true if the decrement operation was successful, false otherwise */ bool refcount_dec_not_one(refcount_t *r) { unsigned int new, val = atomic_read(&r->refs); do { if (unlikely(val == REFCOUNT_SATURATED)) return true; if (val == 1) return false; new = val - 1; if (new > val) { WARN_ONCE(new > val, "refcount_t: underflow; use-after-free.\n"); return true; } } while (!atomic_try_cmpxchg_release(&r->refs, &val, new)); return true; } EXPORT_SYMBOL(refcount_dec_not_one); /** * refcount_dec_and_mutex_lock - return holding mutex if able to decrement * refcount to 0 * @r: the refcount * @lock: the mutex to be locked * * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail * to decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. 
* * Return: true and hold mutex if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) { if (refcount_dec_not_one(r)) return false; mutex_lock(lock); if (!refcount_dec_and_test(r)) { mutex_unlock(lock); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_mutex_lock); /** * refcount_dec_and_lock - return holding spinlock if able to decrement * refcount to 0 * @r: the refcount * @lock: the spinlock to be locked * * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. * * Return: true and hold spinlock if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) { if (refcount_dec_not_one(r)) return false; spin_lock(lock); if (!refcount_dec_and_test(r)) { spin_unlock(lock); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_lock); /** * refcount_dec_and_lock_irqsave - return holding spinlock with disabled * interrupts if able to decrement refcount to 0 * @r: the refcount * @lock: the spinlock to be locked * @flags: saved IRQ-flags if the is acquired * * Same as refcount_dec_and_lock() above except that the spinlock is acquired * with disabled interrupts. * * Return: true and hold spinlock if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock, unsigned long *flags) { if (refcount_dec_not_one(r)) return false; spin_lock_irqsave(lock, *flags); if (!refcount_dec_and_test(r)) { spin_unlock_irqrestore(lock, *flags); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_lock_irqsave);
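A hedged usage sketch (not from this file) of the refcount_dec_and_lock() pattern documented above: the spinlock is taken only when this put may drop the count to zero, and the object is unlinked and freed while the lock is held so a concurrent lookup cannot take a new reference on a dying object. Struct and function names are illustrative.

/* illustrative only: last-put-under-lock teardown of a cached object */
struct cached_obj {
	refcount_t	refcnt;
	struct list_head node;		/* linked into a cache protected by cache_lock */
};

static void cached_obj_put(struct cached_obj *obj, spinlock_t *cache_lock)
{
	if (!refcount_dec_and_lock(&obj->refcnt, cache_lock))
		return;			/* not the last reference, lock not taken */

	list_del(&obj->node);		/* unlink while still holding cache_lock */
	spin_unlock(cache_lock);
	kfree(obj);
}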
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 // SPDX-License-Identifier: GPL-2.0-only /* * IPV4 GSO/GRO offload support * Linux INET implementation * * Copyright (C) 2016 secunet Security Networks AG * Author: Steffen Klassert <steffen.klassert@secunet.com> * * ESP GRO support */ #include <linux/skbuff.h> #include <linux/init.h> #include <net/protocol.h> #include <crypto/aead.h> #include <crypto/authenc.h> #include <linux/err.h> #include <linux/module.h> #include <net/gro.h> #include <net/gso.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/esp.h> #include <linux/scatterlist.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/udp.h> static struct sk_buff *esp4_gro_receive(struct list_head *head, struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; struct xfrm_state *x; int encap_type = 0; __be32 seq; __be32 spi; if (!pskb_pull(skb, offset)) return NULL; if (xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq) != 0) goto out; xo = xfrm_offload(skb); if (!xo || !(xo->flags & CRYPTO_DONE)) { struct sec_path *sp = secpath_set(skb); if (!sp) goto out; if (sp->len == XFRM_MAX_DEPTH) goto out_reset; x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark, (xfrm_address_t *)&ip_hdr(skb)->daddr, spi, IPPROTO_ESP, AF_INET); if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) { /* non-offload path will record the error and audit log */ xfrm_state_put(x); x = NULL; } if (!x) goto out_reset; skb->mark = xfrm_smark_get(skb->mark, x); sp->xvec[sp->len++] = x; sp->olen++; xo = xfrm_offload(skb); if (!xo) goto out_reset; } xo->flags |= XFRM_GRO; if (NAPI_GRO_CB(skb)->proto == IPPROTO_UDP) encap_type = UDP_ENCAP_ESPINUDP; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); XFRM_SPI_SKB_CB(skb)->seq = seq; /* We don't need to handle errors from xfrm_input, it does all * the error handling and frees the resources on error. 
*/ xfrm_input(skb, IPPROTO_ESP, spi, encap_type); return ERR_PTR(-EINPROGRESS); out_reset: secpath_reset(skb); out: skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 1; return NULL; } static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb) { struct ip_esp_hdr *esph; struct iphdr *iph = ip_hdr(skb); struct xfrm_offload *xo = xfrm_offload(skb); int proto = iph->protocol; skb_push(skb, -skb_network_offset(skb)); esph = ip_esp_hdr(skb); *skb_mac_header(skb) = IPPROTO_ESP; esph->spi = x->id.spi; esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); xo->proto = proto; } static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); __be16 type = inner_mode->family == AF_INET6 ? htons(ETH_P_IPV6) : htons(ETH_P_IP); return skb_eth_gso_segment(skb, features, type); } static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { const struct net_offload *ops; struct sk_buff *segs = ERR_PTR(-EINVAL); struct xfrm_offload *xo = xfrm_offload(skb); skb->transport_header += x->props.header_len; ops = rcu_dereference(inet_offloads[xo->proto]); if (likely(ops && ops->callbacks.gso_segment)) segs = ops->callbacks.gso_segment(skb, features); return segs; } static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { struct xfrm_offload *xo = xfrm_offload(skb); struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; u8 proto = xo->proto; skb->transport_header += x->props.header_len; if (x->sel.family != AF_INET6) { if (proto == IPPROTO_BEETPH) { struct ip_beet_phdr *ph = (struct ip_beet_phdr *)skb->data; skb->transport_header += ph->hdrlen * 8; proto = ph->nexthdr; } else { skb->transport_header -= IPV4_BEET_PHMAXLEN; } } else { __be16 frag; skb->transport_header += ipv6_skip_exthdr(skb, 0, &proto, &frag); if (proto == IPPROTO_TCP) skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; } if (proto == IPPROTO_IPV6) skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; __skb_pull(skb, skb_transport_offset(skb)); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) segs = ops->callbacks.gso_segment(skb, features); return segs; } static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { switch (x->outer_mode.encap) { case XFRM_MODE_TUNNEL: return xfrm4_tunnel_gso_segment(x, skb, features); case XFRM_MODE_TRANSPORT: return xfrm4_transport_gso_segment(x, skb, features); case XFRM_MODE_BEET: return xfrm4_beet_gso_segment(x, skb, features); } return ERR_PTR(-EOPNOTSUPP); } static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct xfrm_state *x; struct ip_esp_hdr *esph; struct crypto_aead *aead; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); struct sec_path *sp; if (!xo) return ERR_PTR(-EINVAL); if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) return ERR_PTR(-EINVAL); sp = skb_sec_path(skb); x = sp->xvec[sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); if (esph->spi != x->id.spi) return ERR_PTR(-EINVAL); if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) return ERR_PTR(-EINVAL); __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)); skb->encap_hdr_csum = 1; if ((!(skb->dev->gso_partial_features & 
NETIF_F_HW_ESP) && !(features & NETIF_F_HW_ESP)) || x->xso.dev != skb->dev) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC); else if (!(features & NETIF_F_HW_ESP_TX_CSUM) && !(skb->dev->gso_partial_features & NETIF_F_HW_ESP_TX_CSUM)) esp_features = features & ~(NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC); xo->flags |= XFRM_GSO_SEGMENT; return xfrm4_outer_mode_gso_segment(x, skb, esp_features); } static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb) { struct crypto_aead *aead = x->data; struct xfrm_offload *xo = xfrm_offload(skb); if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead))) return -EINVAL; if (!(xo->flags & CRYPTO_DONE)) skb->ip_summed = CHECKSUM_NONE; return esp_input_done2(skb, 0); } static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { int err; int alen; int blksize; struct xfrm_offload *xo; struct ip_esp_hdr *esph; struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; __u32 seq; int encap_type = 0; esp.inplace = true; xo = xfrm_offload(skb); if (!xo) return -EINVAL; if ((!(features & NETIF_F_HW_ESP) && !(skb->dev->gso_partial_features & NETIF_F_HW_ESP)) || x->xso.dev != skb->dev) { xo->flags |= CRYPTO_FALLBACK; hw_offload = false; } esp.proto = xo->proto; /* skb is pure payload to encrypt */ aead = x->data; alen = crypto_aead_authsize(aead); esp.tfclen = 0; /* XXX: Add support for tfc padding here. */ blksize = ALIGN(crypto_aead_blocksize(aead), 4); esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize); esp.plen = esp.clen - skb->len - esp.tfclen; esp.tailen = esp.tfclen + esp.plen + alen; esp.esph = ip_esp_hdr(skb); if (x->encap) encap_type = x->encap->encap_type; if (!hw_offload || !skb_is_gso(skb) || (hw_offload && encap_type == UDP_ENCAP_ESPINUDP)) { esp.nfrags = esp_output_head(x, skb, &esp); if (esp.nfrags < 0) return esp.nfrags; } seq = xo->seq.low; esph = esp.esph; esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { esph->seq_no = htonl(seq); if (!skb_is_gso(skb)) xo->seq.low++; else xo->seq.low += skb_shinfo(skb)->gso_segs; } if (xo->seq.low < seq) xo->seq.hi++; esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32)); if (hw_offload && encap_type == UDP_ENCAP_ESPINUDP) { /* In the XFRM stack, the encapsulation protocol is set to iphdr->protocol by * setting *skb_mac_header(skb) (see esp_output_udp_encap()) where skb->mac_header * points to iphdr->protocol (see xfrm4_tunnel_encap_add()). * However, in esp_xmit(), skb->mac_header doesn't point to iphdr->protocol. * Therefore, the protocol field needs to be corrected. 
*/ ip_hdr(skb)->protocol = IPPROTO_UDP; esph->seq_no = htonl(seq); } ip_hdr(skb)->tot_len = htons(skb->len); ip_send_check(ip_hdr(skb)); if (hw_offload) { if (!skb_ext_add(skb, SKB_EXT_SEC_PATH)) return -ENOMEM; xo = xfrm_offload(skb); if (!xo) return -EINVAL; xo->flags |= XFRM_XMIT; return 0; } err = esp_output_tail(x, skb, &esp); if (err) return err; secpath_reset(skb); if (skb_needs_linearize(skb, skb->dev->features) && __skb_linearize(skb)) return -ENOMEM; return 0; } static const struct net_offload esp4_offload = { .callbacks = { .gro_receive = esp4_gro_receive, .gso_segment = esp4_gso_segment, }, }; static const struct xfrm_type_offload esp_type_offload = { .owner = THIS_MODULE, .proto = IPPROTO_ESP, .input_tail = esp_input_tail, .xmit = esp_xmit, .encap = esp4_gso_encap, }; static int __init esp4_offload_init(void) { if (xfrm_register_type_offload(&esp_type_offload, AF_INET) < 0) { pr_info("%s: can't add xfrm type offload\n", __func__); return -EAGAIN; } return inet_add_offload(&esp4_offload, IPPROTO_ESP); } static void __exit esp4_offload_exit(void) { xfrm_unregister_type_offload(&esp_type_offload, AF_INET); inet_del_offload(&esp4_offload, IPPROTO_ESP); } module_init(esp4_offload_init); module_exit(esp4_offload_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET, XFRM_PROTO_ESP); MODULE_DESCRIPTION("IPV4 GSO/GRO offload support");
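/*
 * Editor's note: the helper below is an illustrative sketch, not part of the
 * file above. It mirrors how esp_xmit() maintains the 64-bit extended
 * sequence number from the 32-bit halves in xo->seq: the low word advances by
 * the number of GSO segments, a wrap of the low word carries into the high
 * word, and the starting value is then combined with the high word and
 * converted to big endian (cpu_to_be64(seq + ((u64)xo->seq.hi << 32))).
 * The function and parameter names are hypothetical.
 */
static __be64 esn_advance_and_pack(u32 *lo, u32 *hi, unsigned int gso_segs)
{
	u32 start = *lo;

	*lo += gso_segs;	/* one sequence number per emitted segment */
	if (*lo < start)	/* low word wrapped: carry into the high word */
		(*hi)++;

	/* Pack the starting sequence number with the (possibly updated) high word. */
	return cpu_to_be64(start + ((u64)*hi << 32));
}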
// SPDX-License-Identifier: GPL-2.0 /* * drivers/base/power/main.c - Where the driver meets power management. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * * The driver model core calls device_pm_add() when a device is registered. * This will initialize the embedded device_pm_info object in the device * and add it to the list of power-controlled devices. sysfs entries for * controlling device power management will also be added. * * A separate list is used for keeping track of power info, because the power * domain dependencies may differ from the ancestral dependencies that the * subsystem list maintains. */ #define pr_fmt(fmt) "PM: " fmt #define dev_fmt pr_fmt #include <linux/device.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/pm.h> #include <linux/pm_runtime.h> #include <linux/pm-trace.h> #include <linux/pm_wakeirq.h> #include <linux/interrupt.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/async.h> #include <linux/suspend.h> #include <trace/events/power.h> #include <linux/cpufreq.h> #include <linux/devfreq.h> #include <linux/timer.h> #include <linux/nmi.h> #include "../base.h" #include "power.h" typedef int (*pm_callback_t)(struct device *); /* * The entries in the dpm_list list are in a depth first order, simply * because children are guaranteed to be discovered after parents, and * are inserted at the back of the list on discovery. * * Since device_pm_add() may be called with a device lock held, * we must never try to acquire a device lock while holding * dpm_list_mtx. */ LIST_HEAD(dpm_list); static LIST_HEAD(dpm_prepared_list); static LIST_HEAD(dpm_suspended_list); static LIST_HEAD(dpm_late_early_list); static LIST_HEAD(dpm_noirq_list); static DEFINE_MUTEX(dpm_list_mtx); static pm_message_t pm_transition; static DEFINE_MUTEX(async_wip_mtx); static int async_error; /** * pm_hibernate_is_recovering - check if recovering from hibernation due to an error. * * Used to query whether dev_pm_ops.thaw() is being called for the normal * hibernation case or while recovering from an error. * * Return: true for the error (recovery) case, false for the normal case. */ bool pm_hibernate_is_recovering(void) { return pm_transition.event == PM_EVENT_RECOVER; } EXPORT_SYMBOL_GPL(pm_hibernate_is_recovering); static const char *pm_verb(int event) { switch (event) { case PM_EVENT_SUSPEND: return "suspend"; case PM_EVENT_RESUME: return "resume"; case PM_EVENT_FREEZE: return "freeze"; case PM_EVENT_QUIESCE: return "quiesce"; case PM_EVENT_HIBERNATE: return "hibernate"; case PM_EVENT_THAW: return "thaw"; case PM_EVENT_RESTORE: return "restore"; case PM_EVENT_RECOVER: return "recover"; case PM_EVENT_POWEROFF: return "poweroff"; default: return "(unknown PM event)"; } } /** * device_pm_sleep_init - Initialize system suspend-related device fields. * @dev: Device object being initialized. */ void device_pm_sleep_init(struct device *dev) { dev->power.is_prepared = false; dev->power.is_suspended = false; dev->power.is_noirq_suspended = false; dev->power.is_late_suspended = false; init_completion(&dev->power.completion); complete_all(&dev->power.completion); dev->power.wakeup = NULL; INIT_LIST_HEAD(&dev->power.entry); } /** * device_pm_lock - Lock the list of active devices used by the PM core. */ void device_pm_lock(void) { mutex_lock(&dpm_list_mtx); } /** * device_pm_unlock - Unlock the list of active devices used by the PM core.
*/ void device_pm_unlock(void) { mutex_unlock(&dpm_list_mtx); } /** * device_pm_add - Add a device to the PM core's list of active devices. * @dev: Device to add to the list. */ void device_pm_add(struct device *dev) { /* Skip PM setup/initialization. */ if (device_pm_not_required(dev)) return; pr_debug("Adding info for %s:%s\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); device_pm_check_callbacks(dev); mutex_lock(&dpm_list_mtx); if (dev->parent && dev->parent->power.is_prepared) dev_warn(dev, "parent %s should not be sleeping\n", dev_name(dev->parent)); list_add_tail(&dev->power.entry, &dpm_list); dev->power.in_dpm_list = true; mutex_unlock(&dpm_list_mtx); } /** * device_pm_remove - Remove a device from the PM core's list of active devices. * @dev: Device to be removed from the list. */ void device_pm_remove(struct device *dev) { if (device_pm_not_required(dev)) return; pr_debug("Removing info for %s:%s\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); complete_all(&dev->power.completion); mutex_lock(&dpm_list_mtx); list_del_init(&dev->power.entry); dev->power.in_dpm_list = false; mutex_unlock(&dpm_list_mtx); device_wakeup_disable(dev); pm_runtime_remove(dev); device_pm_check_callbacks(dev); } /** * device_pm_move_before - Move device in the PM core's list of active devices. * @deva: Device to move in dpm_list. * @devb: Device @deva should come before. */ void device_pm_move_before(struct device *deva, struct device *devb) { pr_debug("Moving %s:%s before %s:%s\n", deva->bus ? deva->bus->name : "No Bus", dev_name(deva), devb->bus ? devb->bus->name : "No Bus", dev_name(devb)); /* Delete deva from dpm_list and reinsert before devb. */ list_move_tail(&deva->power.entry, &devb->power.entry); } /** * device_pm_move_after - Move device in the PM core's list of active devices. * @deva: Device to move in dpm_list. * @devb: Device @deva should come after. */ void device_pm_move_after(struct device *deva, struct device *devb) { pr_debug("Moving %s:%s after %s:%s\n", deva->bus ? deva->bus->name : "No Bus", dev_name(deva), devb->bus ? devb->bus->name : "No Bus", dev_name(devb)); /* Delete deva from dpm_list and reinsert after devb. */ list_move(&deva->power.entry, &devb->power.entry); } /** * device_pm_move_last - Move device to end of the PM core's list of devices. * @dev: Device to move in dpm_list. */ void device_pm_move_last(struct device *dev) { pr_debug("Moving %s:%s to end of list\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); list_move_tail(&dev->power.entry, &dpm_list); } static ktime_t initcall_debug_start(struct device *dev, void *cb) { if (!pm_print_times_enabled) return 0; dev_info(dev, "calling %ps @ %i, parent: %s\n", cb, task_pid_nr(current), dev->parent ? dev_name(dev->parent) : "none"); return ktime_get(); } static void initcall_debug_report(struct device *dev, ktime_t calltime, void *cb, int error) { ktime_t rettime; if (!pm_print_times_enabled) return; rettime = ktime_get(); dev_info(dev, "%ps returned %d after %Ld usecs\n", cb, error, (unsigned long long)ktime_us_delta(rettime, calltime)); } /** * dpm_wait - Wait for a PM operation to complete. * @dev: Device to wait for. * @async: If unset, wait only if the device's power.async_suspend flag is set. 
*/ static void dpm_wait(struct device *dev, bool async) { if (!dev) return; if (async || (pm_async_enabled && dev->power.async_suspend)) wait_for_completion(&dev->power.completion); } static int dpm_wait_fn(struct device *dev, void *async_ptr) { dpm_wait(dev, *((bool *)async_ptr)); return 0; } static void dpm_wait_for_children(struct device *dev, bool async) { device_for_each_child(dev, &async, dpm_wait_fn); } static void dpm_wait_for_suppliers(struct device *dev, bool async) { struct device_link *link; int idx; idx = device_links_read_lock(); /* * If the supplier goes away right after we've checked the link to it, * we'll wait for its completion to change the state, but that's fine, * because the only things that will block as a result are the SRCU * callbacks freeing the link objects for the links in the list we're * walking. */ dev_for_each_link_to_supplier(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT && !device_link_flag_is_sync_state_only(link->flags)) dpm_wait(link->supplier, async); device_links_read_unlock(idx); } static bool dpm_wait_for_superior(struct device *dev, bool async) { struct device *parent; /* * If the device is resumed asynchronously and the parent's callback * deletes both the device and the parent itself, the parent object may * be freed while this function is running, so avoid that by reference * counting the parent once more unless the device has been deleted * already (in which case return right away). */ mutex_lock(&dpm_list_mtx); if (!device_pm_initialized(dev)) { mutex_unlock(&dpm_list_mtx); return false; } parent = get_device(dev->parent); mutex_unlock(&dpm_list_mtx); dpm_wait(parent, async); put_device(parent); dpm_wait_for_suppliers(dev, async); /* * If the parent's callback has deleted the device, attempting to resume * it would be invalid, so avoid doing that then. */ return device_pm_initialized(dev); } static void dpm_wait_for_consumers(struct device *dev, bool async) { struct device_link *link; int idx; idx = device_links_read_lock(); /* * The status of a device link can only be changed from "dormant" by a * probe, but that cannot happen during system suspend/resume. In * theory it can change to "dormant" at that time, but then it is * reasonable to wait for the target device anyway (eg. if it goes * away, it's better to wait for it to go away completely and then * continue instead of trying to continue in parallel with its * unregistration). */ dev_for_each_link_to_consumer(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT && !device_link_flag_is_sync_state_only(link->flags)) dpm_wait(link->consumer, async); device_links_read_unlock(idx); } static void dpm_wait_for_subordinate(struct device *dev, bool async) { dpm_wait_for_children(dev, async); dpm_wait_for_consumers(dev, async); } /** * pm_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. 
*/ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend; case PM_EVENT_RESUME: return ops->resume; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze; case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw; case PM_EVENT_RESTORE: return ops->restore; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } /** * pm_late_early_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. * * Runtime PM is disabled for @dev while this function is being executed. */ static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend_late; case PM_EVENT_RESUME: return ops->resume_early; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_late; case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff_late; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw_early; case PM_EVENT_RESTORE: return ops->restore_early; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } /** * pm_noirq_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static pm_callback_t pm_noirq_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend_noirq; case PM_EVENT_RESUME: return ops->resume_noirq; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_noirq; case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff_noirq; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw_noirq; case PM_EVENT_RESTORE: return ops->restore_noirq; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } static void pm_dev_dbg(struct device *dev, pm_message_t state, const char *info) { dev_dbg(dev, "%s%s%s driver flags: %x\n", info, pm_verb(state.event), ((state.event & PM_EVENT_SLEEP) && device_may_wakeup(dev)) ? ", may wakeup" : "", dev->power.driver_flags); } static void pm_dev_err(struct device *dev, pm_message_t state, const char *info, int error) { dev_err(dev, "failed to %s%s: error %d\n", pm_verb(state.event), info, error); } static void dpm_show_time(ktime_t starttime, pm_message_t state, int error, const char *info) { ktime_t calltime; u64 usecs64; int usecs; calltime = ktime_get(); usecs64 = ktime_to_ns(ktime_sub(calltime, starttime)); do_div(usecs64, NSEC_PER_USEC); usecs = usecs64; if (usecs == 0) usecs = 1; pm_pr_dbg("%s%s%s of devices %s after %ld.%03ld msecs\n", info ?: "", info ? " " : "", pm_verb(state.event), error ? 
"aborted" : "complete", usecs / USEC_PER_MSEC, usecs % USEC_PER_MSEC); } static int dpm_run_callback(pm_callback_t cb, struct device *dev, pm_message_t state, const char *info) { ktime_t calltime; int error; if (!cb) return 0; calltime = initcall_debug_start(dev, cb); pm_dev_dbg(dev, state, info); trace_device_pm_callback_start(dev, info, state.event); error = cb(dev); trace_device_pm_callback_end(dev, error); suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); return error; } #ifdef CONFIG_DPM_WATCHDOG struct dpm_watchdog { struct device *dev; struct task_struct *tsk; struct timer_list timer; bool fatal; }; #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \ struct dpm_watchdog wd static bool __read_mostly dpm_watchdog_all_cpu_backtrace; module_param(dpm_watchdog_all_cpu_backtrace, bool, 0644); MODULE_PARM_DESC(dpm_watchdog_all_cpu_backtrace, "Backtrace all CPUs on DPM watchdog timeout"); /** * dpm_watchdog_handler - Driver suspend / resume watchdog handler. * @t: The timer that PM watchdog depends on. * * Called when a driver has timed out suspending or resuming. * There's not much we can do here to recover so panic() to * capture a crash-dump in pstore. */ static void dpm_watchdog_handler(struct timer_list *t) { struct dpm_watchdog *wd = timer_container_of(wd, t, timer); struct timer_list *timer = &wd->timer; unsigned int time_left; if (wd->fatal) { unsigned int this_cpu = smp_processor_id(); dev_emerg(wd->dev, "**** DPM device timeout ****\n"); show_stack(wd->tsk, NULL, KERN_EMERG); if (dpm_watchdog_all_cpu_backtrace) trigger_allbutcpu_cpu_backtrace(this_cpu); panic("%s %s: unrecoverable failure\n", dev_driver_string(wd->dev), dev_name(wd->dev)); } time_left = CONFIG_DPM_WATCHDOG_TIMEOUT - CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT; dev_warn(wd->dev, "**** DPM device timeout after %u seconds; %u seconds until panic ****\n", CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT, time_left); show_stack(wd->tsk, NULL, KERN_WARNING); wd->fatal = true; mod_timer(timer, jiffies + HZ * time_left); } /** * dpm_watchdog_set - Enable pm watchdog for given device. * @wd: Watchdog. Must be allocated on the stack. * @dev: Device to handle. */ static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev) { struct timer_list *timer = &wd->timer; wd->dev = dev; wd->tsk = current; wd->fatal = CONFIG_DPM_WATCHDOG_TIMEOUT == CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT; timer_setup_on_stack(timer, dpm_watchdog_handler, 0); /* use same timeout value for both suspend and resume */ timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT; add_timer(timer); } /** * dpm_watchdog_clear - Disable suspend/resume watchdog. * @wd: Watchdog to disable. */ static void dpm_watchdog_clear(struct dpm_watchdog *wd) { struct timer_list *timer = &wd->timer; timer_delete_sync(timer); timer_destroy_on_stack(timer); } #else #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) #define dpm_watchdog_set(x, y) #define dpm_watchdog_clear(x) #endif /*------------------------- Resume routines -------------------------*/ /** * dev_pm_skip_resume - System-wide device resume optimization check. * @dev: Target device. * * Return: * - %false if the transition under way is RESTORE. * - Return value of dev_pm_skip_suspend() if the transition under way is THAW. * - The logical negation of %power.must_resume otherwise (that is, when the * transition under way is RESUME). 
*/ bool dev_pm_skip_resume(struct device *dev) { if (pm_transition.event == PM_EVENT_RESTORE) return false; if (pm_transition.event == PM_EVENT_THAW) return dev_pm_skip_suspend(dev); return !dev->power.must_resume; } static bool is_async(struct device *dev) { return dev->power.async_suspend && pm_async_enabled && !pm_trace_is_enabled(); } static bool __dpm_async(struct device *dev, async_func_t func) { if (dev->power.work_in_progress) return true; if (!is_async(dev)) return false; dev->power.work_in_progress = true; get_device(dev); if (async_schedule_dev_nocall(func, dev)) return true; put_device(dev); return false; } static bool dpm_async_fn(struct device *dev, async_func_t func) { guard(mutex)(&async_wip_mtx); return __dpm_async(dev, func); } static int dpm_async_with_cleanup(struct device *dev, void *fn) { guard(mutex)(&async_wip_mtx); if (!__dpm_async(dev, fn)) dev->power.work_in_progress = false; return 0; } static void dpm_async_resume_children(struct device *dev, async_func_t func) { /* * Prevent racing with dpm_clear_async_state() during initial list * walks in dpm_noirq_resume_devices(), dpm_resume_early(), and * dpm_resume(). */ guard(mutex)(&dpm_list_mtx); /* * Start processing "async" children of the device unless it's been * started already for them. */ device_for_each_child(dev, func, dpm_async_with_cleanup); } static void dpm_async_resume_subordinate(struct device *dev, async_func_t func) { struct device_link *link; int idx; dpm_async_resume_children(dev, func); idx = device_links_read_lock(); /* Start processing the device's "async" consumers. */ dev_for_each_link_to_consumer(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_async_with_cleanup(link->consumer, func); device_links_read_unlock(idx); } static void dpm_clear_async_state(struct device *dev) { reinit_completion(&dev->power.completion); dev->power.work_in_progress = false; } static bool dpm_root_device(struct device *dev) { lockdep_assert_held(&dpm_list_mtx); /* * Since this function is required to run under dpm_list_mtx, the * list_empty() below will only return true if the device's list of * suppliers is actually empty before calling it. */ return !dev->parent && list_empty(&dev->links.suppliers); } static void async_resume_noirq(void *data, async_cookie_t cookie); /** * device_resume_noirq - Execute a "noirq resume" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static void device_resume_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; bool skip_resume; int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore || dev->power.direct_complete) goto Out; if (!dev->power.is_noirq_suspended) { /* * This means that system suspend has been aborted in the noirq * phase before invoking the noirq suspend callback for the * device, so if device_suspend_late() has left it in suspend, * device_resume_early() should leave it in suspend too, in * case its early resume depends on the noirq resume that * has not run.
*/ if (dev_pm_skip_suspend(dev)) dev->power.must_resume = false; goto Out; } if (!dpm_wait_for_superior(dev, async)) goto Out; skip_resume = dev_pm_skip_resume(dev); /* * If the driver callback is skipped below or by the middle layer * callback and device_resume_early() also skips the driver callback for * this device later, it needs to appear as "suspended" to PM-runtime, * so change its status accordingly. * * Otherwise, the device is going to be resumed, so set its PM-runtime * status to "active" unless its power.smart_suspend flag is clear, in * which case it is not necessary to update its PM-runtime status. */ if (skip_resume) pm_runtime_set_suspended(dev); else if (dev_pm_smart_suspend(dev)) pm_runtime_set_active(dev); if (dev->pm_domain) { info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (callback) goto Run; if (skip_resume) goto Skip; if (dev->driver && dev->driver->pm) { info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); Skip: dev->power.is_noirq_suspended = false; Out: complete_all(&dev->power.completion); TRACE_RESUME(error); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); } dpm_async_resume_subordinate(dev, async_resume_noirq); } static void async_resume_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume_noirq(dev, pm_transition, true); put_device(dev); } static void dpm_noirq_resume_devices(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, true); async_error = 0; pm_transition = state; mutex_lock(&dpm_list_mtx); /* * Start processing "async" root devices upfront so they don't wait for * the "sync" devices they don't depend on. */ list_for_each_entry(dev, &dpm_noirq_list, power.entry) { dpm_clear_async_state(dev); if (dpm_root_device(dev)) dpm_async_with_cleanup(dev, async_resume_noirq); } while (!list_empty(&dpm_noirq_list)) { dev = to_device(dpm_noirq_list.next); list_move_tail(&dev->power.entry, &dpm_late_early_list); if (!dpm_async_fn(dev, async_resume_noirq)) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume_noirq(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "noirq"); if (READ_ONCE(async_error)) dpm_save_failed_step(SUSPEND_RESUME_NOIRQ); trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false); } /** * dpm_resume_noirq - Execute "noirq resume" callbacks for all devices. * @state: PM transition of the system being carried out. * * Invoke the "noirq" resume callbacks for all devices in dpm_noirq_list and * allow device drivers' interrupt handlers to be called. */ void dpm_resume_noirq(pm_message_t state) { dpm_noirq_resume_devices(state); resume_device_irqs(); device_wakeup_disarm_wake_irqs(); } static void async_resume_early(void *data, async_cookie_t cookie); /** * device_resume_early - Execute an "early resume" callback for given device. * @dev: Device to handle. 
* @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. * * Runtime PM is disabled for @dev while this function is being executed. */ static void device_resume_early(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.direct_complete) goto Out; if (!dev->power.is_late_suspended) goto Out; if (dev->power.syscore) goto Skip; if (!dpm_wait_for_superior(dev, async)) goto Out; if (dev->pm_domain) { info = "early power domain "; callback = pm_late_early_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "early type "; callback = pm_late_early_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "early class "; callback = pm_late_early_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "early bus "; callback = pm_late_early_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_resume(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "early driver "; callback = pm_late_early_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); Skip: dev->power.is_late_suspended = false; pm_runtime_enable(dev); Out: TRACE_RESUME(error); complete_all(&dev->power.completion); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async early" : " early", error); } dpm_async_resume_subordinate(dev, async_resume_early); } static void async_resume_early(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume_early(dev, pm_transition, true); put_device(dev); } /** * dpm_resume_early - Execute "early resume" callbacks for all devices. * @state: PM transition of the system being carried out. */ void dpm_resume_early(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume_early"), state.event, true); async_error = 0; pm_transition = state; mutex_lock(&dpm_list_mtx); /* * Start processing "async" root devices upfront so they don't wait for * the "sync" devices they don't depend on. */ list_for_each_entry(dev, &dpm_late_early_list, power.entry) { dpm_clear_async_state(dev); if (dpm_root_device(dev)) dpm_async_with_cleanup(dev, async_resume_early); } while (!list_empty(&dpm_late_early_list)) { dev = to_device(dpm_late_early_list.next); list_move_tail(&dev->power.entry, &dpm_suspended_list); if (!dpm_async_fn(dev, async_resume_early)) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume_early(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "early"); if (READ_ONCE(async_error)) dpm_save_failed_step(SUSPEND_RESUME_EARLY); trace_suspend_resume(TPS("dpm_resume_early"), state.event, false); } /** * dpm_resume_start - Execute "noirq" and "early" device callbacks. * @state: PM transition of the system being carried out. */ void dpm_resume_start(pm_message_t state) { dpm_resume_noirq(state); dpm_resume_early(state); } EXPORT_SYMBOL_GPL(dpm_resume_start); static void async_resume(void *data, async_cookie_t cookie); /** * device_resume - Execute "resume" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. 
*/ static void device_resume(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; DECLARE_DPM_WATCHDOG_ON_STACK(wd); TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore) goto Complete; if (!dev->power.is_suspended) goto Complete; dev->power.is_suspended = false; if (dev->power.direct_complete) { /* * Allow new children to be added under the device after this * point if it has no PM callbacks. */ if (dev->power.no_pm_callbacks) dev->power.is_prepared = false; /* Match the pm_runtime_disable() in device_suspend(). */ pm_runtime_enable(dev); goto Complete; } if (!dpm_wait_for_superior(dev, async)) goto Complete; dpm_watchdog_set(&wd, dev); device_lock(dev); /* * This is a fib. But we'll allow new children to be added below * a resumed device, even if the device hasn't been completed yet. */ dev->power.is_prepared = false; if (dev->pm_domain) { info = "power domain "; callback = pm_op(&dev->pm_domain->ops, state); goto Driver; } if (dev->type && dev->type->pm) { info = "type "; callback = pm_op(dev->type->pm, state); goto Driver; } if (dev->class && dev->class->pm) { info = "class "; callback = pm_op(dev->class->pm, state); goto Driver; } if (dev->bus) { if (dev->bus->pm) { info = "bus "; callback = pm_op(dev->bus->pm, state); } else if (dev->bus->resume) { info = "legacy bus "; callback = dev->bus->resume; goto End; } } Driver: if (!callback && dev->driver && dev->driver->pm) { info = "driver "; callback = pm_op(dev->driver->pm, state); } End: error = dpm_run_callback(callback, dev, state, info); device_unlock(dev); dpm_watchdog_clear(&wd); Complete: complete_all(&dev->power.completion); TRACE_RESUME(error); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async" : "", error); } dpm_async_resume_subordinate(dev, async_resume); } static void async_resume(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume(dev, pm_transition, true); put_device(dev); } /** * dpm_resume - Execute "resume" callbacks for non-sysdev devices. * @state: PM transition of the system being carried out. * * Execute the appropriate "resume" callback for all devices whose status * indicates that they are suspended. */ void dpm_resume(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume"), state.event, true); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); /* * Start processing "async" root devices upfront so they don't wait for * the "sync" devices they don't depend on. */ list_for_each_entry(dev, &dpm_suspended_list, power.entry) { dpm_clear_async_state(dev); if (dpm_root_device(dev)) dpm_async_with_cleanup(dev, async_resume); } while (!list_empty(&dpm_suspended_list)) { dev = to_device(dpm_suspended_list.next); list_move_tail(&dev->power.entry, &dpm_prepared_list); if (!dpm_async_fn(dev, async_resume)) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, NULL); if (READ_ONCE(async_error)) dpm_save_failed_step(SUSPEND_RESUME); cpufreq_resume(); devfreq_resume(); trace_suspend_resume(TPS("dpm_resume"), state.event, false); } /** * device_complete - Complete a PM transition for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. 
*/ static void device_complete(struct device *dev, pm_message_t state) { void (*callback)(struct device *) = NULL; const char *info = NULL; if (dev->power.syscore) goto out; device_lock(dev); if (dev->pm_domain) { info = "completing power domain "; callback = dev->pm_domain->ops.complete; } else if (dev->type && dev->type->pm) { info = "completing type "; callback = dev->type->pm->complete; } else if (dev->class && dev->class->pm) { info = "completing class "; callback = dev->class->pm->complete; } else if (dev->bus && dev->bus->pm) { info = "completing bus "; callback = dev->bus->pm->complete; } if (!callback && dev->driver && dev->driver->pm) { info = "completing driver "; callback = dev->driver->pm->complete; } if (callback) { pm_dev_dbg(dev, state, info); callback(dev); } device_unlock(dev); out: /* If enabling runtime PM for the device is blocked, unblock it. */ pm_runtime_unblock(dev); pm_runtime_put(dev); } /** * dpm_complete - Complete a PM transition for all non-sysdev devices. * @state: PM transition of the system being carried out. * * Execute the ->complete() callbacks for all devices whose PM status is not * DPM_ON (this allows new devices to be registered). */ void dpm_complete(pm_message_t state) { struct list_head list; trace_suspend_resume(TPS("dpm_complete"), state.event, true); INIT_LIST_HEAD(&list); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_prepared_list)) { struct device *dev = to_device(dpm_prepared_list.prev); get_device(dev); dev->power.is_prepared = false; list_move(&dev->power.entry, &list); mutex_unlock(&dpm_list_mtx); trace_device_pm_callback_start(dev, "", state.event); device_complete(dev, state); trace_device_pm_callback_end(dev, 0); put_device(dev); mutex_lock(&dpm_list_mtx); } list_splice(&list, &dpm_list); mutex_unlock(&dpm_list_mtx); /* Allow device probing and trigger re-probing of deferred devices */ device_unblock_probing(); trace_suspend_resume(TPS("dpm_complete"), state.event, false); } /** * dpm_resume_end - Execute "resume" callbacks and complete system transition. * @state: PM transition of the system being carried out. * * Execute "resume" callbacks for all devices and complete the PM transition of * the system. */ void dpm_resume_end(pm_message_t state) { dpm_resume(state); pm_restore_gfp_mask(); dpm_complete(state); } EXPORT_SYMBOL_GPL(dpm_resume_end); /*------------------------- Suspend routines -------------------------*/ static bool dpm_leaf_device(struct device *dev) { struct device *child; lockdep_assert_held(&dpm_list_mtx); child = device_find_any_child(dev); if (child) { put_device(child); return false; } /* * Since this function is required to run under dpm_list_mtx, the * list_empty() below will only return true if the device's list of * consumers is actually empty before calling it. */ return list_empty(&dev->links.consumers); } static bool dpm_async_suspend_parent(struct device *dev, async_func_t func) { guard(mutex)(&dpm_list_mtx); /* * If the device is suspended asynchronously and the parent's callback * deletes both the device and the parent itself, the parent object may * be freed while this function is running, so avoid that by checking * if the device has been deleted already as the parent cannot be * deleted before it. */ if (!device_pm_initialized(dev)) return false; /* Start processing the device's parent if it is "async". 
*/ if (dev->parent) dpm_async_with_cleanup(dev->parent, func); return true; } static void dpm_async_suspend_superior(struct device *dev, async_func_t func) { struct device_link *link; int idx; if (!dpm_async_suspend_parent(dev, func)) return; idx = device_links_read_lock(); /* Start processing the device's "async" suppliers. */ dev_for_each_link_to_supplier(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_async_with_cleanup(link->supplier, func); device_links_read_unlock(idx); } static void dpm_async_suspend_complete_all(struct list_head *device_list) { struct device *dev; guard(mutex)(&async_wip_mtx); list_for_each_entry_reverse(dev, device_list, power.entry) { /* * In case the device is being waited for and async processing * has not started for it yet, let the waiters make progress. */ if (!dev->power.work_in_progress) complete_all(&dev->power.completion); } } /** * resume_event - Return a "resume" message for given "suspend" sleep state. * @sleep_state: PM message representing a sleep state. * * Return a PM message representing the resume event corresponding to given * sleep state. */ static pm_message_t resume_event(pm_message_t sleep_state) { switch (sleep_state.event) { case PM_EVENT_SUSPEND: return PMSG_RESUME; case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return PMSG_RECOVER; case PM_EVENT_HIBERNATE: return PMSG_RESTORE; } return PMSG_ON; } static void dpm_superior_set_must_resume(struct device *dev) { struct device_link *link; int idx; if (dev->parent) dev->parent->power.must_resume = true; idx = device_links_read_lock(); dev_for_each_link_to_supplier(link, dev) link->supplier->power.must_resume = true; device_links_read_unlock(idx); } static void async_suspend_noirq(void *data, async_cookie_t cookie); /** * device_suspend_noirq - Execute a "noirq suspend" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static void device_suspend_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (READ_ONCE(async_error)) goto Complete; if (dev->power.syscore || dev->power.direct_complete) goto Complete; if (dev->pm_domain) { info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_suspend(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? 
" async noirq" : " noirq", error); goto Complete; } Skip: dev->power.is_noirq_suspended = true; /* * Devices must be resumed unless they are explicitly allowed to be left * in suspend, but even in that case skipping the resume of devices that * were in use right before the system suspend (as indicated by their * runtime PM usage counters and child counters) would be suboptimal. */ if (!(dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME) && dev->power.may_skip_resume) || !pm_runtime_need_not_resume(dev)) dev->power.must_resume = true; if (dev->power.must_resume) dpm_superior_set_must_resume(dev); Complete: complete_all(&dev->power.completion); TRACE_SUSPEND(error); if (error || READ_ONCE(async_error)) return; dpm_async_suspend_superior(dev, async_suspend_noirq); } static void async_suspend_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend_noirq(dev, pm_transition, true); put_device(dev); } static int dpm_noirq_suspend_devices(pm_message_t state) { ktime_t starttime = ktime_get(); struct device *dev; int error; trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, true); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); /* * Start processing "async" leaf devices upfront so they don't need to * wait for the "sync" devices they don't depend on. */ list_for_each_entry_reverse(dev, &dpm_late_early_list, power.entry) { dpm_clear_async_state(dev); if (dpm_leaf_device(dev)) dpm_async_with_cleanup(dev, async_suspend_noirq); } while (!list_empty(&dpm_late_early_list)) { dev = to_device(dpm_late_early_list.prev); list_move(&dev->power.entry, &dpm_noirq_list); if (dpm_async_fn(dev, async_suspend_noirq)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); device_suspend_noirq(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (READ_ONCE(async_error)) { dpm_async_suspend_complete_all(&dpm_late_early_list); /* * Move all devices to the target list to resume them * properly. */ list_splice_init(&dpm_late_early_list, &dpm_noirq_list); break; } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); error = READ_ONCE(async_error); if (error) dpm_save_failed_step(SUSPEND_SUSPEND_NOIRQ); dpm_show_time(starttime, state, error, "noirq"); trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, false); return error; } /** * dpm_suspend_noirq - Execute "noirq suspend" callbacks for all devices. * @state: PM transition of the system being carried out. * * Prevent device drivers' interrupt handlers from being called and invoke * "noirq" suspend callbacks for all non-sysdev devices. */ int dpm_suspend_noirq(pm_message_t state) { int ret; device_wakeup_arm_wake_irqs(); suspend_device_irqs(); ret = dpm_noirq_suspend_devices(state); if (ret) dpm_resume_noirq(resume_event(state)); return ret; } static void dpm_propagate_wakeup_to_parent(struct device *dev) { struct device *parent = dev->parent; if (!parent) return; spin_lock_irq(&parent->power.lock); if (device_wakeup_path(dev) && !parent->power.ignore_children) parent->power.wakeup_path = true; spin_unlock_irq(&parent->power.lock); } static void async_suspend_late(void *data, async_cookie_t cookie); /** * device_suspend_late - Execute a "late suspend" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. * * Runtime PM is disabled for @dev while this function is being executed. 
*/ static void device_suspend_late(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (READ_ONCE(async_error)) goto Complete; if (pm_wakeup_pending()) { WRITE_ONCE(async_error, -EBUSY); goto Complete; } if (dev->power.direct_complete) goto Complete; /* * Disable runtime PM for the device without checking if there is a * pending resume request for it. */ __pm_runtime_disable(dev, false); if (dev->power.syscore) goto Skip; if (dev->pm_domain) { info = "late power domain "; callback = pm_late_early_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "late type "; callback = pm_late_early_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "late class "; callback = pm_late_early_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "late bus "; callback = pm_late_early_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_suspend(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "late driver "; callback = pm_late_early_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async late" : " late", error); pm_runtime_enable(dev); goto Complete; } dpm_propagate_wakeup_to_parent(dev); Skip: dev->power.is_late_suspended = true; Complete: TRACE_SUSPEND(error); complete_all(&dev->power.completion); if (error || READ_ONCE(async_error)) return; dpm_async_suspend_superior(dev, async_suspend_late); } static void async_suspend_late(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend_late(dev, pm_transition, true); put_device(dev); } /** * dpm_suspend_late - Execute "late suspend" callbacks for all devices. * @state: PM transition of the system being carried out. */ int dpm_suspend_late(pm_message_t state) { ktime_t starttime = ktime_get(); struct device *dev; int error; trace_suspend_resume(TPS("dpm_suspend_late"), state.event, true); pm_transition = state; async_error = 0; wake_up_all_idle_cpus(); mutex_lock(&dpm_list_mtx); /* * Start processing "async" leaf devices upfront so they don't need to * wait for the "sync" devices they don't depend on. */ list_for_each_entry_reverse(dev, &dpm_suspended_list, power.entry) { dpm_clear_async_state(dev); if (dpm_leaf_device(dev)) dpm_async_with_cleanup(dev, async_suspend_late); } while (!list_empty(&dpm_suspended_list)) { dev = to_device(dpm_suspended_list.prev); list_move(&dev->power.entry, &dpm_late_early_list); if (dpm_async_fn(dev, async_suspend_late)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); device_suspend_late(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (READ_ONCE(async_error)) { dpm_async_suspend_complete_all(&dpm_suspended_list); /* * Move all devices to the target list to resume them * properly. */ list_splice_init(&dpm_suspended_list, &dpm_late_early_list); break; } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); error = READ_ONCE(async_error); if (error) { dpm_save_failed_step(SUSPEND_SUSPEND_LATE); dpm_resume_early(resume_event(state)); } dpm_show_time(starttime, state, error, "late"); trace_suspend_resume(TPS("dpm_suspend_late"), state.event, false); return error; } /** * dpm_suspend_end - Execute "late" and "noirq" device suspend callbacks. 
* @state: PM transition of the system being carried out. */ int dpm_suspend_end(pm_message_t state) { ktime_t starttime = ktime_get(); int error; error = dpm_suspend_late(state); if (error) goto out; error = dpm_suspend_noirq(state); if (error) dpm_resume_early(resume_event(state)); out: dpm_show_time(starttime, state, error, "end"); return error; } EXPORT_SYMBOL_GPL(dpm_suspend_end); /** * legacy_suspend - Execute a legacy (bus or class) suspend callback for device. * @dev: Device to suspend. * @state: PM transition of the system being carried out. * @cb: Suspend callback to execute. * @info: string description of caller. */ static int legacy_suspend(struct device *dev, pm_message_t state, int (*cb)(struct device *dev, pm_message_t state), const char *info) { int error; ktime_t calltime; calltime = initcall_debug_start(dev, cb); trace_device_pm_callback_start(dev, info, state.event); error = cb(dev, state); trace_device_pm_callback_end(dev, error); suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); return error; } static void dpm_clear_superiors_direct_complete(struct device *dev) { struct device_link *link; int idx; if (dev->parent) { spin_lock_irq(&dev->parent->power.lock); dev->parent->power.direct_complete = false; spin_unlock_irq(&dev->parent->power.lock); } idx = device_links_read_lock(); dev_for_each_link_to_supplier(link, dev) { spin_lock_irq(&link->supplier->power.lock); link->supplier->power.direct_complete = false; spin_unlock_irq(&link->supplier->power.lock); } device_links_read_unlock(idx); } static void async_suspend(void *data, async_cookie_t cookie); /** * device_suspend - Execute "suspend" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. */ static void device_suspend(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; DECLARE_DPM_WATCHDOG_ON_STACK(wd); TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (READ_ONCE(async_error)) { dev->power.direct_complete = false; goto Complete; } /* * Wait for possible runtime PM transitions of the device in progress * to complete and if there's a runtime resume request pending for it, * resume it before proceeding with invoking the system-wide suspend * callbacks for it. * * If the system-wide suspend callbacks below change the configuration * of the device, they must disable runtime PM for it or otherwise * ensure that its runtime-resume callbacks will not be confused by that * change in case they are invoked going forward. */ pm_runtime_barrier(dev); if (pm_wakeup_pending()) { dev->power.direct_complete = false; WRITE_ONCE(async_error, -EBUSY); goto Complete; } if (dev->power.syscore) goto Complete; /* Avoid direct_complete to let wakeup_path propagate. 
*/ if (device_may_wakeup(dev) || device_wakeup_path(dev)) dev->power.direct_complete = false; if (dev->power.direct_complete) { if (pm_runtime_status_suspended(dev)) { pm_runtime_disable(dev); if (pm_runtime_status_suspended(dev)) { pm_dev_dbg(dev, state, "direct-complete "); dev->power.is_suspended = true; goto Complete; } pm_runtime_enable(dev); } dev->power.direct_complete = false; } dev->power.may_skip_resume = true; dev->power.must_resume = !dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME); dpm_watchdog_set(&wd, dev); device_lock(dev); if (dev->pm_domain) { info = "power domain "; callback = pm_op(&dev->pm_domain->ops, state); goto Run; } if (dev->type && dev->type->pm) { info = "type "; callback = pm_op(dev->type->pm, state); goto Run; } if (dev->class && dev->class->pm) { info = "class "; callback = pm_op(dev->class->pm, state); goto Run; } if (dev->bus) { if (dev->bus->pm) { info = "bus "; callback = pm_op(dev->bus->pm, state); } else if (dev->bus->suspend) { pm_dev_dbg(dev, state, "legacy bus "); error = legacy_suspend(dev, state, dev->bus->suspend, "legacy bus "); goto End; } } Run: if (!callback && dev->driver && dev->driver->pm) { info = "driver "; callback = pm_op(dev->driver->pm, state); } error = dpm_run_callback(callback, dev, state, info); End: if (!error) { dev->power.is_suspended = true; if (device_may_wakeup(dev)) dev->power.wakeup_path = true; dpm_propagate_wakeup_to_parent(dev); dpm_clear_superiors_direct_complete(dev); } device_unlock(dev); dpm_watchdog_clear(&wd); Complete: if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async" : "", error); } complete_all(&dev->power.completion); TRACE_SUSPEND(error); if (error || READ_ONCE(async_error)) return; dpm_async_suspend_superior(dev, async_suspend); } static void async_suspend(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend(dev, pm_transition, true); put_device(dev); } /** * dpm_suspend - Execute "suspend" callbacks for all non-sysdev devices. * @state: PM transition of the system being carried out. */ int dpm_suspend(pm_message_t state) { ktime_t starttime = ktime_get(); struct device *dev; int error; trace_suspend_resume(TPS("dpm_suspend"), state.event, true); might_sleep(); devfreq_suspend(); cpufreq_suspend(); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); /* * Start processing "async" leaf devices upfront so they don't need to * wait for the "sync" devices they don't depend on. */ list_for_each_entry_reverse(dev, &dpm_prepared_list, power.entry) { dpm_clear_async_state(dev); if (dpm_leaf_device(dev)) dpm_async_with_cleanup(dev, async_suspend); } while (!list_empty(&dpm_prepared_list)) { dev = to_device(dpm_prepared_list.prev); list_move(&dev->power.entry, &dpm_suspended_list); if (dpm_async_fn(dev, async_suspend)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); device_suspend(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (READ_ONCE(async_error)) { dpm_async_suspend_complete_all(&dpm_prepared_list); /* * Move all devices to the target list to resume them * properly. 
*/ list_splice_init(&dpm_prepared_list, &dpm_suspended_list); break; } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); error = READ_ONCE(async_error); if (error) dpm_save_failed_step(SUSPEND_SUSPEND); dpm_show_time(starttime, state, error, NULL); trace_suspend_resume(TPS("dpm_suspend"), state.event, false); return error; } static bool device_prepare_smart_suspend(struct device *dev) { struct device_link *link; bool ret = true; int idx; /* * The "smart suspend" feature is enabled for devices whose drivers ask * for it and for devices without PM callbacks. * * However, if "smart suspend" is not enabled for the device's parent * or any of its suppliers that take runtime PM into account, it cannot * be enabled for the device either. */ if (!dev->power.no_pm_callbacks && !dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND)) return false; if (dev->parent && !dev_pm_smart_suspend(dev->parent) && !dev->parent->power.ignore_children && !pm_runtime_blocked(dev->parent)) return false; idx = device_links_read_lock(); dev_for_each_link_to_supplier(link, dev) { if (!device_link_test(link, DL_FLAG_PM_RUNTIME)) continue; if (!dev_pm_smart_suspend(link->supplier) && !pm_runtime_blocked(link->supplier)) { ret = false; break; } } device_links_read_unlock(idx); return ret; } /** * device_prepare - Prepare a device for system power transition. * @dev: Device to handle. * @state: PM transition of the system being carried out. * * Execute the ->prepare() callback(s) for given device. No new children of the * device may be registered after this function has returned. */ static int device_prepare(struct device *dev, pm_message_t state) { int (*callback)(struct device *) = NULL; bool smart_suspend; int ret = 0; /* * If a device's parent goes into runtime suspend at the wrong time, * it won't be possible to resume the device. To prevent this we * block runtime suspend here, during the prepare phase, and allow * it again during the complete phase. */ pm_runtime_get_noresume(dev); /* * If runtime PM is disabled for the device at this point and it has * never been enabled so far, it should not be enabled until this system * suspend-resume cycle is complete, so prepare to trigger a warning on * subsequent attempts to enable it. */ smart_suspend = !pm_runtime_block_if_disabled(dev); if (dev->power.syscore) return 0; device_lock(dev); dev->power.wakeup_path = false; if (dev->power.no_pm_callbacks) goto unlock; if (dev->pm_domain) callback = dev->pm_domain->ops.prepare; else if (dev->type && dev->type->pm) callback = dev->type->pm->prepare; else if (dev->class && dev->class->pm) callback = dev->class->pm->prepare; else if (dev->bus && dev->bus->pm) callback = dev->bus->pm->prepare; if (!callback && dev->driver && dev->driver->pm) callback = dev->driver->pm->prepare; if (callback) ret = callback(dev); unlock: device_unlock(dev); if (ret < 0) { suspend_report_result(dev, callback, ret); pm_runtime_put(dev); return ret; } /* Do not enable "smart suspend" for devices with disabled runtime PM. */ if (smart_suspend) smart_suspend = device_prepare_smart_suspend(dev); spin_lock_irq(&dev->power.lock); dev->power.smart_suspend = smart_suspend; /* * A positive return value from ->prepare() means "this device appears * to be runtime-suspended and its state is fine, so if it really is * runtime-suspended, you can leave it in that state provided that you * will do the same thing with all of its descendants". This only * applies to suspend transitions, however. 
*/ dev->power.direct_complete = state.event == PM_EVENT_SUSPEND && (ret > 0 || dev->power.no_pm_callbacks) && !dev_pm_test_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE); spin_unlock_irq(&dev->power.lock); return 0; } /** * dpm_prepare - Prepare all non-sysdev devices for a system PM transition. * @state: PM transition of the system being carried out. * * Execute the ->prepare() callback(s) for all devices. */ int dpm_prepare(pm_message_t state) { int error = 0; trace_suspend_resume(TPS("dpm_prepare"), state.event, true); /* * Give a chance for the known devices to complete their probes, before * disable probing of devices. This sync point is important at least * at boot time + hibernation restore. */ wait_for_device_probe(); /* * It is unsafe if probing of devices will happen during suspend or * hibernation and system behavior will be unpredictable in this case. * So, let's prohibit device's probing here and defer their probes * instead. The normal behavior will be restored in dpm_complete(). */ device_block_probing(); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_list) && !error) { struct device *dev = to_device(dpm_list.next); get_device(dev); mutex_unlock(&dpm_list_mtx); trace_device_pm_callback_start(dev, "", state.event); error = device_prepare(dev, state); trace_device_pm_callback_end(dev, error); mutex_lock(&dpm_list_mtx); if (!error) { dev->power.is_prepared = true; if (!list_empty(&dev->power.entry)) list_move_tail(&dev->power.entry, &dpm_prepared_list); } else if (error == -EAGAIN) { error = 0; } else { dev_info(dev, "not prepared for power transition: code %d\n", error); } mutex_unlock(&dpm_list_mtx); put_device(dev); mutex_lock(&dpm_list_mtx); } mutex_unlock(&dpm_list_mtx); trace_suspend_resume(TPS("dpm_prepare"), state.event, false); return error; } /** * dpm_suspend_start - Prepare devices for PM transition and suspend them. * @state: PM transition of the system being carried out. * * Prepare all non-sysdev devices for system PM transition and execute "suspend" * callbacks for them. */ int dpm_suspend_start(pm_message_t state) { ktime_t starttime = ktime_get(); int error; error = dpm_prepare(state); if (error) dpm_save_failed_step(SUSPEND_PREPARE); else { pm_restrict_gfp_mask(); error = dpm_suspend(state); } dpm_show_time(starttime, state, error, "start"); return error; } EXPORT_SYMBOL_GPL(dpm_suspend_start); void __suspend_report_result(const char *function, struct device *dev, void *fn, int ret) { if (ret) dev_err(dev, "%s(): %ps returns %d\n", function, fn, ret); } EXPORT_SYMBOL_GPL(__suspend_report_result); /** * device_pm_wait_for_dev - Wait for suspend/resume of a device to complete. * @subordinate: Device that needs to wait for @dev. * @dev: Device to wait for. */ int device_pm_wait_for_dev(struct device *subordinate, struct device *dev) { dpm_wait(dev, subordinate->power.async_suspend); return async_error; } EXPORT_SYMBOL_GPL(device_pm_wait_for_dev); /** * dpm_for_each_dev - device iterator. * @data: data for the callback. * @fn: function to be called for each device. * * Iterate over devices in dpm_list, and call @fn for each device, * passing it @data. 
*/ void dpm_for_each_dev(void *data, void (*fn)(struct device *, void *)) { struct device *dev; if (!fn) return; device_pm_lock(); list_for_each_entry(dev, &dpm_list, power.entry) fn(dev, data); device_pm_unlock(); } EXPORT_SYMBOL_GPL(dpm_for_each_dev); static bool pm_ops_is_empty(const struct dev_pm_ops *ops) { if (!ops) return true; return !ops->prepare && !ops->suspend && !ops->suspend_late && !ops->suspend_noirq && !ops->resume_noirq && !ops->resume_early && !ops->resume && !ops->complete; } void device_pm_check_callbacks(struct device *dev) { unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); dev->power.no_pm_callbacks = (!dev->bus || (pm_ops_is_empty(dev->bus->pm) && !dev->bus->suspend && !dev->bus->resume)) && (!dev->class || pm_ops_is_empty(dev->class->pm)) && (!dev->type || pm_ops_is_empty(dev->type->pm)) && (!dev->pm_domain || pm_ops_is_empty(&dev->pm_domain->ops)) && (!dev->driver || (pm_ops_is_empty(dev->driver->pm) && !dev->driver->suspend && !dev->driver->resume)); spin_unlock_irqrestore(&dev->power.lock, flags); } bool dev_pm_skip_suspend(struct device *dev) { return dev_pm_smart_suspend(dev) && pm_runtime_status_suspended(dev); }
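The flags consulted by the code above (DPM_FLAG_SMART_SUSPEND, DPM_FLAG_MAY_SKIP_RESUME, DPM_FLAG_NO_DIRECT_COMPLETE) are opted into by individual drivers, typically at probe time via dev_pm_set_driver_flags(). The following is a minimal sketch of a hypothetical platform driver doing so; the driver name and probe body are illustrative and not taken from the file above.

#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/pm.h>
#include <linux/pm_runtime.h>

static int foo_probe(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;

	/*
	 * Let the PM core leave this device runtime-suspended across a system
	 * suspend (see device_prepare()/dev_pm_skip_suspend() above) and skip
	 * its "noirq"/"early" resume when nothing sets power.must_resume.
	 */
	dev_pm_set_driver_flags(dev, DPM_FLAG_SMART_SUSPEND |
				     DPM_FLAG_MAY_SKIP_RESUME);

	pm_runtime_set_active(dev);
	pm_runtime_enable(dev);
	return 0;
}

static struct platform_driver foo_driver = {
	.probe = foo_probe,
	.driver = {
		.name = "foo-pm-example",
	},
};
module_platform_driver(foo_driver);

MODULE_LICENSE("GPL");

A driver that sets these flags must be prepared for its ->suspend_late()/->suspend_noirq() callbacks to be skipped entirely when the device is already runtime-suspended, which is exactly the condition dev_pm_skip_suspend() tests above.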
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_FPU_XSTATE_H #define __X86_KERNEL_FPU_XSTATE_H #include <asm/cpufeature.h> #include <asm/fpu/xstate.h> #include <asm/fpu/xcr.h> #include <asm/msr.h> #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u64, xfd_state); #endif static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) { /* * XRSTORS requires these bits set in xcomp_bv, or it will * trigger #GP: */ if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; } static inline u64 xstate_get_group_perm(bool guest) { struct fpu *fpu = x86_task_fpu(current->group_leader); struct fpu_state_perm *perm; /* Pairs with WRITE_ONCE() in xstate_request_perm() */ perm = guest ?

&fpu->guest_perm : &fpu->perm; return READ_ONCE(perm->__state_perm); } static inline u64 xstate_get_host_group_perm(void) { return xstate_get_group_perm(false); } enum xstate_copy_mode { XSTATE_COPY_FP, XSTATE_COPY_FX, XSTATE_COPY_XSAVE, }; struct membuf; extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, u64 xfeatures, u32 pkru_val, enum xstate_copy_mode copy_mode); extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode mode); extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru); extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf); extern void fpu__init_cpu_xstate(void); extern void fpu__init_system_xstate(unsigned int legacy_size); extern void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr); static inline u64 xfeatures_mask_supervisor(void) { return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; } static inline u64 xfeatures_mask_independent(void) { if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) return fpu_kernel_cfg.independent_features & ~XFEATURE_MASK_LBR; return fpu_kernel_cfg.independent_features; } static inline int set_xfeature_in_sigframe(struct xregs_state __user *xbuf, u64 mask) { u64 xfeatures; int err; /* Read the xfeatures value already saved in the user buffer */ err = __get_user(xfeatures, &xbuf->header.xfeatures); xfeatures |= mask; err |= __put_user(xfeatures, &xbuf->header.xfeatures); return err; } /* * Update the value of PKRU register that was already pushed onto the signal frame. */ static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru) { int err; if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE))) return 0; /* Mark PKRU as in-use so that it is restored correctly. */ err = set_xfeature_in_sigframe(buf, XFEATURE_MASK_PKRU); if (err) return err; /* Update PKRU value in the userspace xsave buffer. */ return __put_user(pkru, (unsigned int __user *)get_xsave_addr_user(buf, XFEATURE_PKRU)); } /* XSAVE/XRSTOR wrapper functions */ #ifdef CONFIG_X86_64 #define REX_SUFFIX "64" #else #define REX_SUFFIX #endif #define XSAVE "xsave" REX_SUFFIX " %[xa]" #define XSAVEOPT "xsaveopt" REX_SUFFIX " %[xa]" #define XSAVEC "xsavec" REX_SUFFIX " %[xa]" #define XSAVES "xsaves" REX_SUFFIX " %[xa]" #define XRSTOR "xrstor" REX_SUFFIX " %[xa]" #define XRSTORS "xrstors" REX_SUFFIX " %[xa]" /* * After this @err contains 0 on success or the trap number when the * operation raises an exception. * * The [xa] input parameter below represents the struct xregs_state pointer * and the asm symbolic name for the argument used in the XSAVE/XRSTOR insns * above. */ #define XSTATE_OP(op, st, lmask, hmask, err) \ asm volatile("1:" op "\n\t" \ "xor %[err], %[err]\n" \ "2:\n" \ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ : [err] "=a" (err) \ : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \ : "memory") /* * If XSAVES is enabled, it replaces XSAVEC because it supports supervisor * states in addition to XSAVEC. * * Otherwise if XSAVEC is enabled, it replaces XSAVEOPT because it supports * compacted storage format in addition to XSAVEOPT. * * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT * supports modified optimization which is not supported by XSAVE. * * Use XSAVE as a fallback. 
*/ #define XSTATE_XSAVE(st, lmask, hmask, err) \ asm volatile("1: " ALTERNATIVE_3(XSAVE, \ XSAVEOPT, X86_FEATURE_XSAVEOPT, \ XSAVEC, X86_FEATURE_XSAVEC, \ XSAVES, X86_FEATURE_XSAVES) \ "\n\t" \ "xor %[err], %[err]\n" \ "3:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG, %[err]) \ : [err] "=r" (err) \ : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \ : "memory") /* * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact * XSAVE area format. */ #define XSTATE_XRESTORE(st, lmask, hmask) \ asm volatile("1: " ALTERNATIVE(XRSTOR, \ XRSTORS, X86_FEATURE_XSAVES) \ "\n" \ "3:\n" \ _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FPU_RESTORE) \ : \ : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \ : "memory") #if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU) extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor); #else static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { } #endif #ifdef CONFIG_X86_64 static inline void xfd_set_state(u64 xfd) { wrmsrq(MSR_IA32_XFD, xfd); __this_cpu_write(xfd_state, xfd); } static inline void xfd_update_state(struct fpstate *fpstate) { if (fpu_state_size_dynamic()) { u64 xfd = fpstate->xfd; if (__this_cpu_read(xfd_state) != xfd) xfd_set_state(xfd); } } extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu); #else static inline void xfd_set_state(u64 xfd) { } static inline void xfd_update_state(struct fpstate *fpstate) { } static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) { return -EPERM; } #endif /* * Save processor xstate to xsave area. * * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features * and command line options. The choice is permanent until the next reboot. */ static inline void os_xsave(struct fpstate *fpstate) { u64 mask = fpstate->xfeatures; u32 lmask = mask; u32 hmask = mask >> 32; int err; WARN_ON_FPU(!alternatives_patched); xfd_validate_state(fpstate, mask, false); XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err); /* We should never fault when copying to a kernel buffer: */ WARN_ON_FPU(err); } /* * Restore processor xstate from xsave area. * * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. */ static inline void os_xrstor(struct fpstate *fpstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; xfd_validate_state(fpstate, mask, true); XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); } /* Restore of supervisor state. Does not require XFD */ static inline void os_xrstor_supervisor(struct fpstate *fpstate) { u64 mask = xfeatures_mask_supervisor(); u32 lmask = mask; u32 hmask = mask >> 32; XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); } /* * XSAVE itself always writes all requested xfeatures. Removing features * from the request bitmap reduces the features which are written. * Generate a mask of features which must be written to a sigframe. The * unset features can be optimized away and not written. * * This optimization is user-visible. Only use for states where * uninitialized sigframe contents are tolerable, like dynamic features. * * Users of buffers produced with this optimization must check XSTATE_BV * to determine which features have been optimized out. 
*/ static inline u64 xfeatures_need_sigframe_write(void) { u64 xfeaures_to_write; /* In-use features must be written: */ xfeaures_to_write = xfeatures_in_use(); /* Also write all non-optimizable sigframe features: */ xfeaures_to_write |= XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_SIGFRAME_INITOPT; return xfeaures_to_write; } /* * Save xstate to user space xsave area. * * We don't use modified optimization because xrstor/xrstors might track * a different application. * * We don't use compacted format xsave area for backward compatibility for * old applications which don't understand the compacted format of the * xsave area. * * The caller has to zero buf::header before calling this because XSAVE* * does not touch the reserved fields in the header. */ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkru) { /* * Include the features which are not xsaved/rstored by the kernel * internally, e.g. PKRU. That's user space ABI and also required * to allow the signal handler to modify PKRU. */ struct fpstate *fpstate = x86_task_fpu(current)->fpstate; u64 mask = fpstate->user_xfeatures; u32 lmask; u32 hmask; int err; /* Optimize away writing unnecessary xfeatures: */ if (fpu_state_size_dynamic()) mask &= xfeatures_need_sigframe_write(); lmask = mask; hmask = mask >> 32; xfd_validate_state(fpstate, mask, false); stac(); XSTATE_OP(XSAVE, buf, lmask, hmask, err); clac(); if (!err) err = update_pkru_in_sigframe(buf, pkru); return err; } /* * Restore xstate from user space xsave area. */ static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask) { struct xregs_state *xstate = ((__force struct xregs_state *)buf); u32 lmask = mask; u32 hmask = mask >> 32; int err; xfd_validate_state(x86_task_fpu(current)->fpstate, mask, true); stac(); XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); clac(); return err; } /* * Restore xstate from kernel space xsave area, return an error code instead of * an exception. */ static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask) { struct xregs_state *xstate = &fpstate->regs.xsave; u32 lmask = mask; u32 hmask = mask >> 32; int err; /* Ensure that XFD is up to date */ xfd_update_state(fpstate); if (cpu_feature_enabled(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); return err; } #endif
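Every XSAVE/XRSTOR wrapper above hands the requested 64-bit feature mask to the instruction as an EDX:EAX pair, which is why each helper derives lmask/hmask from the u64 mask before invoking XSTATE_OP/XSTATE_XSAVE/XSTATE_XRESTORE. A stand-alone illustration of that split is sketched below; the struct and helper names are hypothetical and not part of the header.

#include <stdint.h>

/* EDX:EAX pair passed via the "a"/"d" asm constraints in XSTATE_OP above. */
struct xfeature_mask_pair {
	uint32_t lmask;		/* low 32 feature bits -> EAX */
	uint32_t hmask;		/* high 32 feature bits -> EDX */
};

static inline struct xfeature_mask_pair split_xfeature_mask(uint64_t mask)
{
	return (struct xfeature_mask_pair){
		.lmask = (uint32_t)mask,
		.hmask = (uint32_t)(mask >> 32),
	};
	/* Mirrors "u32 lmask = mask; u32 hmask = mask >> 32;" in os_xsave(). */
}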
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Hash: Hash algorithms under the crypto API * * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_HASH_H #define _CRYPTO_HASH_H #include <linux/crypto.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/string.h> /* Set this bit for virtual address instead of SG list. */ #define CRYPTO_AHASH_REQ_VIRT 0x00000001 #define CRYPTO_AHASH_REQ_PRIVATE \ CRYPTO_AHASH_REQ_VIRT struct crypto_ahash; /** * DOC: Message Digest Algorithm Definitions * * These data structures define modular message digest algorithm * implementations, managed via crypto_register_ahash(), * crypto_register_shash(), crypto_unregister_ahash() and * crypto_unregister_shash(). */ /* * struct hash_alg_common - define properties of message digest * @digestsize: Size of the result of the transformation. A buffer of this size * must be available to the @final and @finup calls, so they can * store the resulting hash into it. For various predefined sizes, * search include/crypto/ using * git grep _DIGEST_SIZE include/crypto. * @statesize: Size of the block for partial state of the transformation. A * buffer of this size must be passed to the @export function as it * will save the partial state of the transformation into it. On the * other side, the @import function will load the state from a * buffer of this size as well. * @base: Start of data structure of cipher algorithm. The common data * structure of crypto_alg contains information common to all ciphers. * The hash_alg_common data structure now adds the hash-specific * information. */ #define HASH_ALG_COMMON { \ unsigned int digestsize; \ unsigned int statesize; \ \ struct crypto_alg base; \ } struct hash_alg_common HASH_ALG_COMMON; struct ahash_request { struct crypto_async_request base; unsigned int nbytes; union { struct scatterlist *src; const u8 *svirt; }; u8 *result; struct scatterlist sg_head[2]; crypto_completion_t saved_complete; void *saved_data; void *__ctx[] CRYPTO_MINALIGN_ATTR; }; /** * struct ahash_alg - asynchronous message digest definition * @init: **[mandatory]** Initialize the transformation context. Intended only to initialize the * state of the HASH transformation at the beginning. This shall fill in * the internal structures used during the entire duration of the whole * transformation. No data processing happens at this point. Driver code * implementation must not use req->result. * @update: **[mandatory]** Push a chunk of data into the driver for transformation. This * function actually pushes blocks of data from upper layers into the * driver, which then passes those to the hardware as seen fit. This * function must not finalize the HASH transformation by calculating the * final message digest as this only adds more data into the * transformation.
This function shall not modify the transformation * context, as this function may be called in parallel with the same * transformation object. Data processing can happen synchronously * [SHASH] or asynchronously [AHASH] at this point. Driver must not use * req->result. * For block-only algorithms, @update must return the number * of bytes to store in the API partial block buffer. * @final: **[mandatory]** Retrieve result from the driver. This function finalizes the * transformation and retrieves the resulting hash from the driver and * pushes it back to upper layers. No data processing happens at this * point unless hardware requires it to finish the transformation * (then the data buffered by the device driver is processed). * @finup: **[optional]** Combination of @update and @final. This function is effectively a * combination of @update and @final calls issued in sequence. As some * hardware cannot do @update and @final separately, this callback was * added to allow such hardware to be used at least by IPsec. Data * processing can happen synchronously [SHASH] or asynchronously [AHASH] * at this point. * @digest: Combination of @init and @update and @final. This function * effectively behaves as the entire chain of operations, @init, * @update and @final issued in sequence. Just like @finup, this was * added for hardware which cannot do even the @finup, but can only do * the whole transformation in one run. Data processing can happen * synchronously [SHASH] or asynchronously [AHASH] at this point. * @setkey: Set optional key used by the hashing algorithm. Intended to push * optional key used by the hashing algorithm from upper layers into * the driver. This function can store the key in the transformation * context or can outright program it into the hardware. In the former * case, one must be careful to program the key into the hardware at * appropriate time and one must be careful that .setkey() can be * called multiple times during the existence of the transformation * object. Not all hashing algorithms do implement this function as it * is only needed for keyed message digests. SHAx/MDx/CRCx do NOT * implement this function. HMAC(MDx)/HMAC(SHAx)/CMAC(AES) do implement * this function. This function must be called before any other of the * @init, @update, @final, @finup, @digest is called. No data * processing happens at this point. * @export: Export partial state of the transformation. This function dumps the * entire state of the ongoing transformation into a provided block of * data so it can be @import 'ed back later on. This is useful in case * you want to save partial result of the transformation after * processing certain amount of data and reload this partial result * multiple times later on for multiple re-use. No data processing * happens at this point. Driver must not use req->result. * @import: Import partial state of the transformation. This function loads the * entire state of the ongoing transformation from a provided block of * data so the transformation can continue from this point onward. No * data processing happens at this point. Driver must not use * req->result. * @export_core: Export partial state without partial block. Only defined * for algorithms that are not block-only. * @import_core: Import partial state without partial block. Only defined * for algorithms that are not block-only. * @init_tfm: Initialize the cryptographic transformation object. 
* This function is called only once at the instantiation * time, right after the transformation context was * allocated. In case the cryptographic hardware has * some special requirements which need to be handled * by software, this function shall check for the precise * requirement of the transformation and put any software * fallbacks in place. * @exit_tfm: Deinitialize the cryptographic transformation object. * This is a counterpart to @init_tfm, used to remove * various changes set in @init_tfm. * @clone_tfm: Copy transform into new object, may allocate memory. * @halg: see struct hash_alg_common */ struct ahash_alg { int (*init)(struct ahash_request *req); int (*update)(struct ahash_request *req); int (*final)(struct ahash_request *req); int (*finup)(struct ahash_request *req); int (*digest)(struct ahash_request *req); int (*export)(struct ahash_request *req, void *out); int (*import)(struct ahash_request *req, const void *in); int (*export_core)(struct ahash_request *req, void *out); int (*import_core)(struct ahash_request *req, const void *in); int (*setkey)(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen); int (*init_tfm)(struct crypto_ahash *tfm); void (*exit_tfm)(struct crypto_ahash *tfm); int (*clone_tfm)(struct crypto_ahash *dst, struct crypto_ahash *src); struct hash_alg_common halg; }; struct shash_desc { struct crypto_shash *tfm; void *__ctx[] __aligned(ARCH_SLAB_MINALIGN); }; #define HASH_MAX_DIGESTSIZE 64 /* * The size of a core hash state and a partial block. The final byte * is the length of the partial block. */ #define HASH_STATE_AND_BLOCK(state, block) ((state) + (block) + 1) /* Worst case is sha3-224. */ #define HASH_MAX_STATESIZE HASH_STATE_AND_BLOCK(200, 144) /* This needs to match arch/s390/crypto/sha.h. */ #define S390_SHA_CTX_SIZE 216 /* * Worst case is hmac(sha3-224-s390). Its context is a nested 'shash_desc' * containing a 'struct s390_sha_ctx'. */ #define SHA3_224_S390_DESCSIZE HASH_STATE_AND_BLOCK(S390_SHA_CTX_SIZE, 144) #define HASH_MAX_DESCSIZE (sizeof(struct shash_desc) + \ SHA3_224_S390_DESCSIZE) #define MAX_SYNC_HASH_REQSIZE (sizeof(struct ahash_request) + \ HASH_MAX_DESCSIZE) #define SHASH_DESC_ON_STACK(shash, ctx) \ char __##shash##_desc[sizeof(struct shash_desc) + HASH_MAX_DESCSIZE] \ __aligned(__alignof__(struct shash_desc)); \ struct shash_desc *shash = (struct shash_desc *)__##shash##_desc #define HASH_REQUEST_ON_STACK(name, _tfm) \ char __##name##_req[sizeof(struct ahash_request) + \ MAX_SYNC_HASH_REQSIZE] CRYPTO_MINALIGN_ATTR; \ struct ahash_request *name = \ ahash_request_on_stack_init(__##name##_req, (_tfm)) #define HASH_REQUEST_CLONE(name, gfp) \ hash_request_clone(name, sizeof(__##name##_req), gfp) #define CRYPTO_HASH_STATESIZE(coresize, blocksize) (coresize + blocksize + 1) /** * struct shash_alg - synchronous message digest definition * @init: see struct ahash_alg * @update: see struct ahash_alg * @final: see struct ahash_alg * @finup: see struct ahash_alg * @digest: see struct ahash_alg * @export: see struct ahash_alg * @import: see struct ahash_alg * @export_core: see struct ahash_alg * @import_core: see struct ahash_alg * @setkey: see struct ahash_alg * @init_tfm: Initialize the cryptographic transformation object. * This function is called only once at the instantiation * time, right after the transformation context was * allocated. 
In case the cryptographic hardware has * some special requirements which need to be handled * by software, this function shall check for the precise * requirement of the transformation and put any software * fallbacks in place. * @exit_tfm: Deinitialize the cryptographic transformation object. * This is a counterpart to @init_tfm, used to remove * various changes set in @init_tfm. * @clone_tfm: Copy transform into new object, may allocate memory. * @descsize: Size of the operational state for the message digest. This state * size is the memory size that needs to be allocated for * shash_desc.__ctx * @halg: see struct hash_alg_common * @HASH_ALG_COMMON: see struct hash_alg_common */ struct shash_alg { int (*init)(struct shash_desc *desc); int (*update)(struct shash_desc *desc, const u8 *data, unsigned int len); int (*final)(struct shash_desc *desc, u8 *out); int (*finup)(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out); int (*digest)(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out); int (*export)(struct shash_desc *desc, void *out); int (*import)(struct shash_desc *desc, const void *in); int (*export_core)(struct shash_desc *desc, void *out); int (*import_core)(struct shash_desc *desc, const void *in); int (*setkey)(struct crypto_shash *tfm, const u8 *key, unsigned int keylen); int (*init_tfm)(struct crypto_shash *tfm); void (*exit_tfm)(struct crypto_shash *tfm); int (*clone_tfm)(struct crypto_shash *dst, struct crypto_shash *src); unsigned int descsize; union { struct HASH_ALG_COMMON; struct hash_alg_common halg; }; }; #undef HASH_ALG_COMMON struct crypto_ahash { bool using_shash; /* Underlying algorithm is shash, not ahash */ unsigned int statesize; unsigned int reqsize; struct crypto_tfm base; }; struct crypto_shash { struct crypto_tfm base; }; /** * DOC: Asynchronous Message Digest API * * The asynchronous message digest API is used with the ciphers of type * CRYPTO_ALG_TYPE_AHASH (listed as type "ahash" in /proc/crypto) * * The asynchronous cipher operation discussion provided for the * CRYPTO_ALG_TYPE_SKCIPHER API applies here as well. */ static inline bool ahash_req_on_stack(struct ahash_request *req) { return crypto_req_on_stack(&req->base); } static inline struct crypto_ahash *__crypto_ahash_cast(struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_ahash, base); } /** * crypto_alloc_ahash() - allocate ahash cipher handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * ahash cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for an ahash. The returned struct * crypto_ahash is the cipher handle that is required for any subsequent * API invocation for that ahash. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, u32 type, u32 mask); struct crypto_ahash *crypto_clone_ahash(struct crypto_ahash *tfm); static inline struct crypto_tfm *crypto_ahash_tfm(struct crypto_ahash *tfm) { return &tfm->base; } /** * crypto_free_ahash() - zeroize and free the ahash handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_ahash(struct crypto_ahash *tfm) { crypto_destroy_tfm(tfm, crypto_ahash_tfm(tfm)); } /** * crypto_has_ahash() - Search for the availability of an ahash. 
* @alg_name: is the cra_name / name or cra_driver_name / driver name of the * ahash * @type: specifies the type of the ahash * @mask: specifies the mask for the ahash * * Return: true when the ahash is known to the kernel crypto API; false * otherwise */ int crypto_has_ahash(const char *alg_name, u32 type, u32 mask); static inline const char *crypto_ahash_alg_name(struct crypto_ahash *tfm) { return crypto_tfm_alg_name(crypto_ahash_tfm(tfm)); } static inline const char *crypto_ahash_driver_name(struct crypto_ahash *tfm) { return crypto_tfm_alg_driver_name(crypto_ahash_tfm(tfm)); } /** * crypto_ahash_blocksize() - obtain block size for cipher * @tfm: cipher handle * * The block size for the message digest cipher referenced with the cipher * handle is returned. * * Return: block size of cipher */ static inline unsigned int crypto_ahash_blocksize(struct crypto_ahash *tfm) { return crypto_tfm_alg_blocksize(crypto_ahash_tfm(tfm)); } static inline struct hash_alg_common *__crypto_hash_alg_common( struct crypto_alg *alg) { return container_of(alg, struct hash_alg_common, base); } static inline struct hash_alg_common *crypto_hash_alg_common( struct crypto_ahash *tfm) { return __crypto_hash_alg_common(crypto_ahash_tfm(tfm)->__crt_alg); } /** * crypto_ahash_digestsize() - obtain message digest size * @tfm: cipher handle * * The size for the message digest created by the message digest cipher * referenced with the cipher handle is returned. * * * Return: message digest size of cipher */ static inline unsigned int crypto_ahash_digestsize(struct crypto_ahash *tfm) { return crypto_hash_alg_common(tfm)->digestsize; } /** * crypto_ahash_statesize() - obtain size of the ahash state * @tfm: cipher handle * * Return the size of the ahash state. With the crypto_ahash_export() * function, the caller can export the state into a buffer whose size is * defined with this function. * * Return: size of the ahash state */ static inline unsigned int crypto_ahash_statesize(struct crypto_ahash *tfm) { return tfm->statesize; } static inline u32 crypto_ahash_get_flags(struct crypto_ahash *tfm) { return crypto_tfm_get_flags(crypto_ahash_tfm(tfm)); } static inline void crypto_ahash_set_flags(struct crypto_ahash *tfm, u32 flags) { crypto_tfm_set_flags(crypto_ahash_tfm(tfm), flags); } static inline void crypto_ahash_clear_flags(struct crypto_ahash *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_ahash_tfm(tfm), flags); } /** * crypto_ahash_reqtfm() - obtain cipher handle from request * @req: asynchronous request handle that contains the reference to the ahash * cipher handle * * Return the ahash cipher handle that is registered with the asynchronous * request handle ahash_request. * * Return: ahash cipher handle */ static inline struct crypto_ahash *crypto_ahash_reqtfm( struct ahash_request *req) { return __crypto_ahash_cast(req->base.tfm); } /** * crypto_ahash_reqsize() - obtain size of the request data structure * @tfm: cipher handle * * Return: size of the request data */ static inline unsigned int crypto_ahash_reqsize(struct crypto_ahash *tfm) { return tfm->reqsize; } static inline void *ahash_request_ctx(struct ahash_request *req) { return req->__ctx; } /** * crypto_ahash_setkey - set key for cipher handle * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the ahash cipher. The cipher * handle must point to a keyed hash in order for this function to succeed. 
* * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen); /** * crypto_ahash_finup() - update and finalize message digest * @req: reference to the ahash_request handle that holds all information * needed to perform the cipher operation * * This function is a "short-hand" for the function calls of * crypto_ahash_update and crypto_ahash_final. The parameters have the same * meaning as discussed for those separate functions. * * Return: see crypto_ahash_final() */ int crypto_ahash_finup(struct ahash_request *req); /** * crypto_ahash_final() - calculate message digest * @req: reference to the ahash_request handle that holds all information * needed to perform the cipher operation * * Finalize the message digest operation and create the message digest * based on all data added to the cipher handle. The message digest is placed * into the output buffer registered with the ahash_request handle. * * Return: * 0 if the message digest was successfully calculated; * -EINPROGRESS if data is fed into hardware (DMA) or queued for later; * -EBUSY if queue is full and request should be resubmitted later; * other < 0 if an error occurred */ static inline int crypto_ahash_final(struct ahash_request *req) { req->nbytes = 0; return crypto_ahash_finup(req); } /** * crypto_ahash_digest() - calculate message digest for a buffer * @req: reference to the ahash_request handle that holds all information * needed to perform the cipher operation * * This function is a "short-hand" for the function calls of crypto_ahash_init, * crypto_ahash_update and crypto_ahash_final. The parameters have the same * meaning as discussed for those separate three functions. * * Return: see crypto_ahash_final() */ int crypto_ahash_digest(struct ahash_request *req); /** * crypto_ahash_export() - extract current message digest state * @req: reference to the ahash_request handle whose state is exported * @out: output buffer of sufficient size that can hold the hash state * * This function exports the hash state of the ahash_request handle into the * caller-allocated output buffer out which must have sufficient size (e.g. by * calling crypto_ahash_statesize()). * * Return: 0 if the export was successful; < 0 if an error occurred */ int crypto_ahash_export(struct ahash_request *req, void *out); /** * crypto_ahash_import() - import message digest state * @req: reference to ahash_request handle the state is imported into * @in: buffer holding the state * * This function imports the hash state into the ahash_request handle from the * input buffer. That buffer should have been generated with the * crypto_ahash_export function. * * Return: 0 if the import was successful; < 0 if an error occurred */ int crypto_ahash_import(struct ahash_request *req, const void *in); /** * crypto_ahash_init() - (re)initialize message digest handle * @req: ahash_request handle that already is initialized with all necessary * data using the ahash_request_* API functions * * The call (re-)initializes the message digest referenced by the ahash_request * handle. Any potentially existing state created by previous operations is * discarded. * * Return: see crypto_ahash_final() */ int crypto_ahash_init(struct ahash_request *req); /** * crypto_ahash_update() - add data to message digest for processing * @req: ahash_request handle that was previously initialized with the * crypto_ahash_init call. * * Updates the message digest state of the &ahash_request handle. 
The input data * is pointed to by the scatter/gather list registered in the &ahash_request * handle * * Return: see crypto_ahash_final() */ int crypto_ahash_update(struct ahash_request *req); /** * DOC: Asynchronous Hash Request Handle * * The &ahash_request data structure contains all pointers to data * required for the asynchronous cipher operation. This includes the cipher * handle (which can be used by multiple &ahash_request instances), pointer * to plaintext and the message digest output buffer, asynchronous callback * function, etc. It acts as a handle to the ahash_request_* API calls in a * similar way as ahash handle to the crypto_ahash_* API calls. */ /** * ahash_request_set_tfm() - update cipher handle reference in request * @req: request handle to be modified * @tfm: cipher handle that shall be added to the request handle * * Allow the caller to replace the existing ahash handle in the request * data structure with a different one. */ static inline void ahash_request_set_tfm(struct ahash_request *req, struct crypto_ahash *tfm) { crypto_request_set_tfm(&req->base, crypto_ahash_tfm(tfm)); } /** * ahash_request_alloc() - allocate request data structure * @tfm: cipher handle to be registered with the request * @gfp: memory allocation flag that is handed to kmalloc by the API call. * * Allocate the request data structure that must be used with the ahash * message digest API calls. During * the allocation, the provided ahash handle * is registered in the request data structure. * * Return: allocated request handle in case of success, or NULL if out of memory */ static inline struct ahash_request *ahash_request_alloc_noprof( struct crypto_ahash *tfm, gfp_t gfp) { struct ahash_request *req; req = kmalloc_noprof(sizeof(struct ahash_request) + crypto_ahash_reqsize(tfm), gfp); if (likely(req)) ahash_request_set_tfm(req, tfm); return req; } #define ahash_request_alloc(...) alloc_hooks(ahash_request_alloc_noprof(__VA_ARGS__)) /** * ahash_request_free() - zeroize and free the request data structure * @req: request data structure cipher handle to be freed */ void ahash_request_free(struct ahash_request *req); static inline void ahash_request_zero(struct ahash_request *req) { memzero_explicit(req, sizeof(*req) + crypto_ahash_reqsize(crypto_ahash_reqtfm(req))); } static inline struct ahash_request *ahash_request_cast( struct crypto_async_request *req) { return container_of(req, struct ahash_request, base); } /** * ahash_request_set_callback() - set asynchronous callback function * @req: request handle * @flags: specify zero or an ORing of the flags * CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and * increase the wait queue beyond the initial maximum size; * CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep * @compl: callback function pointer to be registered with the request handle * @data: The data pointer refers to memory that is not used by the kernel * crypto API, but provided to the callback function for it to use. Here, * the caller can provide a reference to memory the callback function can * operate on. As the callback function is invoked asynchronously to the * related functionality, it may need to access data structures of the * related functionality which can be referenced using this pointer. The * callback function can access the memory via the "data" field in the * &crypto_async_request data structure provided to the callback function. * * This function allows setting the callback function that is triggered once * the cipher operation completes. 
* * The callback function is registered with the &ahash_request handle and * must comply with the following template:: * * void callback_function(struct crypto_async_request *req, int error) */ static inline void ahash_request_set_callback(struct ahash_request *req, u32 flags, crypto_completion_t compl, void *data) { flags &= ~CRYPTO_AHASH_REQ_PRIVATE; flags |= req->base.flags & CRYPTO_AHASH_REQ_PRIVATE; crypto_request_set_callback(&req->base, flags, compl, data); } /** * ahash_request_set_crypt() - set data buffers * @req: ahash_request handle to be updated * @src: source scatter/gather list * @result: buffer that is filled with the message digest -- the caller must * ensure that the buffer has sufficient space by, for example, calling * crypto_ahash_digestsize() * @nbytes: number of bytes to process from the source scatter/gather list * * By using this call, the caller references the source scatter/gather list. * The source scatter/gather list points to the data the message digest is to * be calculated for. */ static inline void ahash_request_set_crypt(struct ahash_request *req, struct scatterlist *src, u8 *result, unsigned int nbytes) { req->src = src; req->nbytes = nbytes; req->result = result; req->base.flags &= ~CRYPTO_AHASH_REQ_VIRT; } /** * ahash_request_set_virt() - set virtual address data buffers * @req: ahash_request handle to be updated * @src: source virtual address * @result: buffer that is filled with the message digest -- the caller must * ensure that the buffer has sufficient space by, for example, calling * crypto_ahash_digestsize() * @nbytes: number of bytes to process from the source virtual address * * By using this call, the caller references the source virtual address. * The source virtual address points to the data the message digest is to * be calculated for. */ static inline void ahash_request_set_virt(struct ahash_request *req, const u8 *src, u8 *result, unsigned int nbytes) { req->svirt = src; req->nbytes = nbytes; req->result = result; req->base.flags |= CRYPTO_AHASH_REQ_VIRT; } /** * DOC: Synchronous Message Digest API * * The synchronous message digest API is used with the ciphers of type * CRYPTO_ALG_TYPE_SHASH (listed as type "shash" in /proc/crypto) * * The message digest API is able to maintain state information for the * caller. * * The synchronous message digest API can store user-related context in its * shash_desc request data structure. */ /** * crypto_alloc_shash() - allocate message digest handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * message digest cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for a message digest. The returned &struct * crypto_shash is the cipher handle that is required for any subsequent * API invocation for that message digest. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type, u32 mask); struct crypto_shash *crypto_clone_shash(struct crypto_shash *tfm); int crypto_has_shash(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm) { return &tfm->base; } /** * crypto_free_shash() - zeroize and free the message digest handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. 
*/ static inline void crypto_free_shash(struct crypto_shash *tfm) { crypto_destroy_tfm(tfm, crypto_shash_tfm(tfm)); } static inline const char *crypto_shash_alg_name(struct crypto_shash *tfm) { return crypto_tfm_alg_name(crypto_shash_tfm(tfm)); } static inline const char *crypto_shash_driver_name(struct crypto_shash *tfm) { return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm)); } /** * crypto_shash_blocksize() - obtain block size for cipher * @tfm: cipher handle * * The block size for the message digest cipher referenced with the cipher * handle is returned. * * Return: block size of cipher */ static inline unsigned int crypto_shash_blocksize(struct crypto_shash *tfm) { return crypto_tfm_alg_blocksize(crypto_shash_tfm(tfm)); } static inline struct shash_alg *__crypto_shash_alg(struct crypto_alg *alg) { return container_of(alg, struct shash_alg, base); } static inline struct shash_alg *crypto_shash_alg(struct crypto_shash *tfm) { return __crypto_shash_alg(crypto_shash_tfm(tfm)->__crt_alg); } /** * crypto_shash_digestsize() - obtain message digest size * @tfm: cipher handle * * The size for the message digest created by the message digest cipher * referenced with the cipher handle is returned. * * Return: digest size of cipher */ static inline unsigned int crypto_shash_digestsize(struct crypto_shash *tfm) { return crypto_shash_alg(tfm)->digestsize; } static inline unsigned int crypto_shash_statesize(struct crypto_shash *tfm) { return crypto_shash_alg(tfm)->statesize; } static inline u32 crypto_shash_get_flags(struct crypto_shash *tfm) { return crypto_tfm_get_flags(crypto_shash_tfm(tfm)); } static inline void crypto_shash_set_flags(struct crypto_shash *tfm, u32 flags) { crypto_tfm_set_flags(crypto_shash_tfm(tfm), flags); } static inline void crypto_shash_clear_flags(struct crypto_shash *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_shash_tfm(tfm), flags); } /** * crypto_shash_descsize() - obtain the operational state size * @tfm: cipher handle * * The size of the operational state the cipher needs during operation is * returned for the hash referenced with the cipher handle. This size is * required to calculate the memory requirements to allow the caller allocating * sufficient memory for operational state. * * The operational state is defined with struct shash_desc where the size of * that data structure is to be calculated as * sizeof(struct shash_desc) + crypto_shash_descsize(alg) * * Return: size of the operational state */ static inline unsigned int crypto_shash_descsize(struct crypto_shash *tfm) { return crypto_shash_alg(tfm)->descsize; } static inline void *shash_desc_ctx(struct shash_desc *desc) { return desc->__ctx; } /** * crypto_shash_setkey() - set key for message digest * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the keyed message digest cipher. The * cipher handle must point to a keyed message digest cipher in order for this * function to succeed. * * Context: Softirq or process context. * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen); /** * crypto_shash_digest() - calculate message digest for buffer * @desc: see crypto_shash_final() * @data: see crypto_shash_update() * @len: see crypto_shash_update() * @out: see crypto_shash_final() * * This function is a "short-hand" for the function calls of crypto_shash_init, * crypto_shash_update and crypto_shash_final. 
The parameters have the same * meaning as discussed for those separate three functions. * * Context: Softirq or process context. * Return: 0 if the message digest creation was successful; < 0 if an error * occurred */ int crypto_shash_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out); /** * crypto_shash_tfm_digest() - calculate message digest for buffer * @tfm: hash transformation object * @data: see crypto_shash_update() * @len: see crypto_shash_update() * @out: see crypto_shash_final() * * This is a simplified version of crypto_shash_digest() for users who don't * want to allocate their own hash descriptor (shash_desc). Instead, * crypto_shash_tfm_digest() takes a hash transformation object (crypto_shash) * directly, and it allocates a hash descriptor on the stack internally. * Note that this stack allocation may be fairly large. * * Context: Softirq or process context. * Return: 0 on success; < 0 if an error occurred. */ int crypto_shash_tfm_digest(struct crypto_shash *tfm, const u8 *data, unsigned int len, u8 *out); int crypto_hash_digest(struct crypto_ahash *tfm, const u8 *data, unsigned int len, u8 *out); /** * crypto_shash_export() - extract operational state for message digest * @desc: reference to the operational state handle whose state is exported * @out: output buffer of sufficient size that can hold the hash state * * This function exports the hash state of the operational state handle into the * caller-allocated output buffer out which must have sufficient size (e.g. by * calling crypto_shash_descsize). * * Context: Softirq or process context. * Return: 0 if the export creation was successful; < 0 if an error occurred */ int crypto_shash_export(struct shash_desc *desc, void *out); /** * crypto_shash_import() - import operational state * @desc: reference to the operational state handle the state imported into * @in: buffer holding the state * * This function imports the hash state into the operational state handle from * the input buffer. That buffer should have been generated with the * crypto_ahash_export function. * * Context: Softirq or process context. * Return: 0 if the import was successful; < 0 if an error occurred */ int crypto_shash_import(struct shash_desc *desc, const void *in); /** * crypto_shash_init() - (re)initialize message digest * @desc: operational state handle that is already filled * * The call (re-)initializes the message digest referenced by the * operational state handle. Any potentially existing state created by * previous operations is discarded. * * Context: Softirq or process context. * Return: 0 if the message digest initialization was successful; < 0 if an * error occurred */ int crypto_shash_init(struct shash_desc *desc); /** * crypto_shash_finup() - calculate message digest of buffer * @desc: see crypto_shash_final() * @data: see crypto_shash_update() * @len: see crypto_shash_update() * @out: see crypto_shash_final() * * This function is a "short-hand" for the function calls of * crypto_shash_update and crypto_shash_final. The parameters have the same * meaning as discussed for those separate functions. * * Context: Softirq or process context. 
* Return: 0 if the message digest creation was successful; < 0 if an error * occurred */ int crypto_shash_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out); /** * crypto_shash_update() - add data to message digest for processing * @desc: operational state handle that is already initialized * @data: input data to be added to the message digest * @len: length of the input data * * Updates the message digest state of the operational state handle. * * Context: Softirq or process context. * Return: 0 if the message digest update was successful; < 0 if an error * occurred */ static inline int crypto_shash_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return crypto_shash_finup(desc, data, len, NULL); } /** * crypto_shash_final() - calculate message digest * @desc: operational state handle that is already filled with data * @out: output buffer filled with the message digest * * Finalize the message digest operation and create the message digest * based on all data added to the cipher handle. The message digest is placed * into the output buffer. The caller must ensure that the output buffer is * large enough by using crypto_shash_digestsize. * * Context: Softirq or process context. * Return: 0 if the message digest creation was successful; < 0 if an error * occurred */ static inline int crypto_shash_final(struct shash_desc *desc, u8 *out) { return crypto_shash_finup(desc, NULL, 0, out); } static inline void shash_desc_zero(struct shash_desc *desc) { memzero_explicit(desc, sizeof(*desc) + crypto_shash_descsize(desc->tfm)); } static inline bool ahash_is_async(struct crypto_ahash *tfm) { return crypto_tfm_is_async(&tfm->base); } static inline struct ahash_request *ahash_request_on_stack_init( char *buf, struct crypto_ahash *tfm) { struct ahash_request *req = (void *)buf; crypto_stack_request_init(&req->base, crypto_ahash_tfm(tfm)); return req; } static inline struct ahash_request *ahash_request_clone( struct ahash_request *req, size_t total, gfp_t gfp) { return container_of(crypto_request_clone(&req->base, total, gfp), struct ahash_request, base); } #endif /* _CRYPTO_HASH_H */
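/*
 * Illustrative sketch (not part of the header above): computing a digest
 * with the synchronous shash API documented above. example_digest() and
 * the "sha256" algorithm name are assumptions made for the example;
 * callers pick whatever algorithm and context they actually need.
 */
#include <crypto/hash.h>
#include <linux/err.h>

static int example_digest(const u8 *data, unsigned int len, u8 *out)
{
	struct crypto_shash *tfm;
	int err;

	/* Allocate the tfm; IS_ERR()/PTR_ERR() convey allocation failures. */
	tfm = crypto_alloc_shash("sha256", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/*
	 * One-shot helper: places the shash_desc on the stack internally, so
	 * no separate descriptor allocation is needed. @out must provide at
	 * least crypto_shash_digestsize(tfm) bytes.
	 */
	err = crypto_shash_tfm_digest(tfm, data, len, out);

	crypto_free_shash(tfm);
	return err;
}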
6 2221 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_ERR_H #define _LINUX_ERR_H #include <linux/compiler.h> #include <linux/types.h> #include <asm/errno.h> /* * Kernel pointers have redundant information, so we can use a * scheme where we can return either an error code or a normal * pointer with the same return value. * * This should be a per-architecture thing, to allow different * error and pointer decisions. */ #define MAX_ERRNO 4095 #ifndef __ASSEMBLY__ /** * IS_ERR_VALUE - Detect an error pointer. * @x: The pointer to check. * * Like IS_ERR(), but does not generate a compiler warning if result is unused. */ #define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO) /** * ERR_PTR - Create an error pointer. * @error: A negative error code. * * Encodes @error into a pointer value. Users should consider the result * opaque and not assume anything about how the error is encoded. * * Return: A pointer with @error encoded within its value. */ static inline void * __must_check ERR_PTR(long error) { return (void *) error; } /* Return the pointer in the percpu address space. */ #define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error)) /* Cast an error pointer to __iomem. */ #define IOMEM_ERR_PTR(error) (__force void __iomem *)ERR_PTR(error) /** * PTR_ERR - Extract the error code from an error pointer. * @ptr: An error pointer. * Return: The error code within @ptr. */ static inline long __must_check PTR_ERR(__force const void *ptr) { return (long) ptr; } /* Read an error pointer from the percpu address space. */ #define PTR_ERR_PCPU(ptr) (PTR_ERR((const void *)(__force const unsigned long)(ptr))) /** * IS_ERR - Detect an error pointer. * @ptr: The pointer to check. * Return: true if @ptr is an error pointer, false otherwise. */ static inline bool __must_check IS_ERR(__force const void *ptr) { return IS_ERR_VALUE((unsigned long)ptr); } /* Read an error pointer from the percpu address space. */ #define IS_ERR_PCPU(ptr) (IS_ERR((const void *)(__force const unsigned long)(ptr))) /** * IS_ERR_OR_NULL - Detect an error pointer or a null pointer. * @ptr: The pointer to check. * * Like IS_ERR(), but also returns true for a null pointer. */ static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr) { return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr); } /** * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type * @ptr: The pointer to cast. * * Explicitly cast an error-valued pointer to another pointer type in such a * way as to make it clear that's what's going on. */ static inline void * __must_check ERR_CAST(__force const void *ptr) { /* cast away the const */ return (void *) ptr; } /** * PTR_ERR_OR_ZERO - Extract the error code from a pointer if it has one. * @ptr: A potential error pointer. * * Convenience function that can be used inside a function that returns * an error code to propagate errors received as error pointers. * For example, ``return PTR_ERR_OR_ZERO(ptr);`` replaces: * * .. 
code-block:: c * * if (IS_ERR(ptr)) * return PTR_ERR(ptr); * else * return 0; * * Return: The error code within @ptr if it is an error pointer; 0 otherwise. */ static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr) { if (IS_ERR(ptr)) return PTR_ERR(ptr); else return 0; } #endif #endif /* _LINUX_ERR_H */
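/*
 * Illustrative sketch (not part of err.h): the usual way the error-pointer
 * scheme above is consumed. struct example_obj, example_lookup() and
 * example_use() are hypothetical names used only to show the pattern.
 */
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/kernel.h>

struct example_obj {
	int id;
};

static struct example_obj example_objs[16];

static struct example_obj *example_lookup(int id)
{
	if (id < 0 || id >= ARRAY_SIZE(example_objs))
		return ERR_PTR(-EINVAL);	/* encode the errno in the pointer */
	return &example_objs[id];		/* plain pointer on success */
}

static int example_use(int id)
{
	struct example_obj *obj = example_lookup(id);

	/* 0 for a valid pointer, otherwise the encoded negative errno. */
	return PTR_ERR_OR_ZERO(obj);
}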
4375 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * SHA-256 optimized for x86_64 * * Copyright 2025 Google LLC */ #include <asm/fpu/api.h> #include <linux/static_call.h> static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni); DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic); #define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \ asmlinkage void asm_fn(struct sha256_block_state *state, \ const u8 *data, size_t nblocks); \ static void c_fn(struct sha256_block_state *state, const u8 *data, \ size_t nblocks) \ { \ if (likely(irq_fpu_usable())) { \ kernel_fpu_begin(); \ asm_fn(state, data, nblocks); \ kernel_fpu_end(); \ } else { \ sha256_blocks_generic(state, data, nblocks); \ } \ } DEFINE_X86_SHA256_FN(sha256_blocks_ssse3, sha256_transform_ssse3); DEFINE_X86_SHA256_FN(sha256_blocks_avx, sha256_transform_avx); DEFINE_X86_SHA256_FN(sha256_blocks_avx2, sha256_transform_rorx); DEFINE_X86_SHA256_FN(sha256_blocks_ni, sha256_ni_transform); static void sha256_blocks(struct sha256_block_state *state, const u8 *data, size_t nblocks) { static_call(sha256_blocks_x86)(state, data, nblocks); } static_assert(offsetof(struct __sha256_ctx, state) == 0); static_assert(offsetof(struct __sha256_ctx, bytecount) == 32); static_assert(offsetof(struct __sha256_ctx, buf) == 40); asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2, int len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE]); #define sha256_finup_2x_arch sha256_finup_2x_arch static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE]) { /* * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX. * Further limit len to 65536 to avoid spending too long with preemption * disabled. (Of course, in practice len is nearly always 4096 anyway.) */ if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE && len <= 65536 && likely(irq_fpu_usable())) { kernel_fpu_begin(); sha256_ni_finup2x(ctx, data1, data2, len, out1, out2); kernel_fpu_end(); kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE); kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE); return true; } return false; } static bool sha256_finup_2x_is_optimized_arch(void) { return static_key_enabled(&have_sha_ni); } #define sha256_mod_init_arch sha256_mod_init_arch static void sha256_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_SHA_NI)) { static_call_update(sha256_blocks_x86, sha256_blocks_ni); static_branch_enable(&have_sha_ni); } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && boot_cpu_has(X86_FEATURE_AVX)) { if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) static_call_update(sha256_blocks_x86, sha256_blocks_avx2); else static_call_update(sha256_blocks_x86, sha256_blocks_avx); } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { static_call_update(sha256_blocks_x86, sha256_blocks_ssse3); } }
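/*
 * Illustrative sketch (not part of the file above): the static_call
 * dispatch pattern it relies on, reduced to its core. A static call is
 * declared with a safe default, retargeted once at init time based on CPU
 * features, and invoked on the hot path without an indirect branch.
 * All identifiers here are made up for the example.
 */
#include <linux/init.h>
#include <linux/static_call.h>
#include <linux/types.h>

static void example_blocks_generic(u32 *state, const u8 *data, size_t nblocks)
{
	/* portable fallback implementation */
}

static void example_blocks_fast(u32 *state, const u8 *data, size_t nblocks)
{
	/* CPU-feature-dependent implementation */
}

DEFINE_STATIC_CALL(example_blocks, example_blocks_generic);

static void __init example_init(bool cpu_has_fast_path)
{
	if (cpu_has_fast_path)
		static_call_update(example_blocks, example_blocks_fast);
}

static void example_hash_blocks(u32 *state, const u8 *data, size_t nblocks)
{
	static_call(example_blocks)(state, data, nblocks);
}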
86 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 // SPDX-License-Identifier: GPL-2.0 /* RTT/RTO calculation. * * Adapted from TCP for AF_RXRPC by David Howells (dhowells@redhat.com) * * https://tools.ietf.org/html/rfc6298 * https://tools.ietf.org/html/rfc1122#section-4.2.3.1 * http://ccr.sigcomm.org/archive/1995/jan95/ccr-9501-partridge87.pdf */ #include <linux/net.h> #include "ar-internal.h" #define RXRPC_RTO_MAX (120 * USEC_PER_SEC) #define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * USEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */ #define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */ static u32 rxrpc_rto_min_us(struct rxrpc_call *call) { return 200; } static u32 __rxrpc_set_rto(const struct rxrpc_call *call) { return (call->srtt_us >> 3) + call->rttvar_us; } static u32 rxrpc_bound_rto(u32 rto) { return clamp(200000, rto + 100000, RXRPC_RTO_MAX); } /* * Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 * piece by Van Jacobson. * NOTE: the next three routines used to be one big routine. * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ static void rxrpc_rtt_estimator(struct rxrpc_call *call, long sample_rtt_us) { long m = sample_rtt_us; /* RTT */ u32 srtt = call->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible * m stands for "measurement". * * On a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ if (srtt != 0) { m -= (srtt >> 3); /* m is now error in rtt est */ srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ m -= (call->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain * for mdev in this case (alpha*beta). * Like Eifel it also prevents growth of rto, * but also it limits too fast rto decreases, * happening in pure Eifel. 
*/ if (m > 0) m >>= 3; } else { m -= (call->mdev_us >> 2); /* similar update on mdev */ } call->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ if (call->mdev_us > call->mdev_max_us) { call->mdev_max_us = call->mdev_us; if (call->mdev_max_us > call->rttvar_us) call->rttvar_us = call->mdev_max_us; } } else { /* no previous measure. */ srtt = m << 3; /* take the measured time to be rtt */ call->mdev_us = m << 1; /* make sure rto = 3*rtt */ call->rttvar_us = umax(call->mdev_us, rxrpc_rto_min_us(call)); call->mdev_max_us = call->rttvar_us; } call->srtt_us = umax(srtt, 1); } /* * Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ static void rxrpc_set_rto(struct rxrpc_call *call) { u32 rto; /* 1. If rtt variance happened to be less 50msec, it is hallucination. * It cannot be less due to utterly erratic ACK generation made * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ rto = __rxrpc_set_rto(call); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced * with correct one. It is exactly, which we pretend to do. */ /* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo * guarantees that rto is higher. */ call->rto_us = rxrpc_bound_rto(rto); } static void rxrpc_update_rtt_min(struct rxrpc_call *call, ktime_t resp_time, long rtt_us) { /* Window size 5mins in approx usec (ipv4.sysctl_tcp_min_rtt_wlen) */ u32 wlen_us = 5ULL * NSEC_PER_SEC / 1024; minmax_running_min(&call->min_rtt, wlen_us, resp_time / 1024, (u32)rtt_us ? : jiffies_to_usecs(1)); } static void rxrpc_ack_update_rtt(struct rxrpc_call *call, ktime_t resp_time, long rtt_us) { if (rtt_us < 0) return; /* Update RACK min RTT [RFC8985 6.1 Step 1]. */ rxrpc_update_rtt_min(call, resp_time, rtt_us); rxrpc_rtt_estimator(call, rtt_us); rxrpc_set_rto(call); /* Only reset backoff on valid RTT measurement [RFC6298]. */ call->backoff = 0; } /* * Add RTT information to cache. This is called in softirq mode and has * exclusive access to the call RTT data. */ void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, int rtt_slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, ktime_t send_time, ktime_t resp_time) { s64 rtt_us; rtt_us = ktime_to_us(ktime_sub(resp_time, send_time)); if (rtt_us < 0) return; rxrpc_ack_update_rtt(call, resp_time, rtt_us); if (call->rtt_count < 3) call->rtt_count++; call->rtt_taken++; WRITE_ONCE(call->peer->recent_srtt_us, call->srtt_us / 8); WRITE_ONCE(call->peer->recent_rto_us, call->rto_us); trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial, rtt_us, call->srtt_us, call->rto_us); } /* * Get the retransmission timeout to set in nanoseconds, backing it off each * time we retransmit. */ ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans) { u64 timo_us; u32 backoff = READ_ONCE(call->backoff); timo_us = call->rto_us; timo_us <<= backoff; if (retrans && timo_us * 2 <= RXRPC_RTO_MAX) WRITE_ONCE(call->backoff, backoff + 1); if (timo_us < 1) timo_us = 1; return ns_to_ktime(timo_us * NSEC_PER_USEC); } void rxrpc_call_init_rtt(struct rxrpc_call *call) { call->rtt_last_req = KTIME_MIN; call->rto_us = RXRPC_TIMEOUT_INIT; call->mdev_us = RXRPC_TIMEOUT_INIT; call->backoff = 0; //minmax_reset(&call->rtt_min, rxrpc_jiffies32, ~0U); }
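/*
 * For reference only (not kernel code): the unscaled RFC 6298 form of the
 * estimator implemented in fixed point above. call->srtt_us stores 8*SRTT
 * and call->mdev_us stores 4*RTTVAR, which is why rxrpc_set_rto() can use
 * (srtt_us >> 3) + rttvar_us to obtain SRTT + 4*RTTVAR. This userspace-style
 * sketch uses floating point purely for clarity.
 */
#include <math.h>

static void example_rtt_sample(double *srtt, double *rttvar, double r)
{
	if (*srtt == 0.0) {
		/* First measurement (RFC 6298 2.2). */
		*srtt = r;
		*rttvar = r / 2.0;
		return;
	}
	/* Subsequent measurements (RFC 6298 2.3): RTTVAR first, then SRTT. */
	*rttvar = 0.75 * *rttvar + 0.25 * fabs(*srtt - r);
	*srtt = 0.875 * *srtt + 0.125 * r;
	/* RTO = SRTT + 4 * RTTVAR, bounded by the minimum/maximum elsewhere. */
}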
// SPDX-License-Identifier: GPL-2.0-only /* * Syscall interface to knfsd.
* * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ #include <linux/slab.h> #include <linux/namei.h> #include <linux/ctype.h> #include <linux/fs_context.h> #include <linux/sunrpc/svcsock.h> #include <linux/lockd/lockd.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/gss_api.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/svc.h> #include <linux/module.h> #include <linux/fsnotify.h> #include <linux/nfslocalio.h> #include "idmap.h" #include "nfsd.h" #include "cache.h" #include "state.h" #include "netns.h" #include "pnfs.h" #include "filecache.h" #include "trace.h" #include "netlink.h" /* * We have a single directory with several nodes in it. */ enum { NFSD_Root = 1, NFSD_List, NFSD_Export_Stats, NFSD_Export_features, NFSD_Fh, NFSD_FO_UnlockIP, NFSD_FO_UnlockFS, NFSD_Threads, NFSD_Pool_Threads, NFSD_Pool_Stats, NFSD_Reply_Cache_Stats, NFSD_Versions, NFSD_Ports, NFSD_MaxBlkSize, NFSD_Filecache, NFSD_Leasetime, NFSD_Gracetime, NFSD_RecoveryDir, NFSD_V4EndGrace, NFSD_MaxReserved }; /* * write() for these nodes. */ static ssize_t write_filehandle(struct file *file, char *buf, size_t size); static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size); static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size); static ssize_t write_threads(struct file *file, char *buf, size_t size); static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); static ssize_t write_versions(struct file *file, char *buf, size_t size); static ssize_t write_ports(struct file *file, char *buf, size_t size); static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); #ifdef CONFIG_NFSD_V4 static ssize_t write_leasetime(struct file *file, char *buf, size_t size); static ssize_t write_gracetime(struct file *file, char *buf, size_t size); #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); #endif static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size); #endif static ssize_t (*const write_op[])(struct file *, char *, size_t) = { [NFSD_Fh] = write_filehandle, [NFSD_FO_UnlockIP] = write_unlock_ip, [NFSD_FO_UnlockFS] = write_unlock_fs, [NFSD_Threads] = write_threads, [NFSD_Pool_Threads] = write_pool_threads, [NFSD_Versions] = write_versions, [NFSD_Ports] = write_ports, [NFSD_MaxBlkSize] = write_maxblksize, #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = write_leasetime, [NFSD_Gracetime] = write_gracetime, #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING [NFSD_RecoveryDir] = write_recoverydir, #endif [NFSD_V4EndGrace] = write_v4_end_grace, #endif }; static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) { ino_t ino = file_inode(file)->i_ino; char *data; ssize_t rv; if (ino >= ARRAY_SIZE(write_op) || !write_op[ino]) return -EINVAL; data = simple_transaction_get(file, buf, size); if (IS_ERR(data)) return PTR_ERR(data); rv = write_op[ino](file, data, size); if (rv < 0) return rv; simple_transaction_set(file, rv); return size; } static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) { if (! 
file->private_data) { /* An attempt to read a transaction file without writing * causes a 0-byte write so that the file can return * state information */ ssize_t rv = nfsctl_transaction_write(file, buf, 0, pos); if (rv < 0) return rv; } return simple_transaction_read(file, buf, size, pos); } static const struct file_operations transaction_ops = { .write = nfsctl_transaction_write, .read = nfsctl_transaction_read, .release = simple_transaction_release, .llseek = default_llseek, }; static int exports_net_open(struct net *net, struct file *file) { int err; struct seq_file *seq; struct nfsd_net *nn = net_generic(net, nfsd_net_id); err = seq_open(file, &nfs_exports_op); if (err) return err; seq = file->private_data; seq->private = nn->svc_export_cache; return 0; } static int exports_nfsd_open(struct inode *inode, struct file *file) { return exports_net_open(inode->i_sb->s_fs_info, file); } static const struct file_operations exports_nfsd_operations = { .open = exports_nfsd_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static int export_features_show(struct seq_file *m, void *v) { seq_printf(m, "0x%x 0x%x\n", NFSEXP_ALLFLAGS, NFSEXP_SECINFO_FLAGS); return 0; } DEFINE_SHOW_ATTRIBUTE(export_features); static int nfsd_pool_stats_open(struct inode *inode, struct file *file) { struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id); return svc_pool_stats_open(&nn->nfsd_info, file); } static const struct file_operations pool_stats_operations = { .open = nfsd_pool_stats_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; DEFINE_SHOW_ATTRIBUTE(nfsd_reply_cache_stats); DEFINE_SHOW_ATTRIBUTE(nfsd_file_cache_stats); /*----------------------------------------------------------------------------*/ /* * payload - write methods */ static inline struct net *netns(struct file *file) { return file_inode(file)->i_sb->s_fs_info; } /* * write_unlock_ip - Release all locks used by a client * * Experimental. * * Input: * buf: '\n'-terminated C string containing a * presentation format IP address * size: length of C string in @buf * Output: * On success: returns zero if all specified locks were released; * returns one if one or more locks were not released * On error: return code is negative errno value */ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) { struct sockaddr_storage address; struct sockaddr *sap = (struct sockaddr *)&address; size_t salen = sizeof(address); char *fo_path; struct net *net = netns(file); /* sanity check */ if (size == 0) return -EINVAL; if (buf[size-1] != '\n') return -EINVAL; fo_path = buf; if (qword_get(&buf, fo_path, size) < 0) return -EINVAL; if (rpc_pton(net, fo_path, size, sap, salen) == 0) return -EINVAL; trace_nfsd_ctl_unlock_ip(net, buf); return nlmsvc_unlock_all_by_ip(sap); } /* * write_unlock_fs - Release all locks on a local file system * * Experimental. 
* * Input: * buf: '\n'-terminated C string containing the * absolute pathname of a local file system * size: length of C string in @buf * Output: * On success: returns zero if all specified locks were released; * returns one if one or more locks were not released * On error: return code is negative errno value */ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) { struct path path; char *fo_path; int error; /* sanity check */ if (size == 0) return -EINVAL; if (buf[size-1] != '\n') return -EINVAL; fo_path = buf; if (qword_get(&buf, fo_path, size) < 0) return -EINVAL; trace_nfsd_ctl_unlock_fs(netns(file), fo_path); error = kern_path(fo_path, 0, &path); if (error) return error; /* * XXX: Needs better sanity checking. Otherwise we could end up * releasing locks on the wrong file system. * * For example: * 1. Does the path refer to a directory? * 2. Is that directory a mount point, or * 3. Is that directory the root of an exported file system? */ error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb); nfsd4_revoke_states(netns(file), path.dentry->d_sb); path_put(&path); return error; } /* * write_filehandle - Get a variable-length NFS file handle by path * * On input, the buffer contains a '\n'-terminated C string comprised of * three alphanumeric words separated by whitespace. The string may * contain escape sequences. * * Input: * buf: * domain: client domain name * path: export pathname * maxsize: numeric maximum size of * @buf * size: length of C string in @buf * Output: * On success: passed-in buffer filled with '\n'-terminated C * string containing a ASCII hex text version * of the NFS file handle; * return code is the size in bytes of the string * On error: return code is negative errno value */ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) { char *dname, *path; int maxsize; char *mesg = buf; int len; struct auth_domain *dom; struct knfsd_fh fh; if (size == 0) return -EINVAL; if (buf[size-1] != '\n') return -EINVAL; buf[size-1] = 0; dname = mesg; len = qword_get(&mesg, dname, size); if (len <= 0) return -EINVAL; path = dname+len+1; len = qword_get(&mesg, path, size); if (len <= 0) return -EINVAL; len = get_int(&mesg, &maxsize); if (len) return len; if (maxsize < NFS_FHSIZE) return -EINVAL; maxsize = min(maxsize, NFS3_FHSIZE); if (qword_get(&mesg, mesg, size) > 0) return -EINVAL; trace_nfsd_ctl_filehandle(netns(file), dname, path, maxsize); /* we have all the words, they are in buf.. 
*/ dom = unix_domain_find(dname); if (!dom) return -ENOMEM; len = exp_rootfh(netns(file), dom, path, &fh, maxsize); auth_domain_put(dom); if (len) return len; mesg = buf; len = SIMPLE_TRANSACTION_LIMIT; qword_addhex(&mesg, &len, fh.fh_raw, fh.fh_size); mesg[-1] = '\n'; return mesg - buf; } /* * write_threads - Start NFSD, or report the current number of running threads * * Input: * buf: ignored * size: zero * Output: * On success: passed-in buffer filled with '\n'-terminated C * string numeric value representing the number of * running NFSD threads; * return code is the size in bytes of the string * On error: return code is zero * * OR * * Input: * buf: C string containing an unsigned * integer value representing the * number of NFSD threads to start * size: non-zero length of C string in @buf * Output: * On success: NFS service is started; * passed-in buffer filled with '\n'-terminated C * string numeric value representing the number of * running NFSD threads; * return code is the size in bytes of the string * On error: return code is zero or a negative errno value */ static ssize_t write_threads(struct file *file, char *buf, size_t size) { char *mesg = buf; int rv; struct net *net = netns(file); if (size > 0) { int newthreads; rv = get_int(&mesg, &newthreads); if (rv) return rv; if (newthreads < 0) return -EINVAL; trace_nfsd_ctl_threads(net, newthreads); mutex_lock(&nfsd_mutex); rv = nfsd_svc(1, &newthreads, net, file->f_cred, NULL); mutex_unlock(&nfsd_mutex); if (rv < 0) return rv; } else rv = nfsd_nrthreads(net); return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv); } /* * write_pool_threads - Set or report the current number of threads per pool * * Input: * buf: ignored * size: zero * * OR * * Input: * buf: C string containing whitespace- * separated unsigned integer values * representing the number of NFSD * threads to start in each pool * size: non-zero length of C string in @buf * Output: * On success: passed-in buffer filled with '\n'-terminated C * string containing integer values representing the * number of NFSD threads in each pool; * return code is the size in bytes of the string * On error: return code is zero or a negative errno value */ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) { /* if size > 0, look for an array of number of threads per node * and apply them then write out number of threads per node as reply */ char *mesg = buf; int i; int rv; int len; int npools; int *nthreads; struct net *net = netns(file); mutex_lock(&nfsd_mutex); npools = nfsd_nrpools(net); if (npools == 0) { /* * NFS is shut down. The admin can start it by * writing to the threads file but NOT the pool_threads * file, sorry. Report zero threads. */ mutex_unlock(&nfsd_mutex); strcpy(buf, "0\n"); return strlen(buf); } nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL); rv = -ENOMEM; if (nthreads == NULL) goto out_free; if (size > 0) { for (i = 0; i < npools; i++) { rv = get_int(&mesg, &nthreads[i]); if (rv == -ENOENT) break; /* fewer numbers than pools */ if (rv) goto out_free; /* syntax error */ rv = -EINVAL; if (nthreads[i] < 0) goto out_free; trace_nfsd_ctl_pool_threads(net, i, nthreads[i]); } /* * There must always be a thread in pool 0; the admin * can't shut down NFS completely using pool_threads. 
*/ if (nthreads[0] == 0) nthreads[0] = 1; rv = nfsd_set_nrthreads(i, nthreads, net); if (rv) goto out_free; } rv = nfsd_get_nrthreads(npools, nthreads, net); if (rv) goto out_free; mesg = buf; size = SIMPLE_TRANSACTION_LIMIT; for (i = 0; i < npools && size > 0; i++) { snprintf(mesg, size, "%d%c", nthreads[i], (i == npools-1 ? '\n' : ' ')); len = strlen(mesg); size -= len; mesg += len; } rv = mesg - buf; out_free: kfree(nthreads); mutex_unlock(&nfsd_mutex); return rv; } static ssize_t nfsd_print_version_support(struct nfsd_net *nn, char *buf, int remaining, const char *sep, unsigned vers, int minor) { const char *format = minor < 0 ? "%s%c%u" : "%s%c%u.%u"; bool supported = !!nfsd_vers(nn, vers, NFSD_TEST); if (vers == 4 && minor >= 0 && !nfsd_minorversion(nn, minor, NFSD_TEST)) supported = false; if (minor == 0 && supported) /* * special case for backward compatability. * +4.0 is never reported, it is implied by * +4, unless -4.0 is present. */ return 0; return snprintf(buf, remaining, format, sep, supported ? '+' : '-', vers, minor); } static ssize_t __write_versions(struct file *file, char *buf, size_t size) { char *mesg = buf; char *vers, *minorp, sign; int len, num, remaining; ssize_t tlen = 0; char *sep; struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); if (size > 0) { if (nn->nfsd_serv) /* Cannot change versions without updating * nn->nfsd_serv->sv_xdrsize, and reallocing * rq_argp and rq_resp */ return -EBUSY; if (buf[size-1] != '\n') return -EINVAL; buf[size-1] = 0; trace_nfsd_ctl_version(netns(file), buf); vers = mesg; len = qword_get(&mesg, vers, size); if (len <= 0) return -EINVAL; do { enum vers_op cmd; unsigned minor; sign = *vers; if (sign == '+' || sign == '-') num = simple_strtol((vers+1), &minorp, 0); else num = simple_strtol(vers, &minorp, 0); if (*minorp == '.') { if (num != 4) return -EINVAL; if (kstrtouint(minorp+1, 0, &minor) < 0) return -EINVAL; } cmd = sign == '-' ? NFSD_CLEAR : NFSD_SET; switch(num) { #ifdef CONFIG_NFSD_V2 case 2: #endif case 3: nfsd_vers(nn, num, cmd); break; case 4: if (*minorp == '.') { if (nfsd_minorversion(nn, minor, cmd) < 0) return -EINVAL; } else if ((cmd == NFSD_SET) != nfsd_vers(nn, num, NFSD_TEST)) { /* * Either we have +4 and no minors are enabled, * or we have -4 and at least one minor is enabled. * In either case, propagate 'cmd' to all minors. 
*/ minor = 0; while (nfsd_minorversion(nn, minor, cmd) >= 0) minor++; } break; default: /* Ignore requests to disable non-existent versions */ if (cmd == NFSD_SET) return -EINVAL; } vers += len + 1; } while ((len = qword_get(&mesg, vers, size)) > 0); /* If all get turned off, turn them back on, as * having no versions is BAD */ nfsd_reset_versions(nn); } /* Now write current state into reply buffer */ sep = ""; remaining = SIMPLE_TRANSACTION_LIMIT; for (num=2 ; num <= 4 ; num++) { int minor; if (!nfsd_vers(nn, num, NFSD_AVAIL)) continue; minor = -1; do { len = nfsd_print_version_support(nn, buf, remaining, sep, num, minor); if (len >= remaining) goto out; remaining -= len; buf += len; tlen += len; minor++; if (len) sep = " "; } while (num == 4 && minor <= NFSD_SUPPORTED_MINOR_VERSION); } out: len = snprintf(buf, remaining, "\n"); if (len >= remaining) return -EINVAL; return tlen + len; } /* * write_versions - Set or report the available NFS protocol versions * * Input: * buf: ignored * size: zero * Output: * On success: passed-in buffer filled with '\n'-terminated C * string containing positive or negative integer * values representing the current status of each * protocol version; * return code is the size in bytes of the string * On error: return code is zero or a negative errno value * * OR * * Input: * buf: C string containing whitespace- * separated positive or negative * integer values representing NFS * protocol versions to enable ("+n") * or disable ("-n") * size: non-zero length of C string in @buf * Output: * On success: status of zero or more protocol versions has * been updated; passed-in buffer filled with * '\n'-terminated C string containing positive * or negative integer values representing the * current status of each protocol version; * return code is the size in bytes of the string * On error: return code is zero or a negative errno value */ static ssize_t write_versions(struct file *file, char *buf, size_t size) { ssize_t rv; mutex_lock(&nfsd_mutex); rv = __write_versions(file, buf, size); mutex_unlock(&nfsd_mutex); return rv; } /* * Zero-length write. Return a list of NFSD's current listener * transports. */ static ssize_t __write_ports_names(char *buf, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (nn->nfsd_serv == NULL) return 0; return svc_xprt_names(nn->nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT); } /* * A single 'fd' number was written, in which case it must be for * a socket of a supported family/protocol, and we use it as an * nfsd listener. */ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred *cred) { char *mesg = buf; int fd, err; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv; err = get_int(&mesg, &fd); if (err != 0 || fd < 0) return -EINVAL; trace_nfsd_ctl_ports_addfd(net, fd); err = nfsd_create_serv(net); if (err != 0) return err; serv = nn->nfsd_serv; err = svc_addsock(serv, net, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred); if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks)) nfsd_destroy_serv(net); return err; } /* * A transport listener is added by writing its transport name and * a port number. 
*/ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cred *cred) { char transport[16]; struct svc_xprt *xprt; int port, err; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv; if (sscanf(buf, "%15s %5u", transport, &port) != 2) return -EINVAL; if (port < 1 || port > USHRT_MAX) return -EINVAL; trace_nfsd_ctl_ports_addxprt(net, transport, port); err = nfsd_create_serv(net); if (err != 0) return err; serv = nn->nfsd_serv; err = svc_xprt_create(serv, transport, net, PF_INET, port, SVC_SOCK_ANONYMOUS, cred); if (err < 0) goto out_err; err = svc_xprt_create(serv, transport, net, PF_INET6, port, SVC_SOCK_ANONYMOUS, cred); if (err < 0 && err != -EAFNOSUPPORT) goto out_close; return 0; out_close: xprt = svc_find_xprt(serv, transport, net, PF_INET, port); if (xprt != NULL) { svc_xprt_close(xprt); svc_xprt_put(xprt); } out_err: if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks)) nfsd_destroy_serv(net); return err; } static ssize_t __write_ports(struct file *file, char *buf, size_t size, struct net *net) { if (size == 0) return __write_ports_names(buf, net); if (isdigit(buf[0])) return __write_ports_addfd(buf, net, file->f_cred); if (isalpha(buf[0])) return __write_ports_addxprt(buf, net, file->f_cred); return -EINVAL; } /* * write_ports - Pass a socket file descriptor or transport name to listen on * * Input: * buf: ignored * size: zero * Output: * On success: passed-in buffer filled with a '\n'-terminated C * string containing a whitespace-separated list of * named NFSD listeners; * return code is the size in bytes of the string * On error: return code is zero or a negative errno value * * OR * * Input: * buf: C string containing an unsigned * integer value representing a bound * but unconnected socket that is to be * used as an NFSD listener; listen(3) * must be called for a SOCK_STREAM * socket, otherwise it is ignored * size: non-zero length of C string in @buf * Output: * On success: NFS service is started; * passed-in buffer filled with a '\n'-terminated C * string containing a unique alphanumeric name of * the listener; * return code is the size in bytes of the string * On error: return code is a negative errno value * * OR * * Input: * buf: C string containing a transport * name and an unsigned integer value * representing the port to listen on, * separated by whitespace * size: non-zero length of C string in @buf * Output: * On success: returns zero; NFS service is started * On error: return code is a negative errno value */ static ssize_t write_ports(struct file *file, char *buf, size_t size) { ssize_t rv; mutex_lock(&nfsd_mutex); rv = __write_ports(file, buf, size, netns(file)); mutex_unlock(&nfsd_mutex); return rv; } int nfsd_max_blksize; /* * write_maxblksize - Set or report the current NFS blksize * * Input: * buf: ignored * size: zero * * OR * * Input: * buf: C string containing an unsigned * integer value representing the new * NFS blksize * size: non-zero length of C string in @buf * Output: * On success: passed-in buffer filled with '\n'-terminated C string * containing numeric value of the current NFS blksize * setting; * return code is the size in bytes of the string * On error: return code is zero or a negative errno value */ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) { char *mesg = buf; struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); if (size > 0) { int bsize; int rv = get_int(&mesg, &bsize); if (rv) return rv; trace_nfsd_ctl_maxblksize(netns(file), bsize); /* force 
bsize into allowed range and * required alignment. */ bsize = max_t(int, bsize, 1024); bsize = min_t(int, bsize, NFSSVC_MAXBLKSIZE); bsize &= ~(1024-1); mutex_lock(&nfsd_mutex); if (nn->nfsd_serv) { mutex_unlock(&nfsd_mutex); return -EBUSY; } nfsd_max_blksize = bsize; mutex_unlock(&nfsd_mutex); } return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", nfsd_max_blksize); } #ifdef CONFIG_NFSD_V4 static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time64_t *time, struct nfsd_net *nn) { struct dentry *dentry = file_dentry(file); char *mesg = buf; int rv, i; if (size > 0) { if (nn->nfsd_serv) return -EBUSY; rv = get_int(&mesg, &i); if (rv) return rv; trace_nfsd_ctl_time(netns(file), dentry->d_name.name, dentry->d_name.len, i); /* * Some sanity checking. We don't have a reason for * these particular numbers, but problems with the * extremes are: * - Too short: the briefest network outage may * cause clients to lose all their locks. Also, * the frequent polling may be wasteful. * - Too long: do you really want reboot recovery * to take more than an hour? Or to make other * clients wait an hour before being able to * revoke a dead client's locks? */ if (i < 10 || i > 3600) return -EINVAL; *time = i; } return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%lld\n", *time); } static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time64_t *time, struct nfsd_net *nn) { ssize_t rv; mutex_lock(&nfsd_mutex); rv = __nfsd4_write_time(file, buf, size, time, nn); mutex_unlock(&nfsd_mutex); return rv; } /* * write_leasetime - Set or report the current NFSv4 lease time * * Input: * buf: ignored * size: zero * * OR * * Input: * buf: C string containing an unsigned * integer value representing the new * NFSv4 lease expiry time * size: non-zero length of C string in @buf * Output: * On success: passed-in buffer filled with '\n'-terminated C * string containing unsigned integer value of the * current lease expiry time; * return code is the size in bytes of the string * On error: return code is zero or a negative errno value */ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) { struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn); } /* * write_gracetime - Set or report current NFSv4 grace period time * * As above, but sets the time of the NFSv4 grace period. * * Note this should never be set to less than the *previous* * lease-period time, but we don't try to enforce this. (In the common * case (a new boot), we don't know what the previous lease time was * anyway.) 
*/ static ssize_t write_gracetime(struct file *file, char *buf, size_t size) { struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn); } #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size, struct nfsd_net *nn) { char *mesg = buf; char *recdir; int len, status; if (size > 0) { if (nn->nfsd_serv) return -EBUSY; if (size > PATH_MAX || buf[size-1] != '\n') return -EINVAL; buf[size-1] = 0; recdir = mesg; len = qword_get(&mesg, recdir, size); if (len <= 0) return -EINVAL; trace_nfsd_ctl_recoverydir(netns(file), recdir); status = nfs4_reset_recoverydir(recdir); if (status) return status; } return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n", nfs4_recoverydir()); } /* * write_recoverydir - Set or report the pathname of the recovery directory * * Input: * buf: ignored * size: zero * * OR * * Input: * buf: C string containing the pathname * of the directory on a local file * system containing permanent NFSv4 * recovery data * size: non-zero length of C string in @buf * Output: * On success: passed-in buffer filled with '\n'-terminated C string * containing the current recovery pathname setting; * return code is the size in bytes of the string * On error: return code is zero or a negative errno value */ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) { ssize_t rv; struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); mutex_lock(&nfsd_mutex); rv = __write_recoverydir(file, buf, size, nn); mutex_unlock(&nfsd_mutex); return rv; } #endif /* * write_v4_end_grace - release grace period for nfsd's v4.x lock manager * * Input: * buf: ignored * size: zero * OR * * Input: * buf: any value * size: non-zero length of C string in @buf * Output: * passed-in buffer filled with "Y" or "N" with a newline * and NULL-terminated C string. This indicates whether * the grace period has ended in the current net * namespace. Return code is the size in bytes of the * string. Writing a string that starts with 'Y', 'y', or * '1' to the file will end the grace period for nfsd's v4 * lock manager. */ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) { struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); if (size > 0) { switch(buf[0]) { case 'Y': case 'y': case '1': if (!nn->nfsd_serv) return -EBUSY; trace_nfsd_end_grace(netns(file)); nfsd4_end_grace(nn); break; default: return -EINVAL; } } return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%c\n", nn->grace_ended ? 'Y' : 'N'); } #endif /*----------------------------------------------------------------------------*/ /* * populating the filesystem. 
*/ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode) { struct inode *inode = new_inode(sb); if (inode) { /* Following advice from simple_fill_super documentation: */ inode->i_ino = iunique(sb, NFSD_MaxReserved); inode->i_mode = mode; simple_inode_init_ts(inode); } return inode; } static struct dentry *nfsd_mkdir(struct dentry *parent, struct nfsdfs_client *ncl, char *name) { struct inode *dir = parent->d_inode; struct dentry *dentry; struct inode *inode; inode = nfsd_get_inode(parent->d_sb, S_IFDIR | 0600); if (!inode) return ERR_PTR(-ENOMEM); dentry = simple_start_creating(parent, name); if (IS_ERR(dentry)) { iput(inode); return dentry; } inode->i_fop = &simple_dir_operations; inode->i_op = &simple_dir_inode_operations; inc_nlink(inode); if (ncl) { inode->i_private = ncl; kref_get(&ncl->cl_ref); } d_instantiate(dentry, inode); inc_nlink(dir); fsnotify_mkdir(dir, dentry); inode_unlock(dir); return dentry; } #if IS_ENABLED(CONFIG_SUNRPC_GSS) /* * @content is assumed to be a NUL-terminated string that lives * longer than the symlink itself. */ static void _nfsd_symlink(struct dentry *parent, const char *name, const char *content) { struct inode *dir = parent->d_inode; struct inode *inode; struct dentry *dentry; inode = nfsd_get_inode(dir->i_sb, S_IFLNK | 0777); if (!inode) return; dentry = simple_start_creating(parent, name); if (IS_ERR(dentry)) { iput(inode); return; } inode->i_op = &simple_symlink_inode_operations; inode->i_link = (char *)content; inode->i_size = strlen(content); d_instantiate(dentry, inode); fsnotify_create(dir, dentry); inode_unlock(dir); } #else static inline void _nfsd_symlink(struct dentry *parent, const char *name, const char *content) { } #endif static void clear_ncl(struct dentry *dentry) { struct inode *inode = d_inode(dentry); struct nfsdfs_client *ncl = inode->i_private; spin_lock(&inode->i_lock); inode->i_private = NULL; spin_unlock(&inode->i_lock); kref_put(&ncl->cl_ref, ncl->cl_release); } struct nfsdfs_client *get_nfsdfs_client(struct inode *inode) { struct nfsdfs_client *nc; spin_lock(&inode->i_lock); nc = inode->i_private; if (nc) kref_get(&nc->cl_ref); spin_unlock(&inode->i_lock); return nc; } /* XXX: cut'n'paste from simple_fill_super; figure out if we could share * code instead. */ static int nfsdfs_create_files(struct dentry *root, const struct tree_descr *files, struct nfsdfs_client *ncl, struct dentry **fdentries) { struct inode *dir = d_inode(root); struct dentry *dentry; for (int i = 0; files->name && files->name[0]; i++, files++) { struct inode *inode = nfsd_get_inode(root->d_sb, S_IFREG | files->mode); if (!inode) return -ENOMEM; dentry = simple_start_creating(root, files->name); if (IS_ERR(dentry)) { iput(inode); return PTR_ERR(dentry); } kref_get(&ncl->cl_ref); inode->i_fop = files->ops; inode->i_private = ncl; d_instantiate(dentry, inode); fsnotify_create(dir, dentry); if (fdentries) fdentries[i] = dentry; inode_unlock(dir); } return 0; } /* on success, returns positive number unique to that client. */ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, struct nfsdfs_client *ncl, u32 id, const struct tree_descr *files, struct dentry **fdentries) { struct dentry *dentry; char name[11]; int ret; sprintf(name, "%u", id); dentry = nfsd_mkdir(nn->nfsd_client_dir, ncl, name); if (IS_ERR(dentry)) /* XXX: tossing errors? 
*/ return NULL; ret = nfsdfs_create_files(dentry, files, ncl, fdentries); if (ret) { nfsd_client_rmdir(dentry); return NULL; } return dentry; } /* Taken from __rpc_rmdir: */ void nfsd_client_rmdir(struct dentry *dentry) { simple_recursive_removal(dentry, clear_ncl); } static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) { struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id); struct dentry *dentry; int ret; static const struct tree_descr nfsd_files[] = { [NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO}, /* Per-export io stats use same ops as exports file */ [NFSD_Export_Stats] = {"export_stats", &exports_nfsd_operations, S_IRUGO}, [NFSD_Export_features] = {"export_features", &export_features_fops, S_IRUGO}, [NFSD_FO_UnlockIP] = {"unlock_ip", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_FO_UnlockFS] = {"unlock_filesystem", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO}, [NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &nfsd_reply_cache_stats_fops, S_IRUGO}, [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_Filecache] = {"filecache", &nfsd_file_cache_stats_fops, S_IRUGO}, #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, #endif [NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO}, #endif /* last one */ {""} }; ret = simple_fill_super(sb, 0x6e667364, nfsd_files); if (ret) return ret; _nfsd_symlink(sb->s_root, "supported_krb5_enctypes", "/proc/net/rpc/gss_krb5_enctypes"); dentry = nfsd_mkdir(sb->s_root, NULL, "clients"); if (IS_ERR(dentry)) return PTR_ERR(dentry); nn->nfsd_client_dir = dentry; return 0; } static int nfsd_fs_get_tree(struct fs_context *fc) { return get_tree_keyed(fc, nfsd_fill_super, get_net(fc->net_ns)); } static void nfsd_fs_free_fc(struct fs_context *fc) { if (fc->s_fs_info) put_net(fc->s_fs_info); } static const struct fs_context_operations nfsd_fs_context_ops = { .free = nfsd_fs_free_fc, .get_tree = nfsd_fs_get_tree, }; static int nfsd_init_fs_context(struct fs_context *fc) { put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(fc->net_ns->user_ns); fc->ops = &nfsd_fs_context_ops; return 0; } static void nfsd_umount(struct super_block *sb) { struct net *net = sb->s_fs_info; nfsd_shutdown_threads(net); kill_litter_super(sb); put_net(net); } static struct file_system_type nfsd_fs_type = { .owner = THIS_MODULE, .name = "nfsd", .init_fs_context = nfsd_init_fs_context, .kill_sb = nfsd_umount, }; MODULE_ALIAS_FS("nfsd"); #ifdef CONFIG_PROC_FS static int exports_proc_open(struct inode *inode, struct file *file) { return exports_net_open(current->nsproxy->net_ns, file); } static const struct proc_ops exports_proc_ops = { .proc_open = exports_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release, }; static int create_proc_exports_entry(void) { struct proc_dir_entry *entry; entry = proc_mkdir("fs/nfs", NULL); if (!entry) return 
-ENOMEM; entry = proc_create("exports", 0, entry, &exports_proc_ops); if (!entry) { remove_proc_entry("fs/nfs", NULL); return -ENOMEM; } return 0; } #else /* CONFIG_PROC_FS */ static int create_proc_exports_entry(void) { return 0; } #endif unsigned int nfsd_net_id; static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, struct netlink_callback *cb, struct nfsd_genl_rqstp *genl_rqstp) { void *hdr; u32 i; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &nfsd_nl_family, 0, NFSD_CMD_RPC_STATUS_GET); if (!hdr) return -ENOBUFS; if (nla_put_be32(skb, NFSD_A_RPC_STATUS_XID, genl_rqstp->rq_xid) || nla_put_u32(skb, NFSD_A_RPC_STATUS_FLAGS, genl_rqstp->rq_flags) || nla_put_u32(skb, NFSD_A_RPC_STATUS_PROG, genl_rqstp->rq_prog) || nla_put_u32(skb, NFSD_A_RPC_STATUS_PROC, genl_rqstp->rq_proc) || nla_put_u8(skb, NFSD_A_RPC_STATUS_VERSION, genl_rqstp->rq_vers) || nla_put_s64(skb, NFSD_A_RPC_STATUS_SERVICE_TIME, ktime_to_us(genl_rqstp->rq_stime), NFSD_A_RPC_STATUS_PAD)) return -ENOBUFS; switch (genl_rqstp->rq_saddr.sa_family) { case AF_INET: { const struct sockaddr_in *s_in, *d_in; s_in = (const struct sockaddr_in *)&genl_rqstp->rq_saddr; d_in = (const struct sockaddr_in *)&genl_rqstp->rq_daddr; if (nla_put_in_addr(skb, NFSD_A_RPC_STATUS_SADDR4, s_in->sin_addr.s_addr) || nla_put_in_addr(skb, NFSD_A_RPC_STATUS_DADDR4, d_in->sin_addr.s_addr) || nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT, s_in->sin_port) || nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT, d_in->sin_port)) return -ENOBUFS; break; } case AF_INET6: { const struct sockaddr_in6 *s_in, *d_in; s_in = (const struct sockaddr_in6 *)&genl_rqstp->rq_saddr; d_in = (const struct sockaddr_in6 *)&genl_rqstp->rq_daddr; if (nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_SADDR6, &s_in->sin6_addr) || nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_DADDR6, &d_in->sin6_addr) || nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT, s_in->sin6_port) || nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT, d_in->sin6_port)) return -ENOBUFS; break; } } for (i = 0; i < genl_rqstp->rq_opcnt; i++) if (nla_put_u32(skb, NFSD_A_RPC_STATUS_COMPOUND_OPS, genl_rqstp->rq_opnum[i])) return -ENOBUFS; genlmsg_end(skb, hdr); return 0; } /** * nfsd_nl_rpc_status_get_dumpit - Handle rpc_status_get dumpit * @skb: reply buffer * @cb: netlink metadata and command arguments * * Returns the size of the reply or a negative errno. */ int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { int i, ret, rqstp_index = 0; struct nfsd_net *nn; mutex_lock(&nfsd_mutex); nn = net_generic(sock_net(skb->sk), nfsd_net_id); if (!nn->nfsd_serv) { ret = -ENODEV; goto out_unlock; } rcu_read_lock(); for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) { struct svc_rqst *rqstp; if (i < cb->args[0]) /* already consumed */ continue; rqstp_index = 0; list_for_each_entry_rcu(rqstp, &nn->nfsd_serv->sv_pools[i].sp_all_threads, rq_all) { struct nfsd_genl_rqstp genl_rqstp; unsigned int status_counter; if (rqstp_index++ < cb->args[1]) /* already consumed */ continue; /* * Acquire rq_status_counter before parsing the rqst * fields. rq_status_counter is set to an odd value in * order to notify the consumers the rqstp fields are * meaningful. 
*/ status_counter = smp_load_acquire(&rqstp->rq_status_counter); if (!(status_counter & 1)) continue; genl_rqstp.rq_xid = rqstp->rq_xid; genl_rqstp.rq_flags = rqstp->rq_flags; genl_rqstp.rq_vers = rqstp->rq_vers; genl_rqstp.rq_prog = rqstp->rq_prog; genl_rqstp.rq_proc = rqstp->rq_proc; genl_rqstp.rq_stime = rqstp->rq_stime; genl_rqstp.rq_opcnt = 0; memcpy(&genl_rqstp.rq_daddr, svc_daddr(rqstp), sizeof(struct sockaddr)); memcpy(&genl_rqstp.rq_saddr, svc_addr(rqstp), sizeof(struct sockaddr)); #ifdef CONFIG_NFSD_V4 if (rqstp->rq_vers == NFS4_VERSION && rqstp->rq_proc == NFSPROC4_COMPOUND) { /* NFSv4 compound */ struct nfsd4_compoundargs *args; int j; args = rqstp->rq_argp; genl_rqstp.rq_opcnt = min_t(u32, args->opcnt, ARRAY_SIZE(genl_rqstp.rq_opnum)); for (j = 0; j < genl_rqstp.rq_opcnt; j++) genl_rqstp.rq_opnum[j] = args->ops[j].opnum; } #endif /* CONFIG_NFSD_V4 */ /* * Acquire rq_status_counter before reporting the rqst * fields to the user. */ if (smp_load_acquire(&rqstp->rq_status_counter) != status_counter) continue; ret = nfsd_genl_rpc_status_compose_msg(skb, cb, &genl_rqstp); if (ret) goto out; } } cb->args[0] = i; cb->args[1] = rqstp_index; ret = skb->len; out: rcu_read_unlock(); out_unlock: mutex_unlock(&nfsd_mutex); return ret; } /** * nfsd_nl_threads_set_doit - set the number of running threads * @skb: reply buffer * @info: netlink metadata and command arguments * * Return 0 on success or a negative errno. */ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) { int *nthreads, nrpools = 0, i, ret = -EOPNOTSUPP, rem; struct net *net = genl_info_net(info); struct nfsd_net *nn = net_generic(net, nfsd_net_id); const struct nlattr *attr; const char *scope = NULL; if (GENL_REQ_ATTR_CHECK(info, NFSD_A_SERVER_THREADS)) return -EINVAL; /* count number of SERVER_THREADS values */ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, info->nlhdr, GENL_HDRLEN, rem) nrpools++; mutex_lock(&nfsd_mutex); nthreads = kcalloc(nrpools, sizeof(int), GFP_KERNEL); if (!nthreads) { ret = -ENOMEM; goto out_unlock; } i = 0; nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, info->nlhdr, GENL_HDRLEN, rem) { nthreads[i++] = nla_get_u32(attr); if (i >= nrpools) break; } if (info->attrs[NFSD_A_SERVER_GRACETIME] || info->attrs[NFSD_A_SERVER_LEASETIME] || info->attrs[NFSD_A_SERVER_SCOPE]) { ret = -EBUSY; if (nn->nfsd_serv && nn->nfsd_serv->sv_nrthreads) goto out_unlock; ret = -EINVAL; attr = info->attrs[NFSD_A_SERVER_GRACETIME]; if (attr) { u32 gracetime = nla_get_u32(attr); if (gracetime < 10 || gracetime > 3600) goto out_unlock; nn->nfsd4_grace = gracetime; } attr = info->attrs[NFSD_A_SERVER_LEASETIME]; if (attr) { u32 leasetime = nla_get_u32(attr); if (leasetime < 10 || leasetime > 3600) goto out_unlock; nn->nfsd4_lease = leasetime; } attr = info->attrs[NFSD_A_SERVER_SCOPE]; if (attr) scope = nla_data(attr); } ret = nfsd_svc(nrpools, nthreads, net, get_current_cred(), scope); if (ret > 0) ret = 0; out_unlock: mutex_unlock(&nfsd_mutex); kfree(nthreads); return ret; } /** * nfsd_nl_threads_get_doit - get the number of running threads * @skb: reply buffer * @info: netlink metadata and command arguments * * Return 0 on success or a negative errno. 
*/ int nfsd_nl_threads_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct nfsd_net *nn = net_generic(net, nfsd_net_id); void *hdr; int err; skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = genlmsg_iput(skb, info); if (!hdr) { err = -EMSGSIZE; goto err_free_msg; } mutex_lock(&nfsd_mutex); err = nla_put_u32(skb, NFSD_A_SERVER_GRACETIME, nn->nfsd4_grace) || nla_put_u32(skb, NFSD_A_SERVER_LEASETIME, nn->nfsd4_lease) || nla_put_string(skb, NFSD_A_SERVER_SCOPE, nn->nfsd_name); if (err) goto err_unlock; if (nn->nfsd_serv) { int i; for (i = 0; i < nfsd_nrpools(net); ++i) { struct svc_pool *sp = &nn->nfsd_serv->sv_pools[i]; err = nla_put_u32(skb, NFSD_A_SERVER_THREADS, sp->sp_nrthreads); if (err) goto err_unlock; } } else { err = nla_put_u32(skb, NFSD_A_SERVER_THREADS, 0); if (err) goto err_unlock; } mutex_unlock(&nfsd_mutex); genlmsg_end(skb, hdr); return genlmsg_reply(skb, info); err_unlock: mutex_unlock(&nfsd_mutex); err_free_msg: nlmsg_free(skb); return err; } /** * nfsd_nl_version_set_doit - set the nfs enabled versions * @skb: reply buffer * @info: netlink metadata and command arguments * * Return 0 on success or a negative errno. */ int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info) { const struct nlattr *attr; struct nfsd_net *nn; int i, rem; if (GENL_REQ_ATTR_CHECK(info, NFSD_A_SERVER_PROTO_VERSION)) return -EINVAL; mutex_lock(&nfsd_mutex); nn = net_generic(genl_info_net(info), nfsd_net_id); if (nn->nfsd_serv) { mutex_unlock(&nfsd_mutex); return -EBUSY; } /* clear current supported versions. */ nfsd_vers(nn, 2, NFSD_CLEAR); nfsd_vers(nn, 3, NFSD_CLEAR); for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) nfsd_minorversion(nn, i, NFSD_CLEAR); nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_PROTO_VERSION, info->nlhdr, GENL_HDRLEN, rem) { struct nlattr *tb[NFSD_A_VERSION_MAX + 1]; u32 major, minor = 0; bool enabled; if (nla_parse_nested(tb, NFSD_A_VERSION_MAX, attr, nfsd_version_nl_policy, info->extack) < 0) continue; if (!tb[NFSD_A_VERSION_MAJOR]) continue; major = nla_get_u32(tb[NFSD_A_VERSION_MAJOR]); if (tb[NFSD_A_VERSION_MINOR]) minor = nla_get_u32(tb[NFSD_A_VERSION_MINOR]); enabled = nla_get_flag(tb[NFSD_A_VERSION_ENABLED]); switch (major) { case 4: nfsd_minorversion(nn, minor, enabled ? NFSD_SET : NFSD_CLEAR); break; case 3: case 2: if (!minor) nfsd_vers(nn, major, enabled ? NFSD_SET : NFSD_CLEAR); break; default: break; } } mutex_unlock(&nfsd_mutex); return 0; } /** * nfsd_nl_version_get_doit - get the enabled status for all supported nfs versions * @skb: reply buffer * @info: netlink metadata and command arguments * * Return 0 on success or a negative errno. 
*/ int nfsd_nl_version_get_doit(struct sk_buff *skb, struct genl_info *info) { struct nfsd_net *nn; int i, err; void *hdr; skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = genlmsg_iput(skb, info); if (!hdr) { err = -EMSGSIZE; goto err_free_msg; } mutex_lock(&nfsd_mutex); nn = net_generic(genl_info_net(info), nfsd_net_id); for (i = 2; i <= 4; i++) { int j; for (j = 0; j <= NFSD_SUPPORTED_MINOR_VERSION; j++) { struct nlattr *attr; /* Don't record any versions the kernel doesn't have * compiled in */ if (!nfsd_support_version(i)) continue; /* NFSv{2,3} does not support minor numbers */ if (i < 4 && j) continue; attr = nla_nest_start(skb, NFSD_A_SERVER_PROTO_VERSION); if (!attr) { err = -EINVAL; goto err_nfsd_unlock; } if (nla_put_u32(skb, NFSD_A_VERSION_MAJOR, i) || nla_put_u32(skb, NFSD_A_VERSION_MINOR, j)) { err = -EINVAL; goto err_nfsd_unlock; } /* Set the enabled flag if the version is enabled */ if (nfsd_vers(nn, i, NFSD_TEST) && (i < 4 || nfsd_minorversion(nn, j, NFSD_TEST)) && nla_put_flag(skb, NFSD_A_VERSION_ENABLED)) { err = -EINVAL; goto err_nfsd_unlock; } nla_nest_end(skb, attr); } } mutex_unlock(&nfsd_mutex); genlmsg_end(skb, hdr); return genlmsg_reply(skb, info); err_nfsd_unlock: mutex_unlock(&nfsd_mutex); err_free_msg: nlmsg_free(skb); return err; } /** * nfsd_nl_listener_set_doit - set the nfs running sockets * @skb: reply buffer * @info: netlink metadata and command arguments * * Return 0 on success or a negative errno. */ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct svc_xprt *xprt, *tmp; const struct nlattr *attr; struct svc_serv *serv; LIST_HEAD(permsocks); struct nfsd_net *nn; bool delete = false; int err, rem; mutex_lock(&nfsd_mutex); err = nfsd_create_serv(net); if (err) { mutex_unlock(&nfsd_mutex); return err; } nn = net_generic(net, nfsd_net_id); serv = nn->nfsd_serv; spin_lock_bh(&serv->sv_lock); /* Move all of the old listener sockets to a temp list */ list_splice_init(&serv->sv_permsocks, &permsocks); /* * Walk the list of server_socks from userland and move any that match * back to sv_permsocks */ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, GENL_HDRLEN, rem) { struct nlattr *tb[NFSD_A_SOCK_MAX + 1]; const char *xcl_name; struct sockaddr *sa; if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr, nfsd_sock_nl_policy, info->extack) < 0) continue; if (!tb[NFSD_A_SOCK_ADDR] || !tb[NFSD_A_SOCK_TRANSPORT_NAME]) continue; if (nla_len(tb[NFSD_A_SOCK_ADDR]) < sizeof(*sa)) continue; xcl_name = nla_data(tb[NFSD_A_SOCK_TRANSPORT_NAME]); sa = nla_data(tb[NFSD_A_SOCK_ADDR]); /* Put back any matching sockets */ list_for_each_entry_safe(xprt, tmp, &permsocks, xpt_list) { /* This shouldn't be possible */ if (WARN_ON_ONCE(xprt->xpt_net != net)) { list_move(&xprt->xpt_list, &serv->sv_permsocks); continue; } /* If everything matches, put it back */ if (!strcmp(xprt->xpt_class->xcl_name, xcl_name) && rpc_cmp_addr_port(sa, (struct sockaddr *)&xprt->xpt_local)) { list_move(&xprt->xpt_list, &serv->sv_permsocks); break; } } } /* * If there are listener transports remaining on the permsocks list, * it means we were asked to remove a listener. */ if (!list_empty(&permsocks)) { list_splice_init(&permsocks, &serv->sv_permsocks); delete = true; } spin_unlock_bh(&serv->sv_lock); /* Do not remove listeners while there are active threads. 
*/ if (serv->sv_nrthreads) { err = -EBUSY; goto out_unlock_mtx; } /* * Since we can't delete an arbitrary llist entry, destroy the * remaining listeners and recreate the list. */ if (delete) svc_xprt_destroy_all(serv, net, false); /* walk list of addrs again, open any that still don't exist */ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, GENL_HDRLEN, rem) { struct nlattr *tb[NFSD_A_SOCK_MAX + 1]; const char *xcl_name; struct sockaddr *sa; int ret; if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr, nfsd_sock_nl_policy, info->extack) < 0) continue; if (!tb[NFSD_A_SOCK_ADDR] || !tb[NFSD_A_SOCK_TRANSPORT_NAME]) continue; if (nla_len(tb[NFSD_A_SOCK_ADDR]) < sizeof(*sa)) continue; xcl_name = nla_data(tb[NFSD_A_SOCK_TRANSPORT_NAME]); sa = nla_data(tb[NFSD_A_SOCK_ADDR]); xprt = svc_find_listener(serv, xcl_name, net, sa); if (xprt) { if (delete) WARN_ONCE(1, "Transport type=%s already exists\n", xcl_name); svc_xprt_put(xprt); continue; } ret = svc_xprt_create_from_sa(serv, xcl_name, net, sa, 0, get_current_cred()); /* always save the latest error */ if (ret < 0) err = ret; } if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks)) nfsd_destroy_serv(net); out_unlock_mtx: mutex_unlock(&nfsd_mutex); return err; } /** * nfsd_nl_listener_get_doit - get the nfs running listeners * @skb: reply buffer * @info: netlink metadata and command arguments * * Return 0 on success or a negative errno. */ int nfsd_nl_listener_get_doit(struct sk_buff *skb, struct genl_info *info) { struct svc_xprt *xprt; struct svc_serv *serv; struct nfsd_net *nn; void *hdr; int err; skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = genlmsg_iput(skb, info); if (!hdr) { err = -EMSGSIZE; goto err_free_msg; } mutex_lock(&nfsd_mutex); nn = net_generic(genl_info_net(info), nfsd_net_id); /* no nfs server? Just send empty socket list */ if (!nn->nfsd_serv) goto out_unlock_mtx; serv = nn->nfsd_serv; spin_lock_bh(&serv->sv_lock); list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) { struct nlattr *attr; attr = nla_nest_start(skb, NFSD_A_SERVER_SOCK_ADDR); if (!attr) { err = -EINVAL; goto err_serv_unlock; } if (nla_put_string(skb, NFSD_A_SOCK_TRANSPORT_NAME, xprt->xpt_class->xcl_name) || nla_put(skb, NFSD_A_SOCK_ADDR, sizeof(struct sockaddr_storage), &xprt->xpt_local)) { err = -EINVAL; goto err_serv_unlock; } nla_nest_end(skb, attr); } spin_unlock_bh(&serv->sv_lock); out_unlock_mtx: mutex_unlock(&nfsd_mutex); genlmsg_end(skb, hdr); return genlmsg_reply(skb, info); err_serv_unlock: spin_unlock_bh(&serv->sv_lock); mutex_unlock(&nfsd_mutex); err_free_msg: nlmsg_free(skb); return err; } /** * nfsd_nl_pool_mode_set_doit - set the number of running threads * @skb: reply buffer * @info: netlink metadata and command arguments * * Return 0 on success or a negative errno. */ int nfsd_nl_pool_mode_set_doit(struct sk_buff *skb, struct genl_info *info) { const struct nlattr *attr; if (GENL_REQ_ATTR_CHECK(info, NFSD_A_POOL_MODE_MODE)) return -EINVAL; attr = info->attrs[NFSD_A_POOL_MODE_MODE]; return sunrpc_set_pool_mode(nla_data(attr)); } /** * nfsd_nl_pool_mode_get_doit - get info about pool_mode * @skb: reply buffer * @info: netlink metadata and command arguments * * Return 0 on success or a negative errno. 
*/ int nfsd_nl_pool_mode_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); char buf[16]; void *hdr; int err; if (sunrpc_get_pool_mode(buf, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) return -ERANGE; skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; err = -EMSGSIZE; hdr = genlmsg_iput(skb, info); if (!hdr) goto err_free_msg; err = nla_put_string(skb, NFSD_A_POOL_MODE_MODE, buf) | nla_put_u32(skb, NFSD_A_POOL_MODE_NPOOLS, nfsd_nrpools(net)); if (err) goto err_free_msg; genlmsg_end(skb, hdr); return genlmsg_reply(skb, info); err_free_msg: nlmsg_free(skb); return err; } /** * nfsd_net_init - Prepare the nfsd_net portion of a new net namespace * @net: a freshly-created network namespace * * This information stays around as long as the network namespace is * alive whether or not there is an NFSD instance running in the * namespace. * * Returns zero on success, or a negative errno otherwise. */ static __net_init int nfsd_net_init(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int retval; int i; retval = nfsd_export_init(net); if (retval) goto out_export_error; retval = nfsd_idmap_init(net); if (retval) goto out_idmap_error; retval = percpu_counter_init_many(nn->counter, 0, GFP_KERNEL, NFSD_STATS_COUNTERS_NUM); if (retval) goto out_repcache_error; memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats)); nn->nfsd_svcstats.program = &nfsd_programs[0]; if (!nfsd_proc_stat_init(net)) { retval = -ENOMEM; goto out_proc_error; } for (i = 0; i < sizeof(nn->nfsd_versions); i++) nn->nfsd_versions[i] = nfsd_support_version(i); for (i = 0; i < sizeof(nn->nfsd4_minorversions); i++) nn->nfsd4_minorversions[i] = nfsd_support_version(4); nn->nfsd_info.mutex = &nfsd_mutex; nn->nfsd_serv = NULL; nfsd4_init_leases_net(nn); get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); seqlock_init(&nn->writeverf_lock); #if IS_ENABLED(CONFIG_NFS_LOCALIO) spin_lock_init(&nn->local_clients_lock); INIT_LIST_HEAD(&nn->local_clients); #endif return 0; out_proc_error: percpu_counter_destroy_many(nn->counter, NFSD_STATS_COUNTERS_NUM); out_repcache_error: nfsd_idmap_shutdown(net); out_idmap_error: nfsd_export_shutdown(net); out_export_error: return retval; } #if IS_ENABLED(CONFIG_NFS_LOCALIO) /** * nfsd_net_pre_exit - Disconnect localio clients from net namespace * @net: a network namespace that is about to be destroyed * * This invalidates ->net pointers held by localio clients * while they can still safely access nn->counter. 
*/ static __net_exit void nfsd_net_pre_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfs_localio_invalidate_clients(&nn->local_clients, &nn->local_clients_lock); } #endif /** * nfsd_net_exit - Release the nfsd_net portion of a net namespace * @net: a network namespace that is about to be destroyed * */ static __net_exit void nfsd_net_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfsd_proc_stat_shutdown(net); percpu_counter_destroy_many(nn->counter, NFSD_STATS_COUNTERS_NUM); nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); } static struct pernet_operations nfsd_net_ops = { .init = nfsd_net_init, #if IS_ENABLED(CONFIG_NFS_LOCALIO) .pre_exit = nfsd_net_pre_exit, #endif .exit = nfsd_net_exit, .id = &nfsd_net_id, .size = sizeof(struct nfsd_net), }; static int __init init_nfsd(void) { int retval; nfsd_debugfs_init(); retval = nfsd4_init_slabs(); if (retval) return retval; retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; retval = nfsd_drc_slab_create(); if (retval) goto out_free_pnfs; nfsd_lockd_init(); /* lockd->nfsd callbacks */ retval = register_pernet_subsys(&nfsd_net_ops); if (retval < 0) goto out_free_lockd; retval = register_cld_notifier(); if (retval) goto out_free_subsys; retval = nfsd4_create_laundry_wq(); if (retval) goto out_free_cld; retval = register_filesystem(&nfsd_fs_type); if (retval) goto out_free_nfsd4; retval = genl_register_family(&nfsd_nl_family); if (retval) goto out_free_filesystem; retval = create_proc_exports_entry(); if (retval) goto out_free_all; nfsd_localio_ops_init(); return 0; out_free_all: genl_unregister_family(&nfsd_nl_family); out_free_filesystem: unregister_filesystem(&nfsd_fs_type); out_free_nfsd4: nfsd4_destroy_laundry_wq(); out_free_cld: unregister_cld_notifier(); out_free_subsys: unregister_pernet_subsys(&nfsd_net_ops); out_free_lockd: nfsd_lockd_shutdown(); nfsd_drc_slab_free(); out_free_pnfs: nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); nfsd_debugfs_exit(); return retval; } static void __exit exit_nfsd(void) { remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); genl_unregister_family(&nfsd_nl_family); unregister_filesystem(&nfsd_fs_type); nfsd4_destroy_laundry_wq(); unregister_cld_notifier(); unregister_pernet_subsys(&nfsd_net_ops); nfsd_drc_slab_free(); nfsd_lockd_shutdown(); nfsd4_free_slabs(); nfsd4_exit_pnfs(); nfsd_debugfs_exit(); } MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); MODULE_DESCRIPTION("In-kernel NFS server"); MODULE_LICENSE("GPL"); module_init(init_nfsd) module_exit(exit_nfsd)
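/*
 * Example (not part of the kernel sources above): a minimal userspace sketch
 * showing how the /proc/fs/nfsd/nfsv4leasetime transaction file handled by
 * write_leasetime()/__nfsd4_write_time() can be exercised. It assumes the
 * nfsd pseudo-filesystem is mounted at /proc/fs/nfsd; writes are rejected
 * with -EBUSY while nfsd threads are running, and the kernel only accepts
 * values between 10 and 3600 seconds. Reading the file without writing
 * reports the current setting.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#define LEASE_FILE "/proc/fs/nfsd/nfsv4leasetime"

int main(int argc, char **argv)
{
	char buf[32];
	ssize_t n;
	int fd;

	fd = open(LEASE_FILE, O_RDWR);
	if (fd < 0) {
		perror("open " LEASE_FILE);
		return 1;
	}

	if (argc > 1) {
		/* The kernel accepts 10..3600 seconds; reject others early. */
		int secs = atoi(argv[1]);

		if (secs < 10 || secs > 3600) {
			fprintf(stderr, "lease time must be 10..3600 seconds\n");
			close(fd);
			return 1;
		}
		snprintf(buf, sizeof(buf), "%d\n", secs);
		if (write(fd, buf, strlen(buf)) < 0) {
			perror("write");	/* e.g. EBUSY while nfsd runs */
			close(fd);
			return 1;
		}
	}

	/* Reading the same file returns the (possibly updated) setting. */
	n = read(fd, buf, sizeof(buf) - 1);
	if (n < 0) {
		perror("read");
		close(fd);
		return 1;
	}
	buf[n] = '\0';
	printf("nfsv4 lease time: %s", buf);
	close(fd);
	return 0;
}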
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#include "timers.h"
#include "device.h"
#include "peer.h"
#include "queueing.h"
#include "socket.h"

/*
 * - Timer for retransmitting the handshake if we don't hear back after
 *   `REKEY_TIMEOUT + jitter` ms.
 *
 * - Timer for sending empty packet if we have received a packet but after have
 *   not sent one for `KEEPALIVE_TIMEOUT` ms.
 *
 * - Timer for initiating new handshake if we have sent a packet but after have
 *   not received one (even empty) for `(KEEPALIVE_TIMEOUT + REKEY_TIMEOUT) +
 *   jitter` ms.
 *
 * - Timer for zeroing out all ephemeral keys after `(REJECT_AFTER_TIME * 3)` ms
 *   if no new keys have been received.
 *
 * - Timer for, if enabled, sending an empty authenticated packet every user-
 *   specified seconds.
 */

static inline void mod_peer_timer(struct wg_peer *peer,
				  struct timer_list *timer,
				  unsigned long expires)
{
	rcu_read_lock_bh();
	if (likely(netif_running(peer->device->dev) &&
		   !READ_ONCE(peer->is_dead)))
		mod_timer(timer, expires);
	rcu_read_unlock_bh();
}

static void wg_expired_retransmit_handshake(struct timer_list *timer)
{
	struct wg_peer *peer = timer_container_of(peer, timer,
						  timer_retransmit_handshake);

	if (peer->timer_handshake_attempts > MAX_TIMER_HANDSHAKES) {
		pr_debug("%s: Handshake for peer %llu (%pISpfsc) did not complete after %d attempts, giving up\n",
			 peer->device->dev->name, peer->internal_id,
			 &peer->endpoint.addr, (int)MAX_TIMER_HANDSHAKES + 2);

		timer_delete(&peer->timer_send_keepalive);
		/* We drop all packets without a keypair and don't try again,
		 * if we try unsuccessfully for too long to make a handshake.
		 */
		wg_packet_purge_staged_packets(peer);

		/* We set a timer for destroying any residue that might be left
		 * of a partial exchange.
		 */
		if (!timer_pending(&peer->timer_zero_key_material))
			mod_peer_timer(peer, &peer->timer_zero_key_material,
				       jiffies + REJECT_AFTER_TIME * 3 * HZ);
	} else {
		++peer->timer_handshake_attempts;
		pr_debug("%s: Handshake for peer %llu (%pISpfsc) did not complete after %d seconds, retrying (try %d)\n",
			 peer->device->dev->name, peer->internal_id,
			 &peer->endpoint.addr, (int)REKEY_TIMEOUT,
			 peer->timer_handshake_attempts + 1);

		/* We clear the endpoint address src address, in case this is
		 * the cause of trouble.
*/ wg_socket_clear_peer_endpoint_src(peer); wg_packet_send_queued_handshake_initiation(peer, true); } } static void wg_expired_send_keepalive(struct timer_list *timer) { struct wg_peer *peer = timer_container_of(peer, timer, timer_send_keepalive); wg_packet_send_keepalive(peer); if (peer->timer_need_another_keepalive) { peer->timer_need_another_keepalive = false; mod_peer_timer(peer, &peer->timer_send_keepalive, jiffies + KEEPALIVE_TIMEOUT * HZ); } } static void wg_expired_new_handshake(struct timer_list *timer) { struct wg_peer *peer = timer_container_of(peer, timer, timer_new_handshake); pr_debug("%s: Retrying handshake with peer %llu (%pISpfsc) because we stopped hearing back after %d seconds\n", peer->device->dev->name, peer->internal_id, &peer->endpoint.addr, (int)(KEEPALIVE_TIMEOUT + REKEY_TIMEOUT)); /* We clear the endpoint address src address, in case this is the cause * of trouble. */ wg_socket_clear_peer_endpoint_src(peer); wg_packet_send_queued_handshake_initiation(peer, false); } static void wg_expired_zero_key_material(struct timer_list *timer) { struct wg_peer *peer = timer_container_of(peer, timer, timer_zero_key_material); rcu_read_lock_bh(); if (!READ_ONCE(peer->is_dead)) { wg_peer_get(peer); if (!queue_work(peer->device->handshake_send_wq, &peer->clear_peer_work)) /* If the work was already on the queue, we want to drop * the extra reference. */ wg_peer_put(peer); } rcu_read_unlock_bh(); } static void wg_queued_expired_zero_key_material(struct work_struct *work) { struct wg_peer *peer = container_of(work, struct wg_peer, clear_peer_work); pr_debug("%s: Zeroing out all keys for peer %llu (%pISpfsc), since we haven't received a new one in %d seconds\n", peer->device->dev->name, peer->internal_id, &peer->endpoint.addr, (int)REJECT_AFTER_TIME * 3); wg_noise_handshake_clear(&peer->handshake); wg_noise_keypairs_clear(&peer->keypairs); wg_peer_put(peer); } static void wg_expired_send_persistent_keepalive(struct timer_list *timer) { struct wg_peer *peer = timer_container_of(peer, timer, timer_persistent_keepalive); if (likely(peer->persistent_keepalive_interval)) wg_packet_send_keepalive(peer); } /* Should be called after an authenticated data packet is sent. */ void wg_timers_data_sent(struct wg_peer *peer) { if (!timer_pending(&peer->timer_new_handshake)) mod_peer_timer(peer, &peer->timer_new_handshake, jiffies + (KEEPALIVE_TIMEOUT + REKEY_TIMEOUT) * HZ + get_random_u32_below(REKEY_TIMEOUT_JITTER_MAX_JIFFIES)); } /* Should be called after an authenticated data packet is received. */ void wg_timers_data_received(struct wg_peer *peer) { if (likely(netif_running(peer->device->dev))) { if (!timer_pending(&peer->timer_send_keepalive)) mod_peer_timer(peer, &peer->timer_send_keepalive, jiffies + KEEPALIVE_TIMEOUT * HZ); else peer->timer_need_another_keepalive = true; } } /* Should be called after any type of authenticated packet is sent, whether * keepalive, data, or handshake. */ void wg_timers_any_authenticated_packet_sent(struct wg_peer *peer) { timer_delete(&peer->timer_send_keepalive); } /* Should be called after any type of authenticated packet is received, whether * keepalive, data, or handshake. */ void wg_timers_any_authenticated_packet_received(struct wg_peer *peer) { timer_delete(&peer->timer_new_handshake); } /* Should be called after a handshake initiation message is sent. 
*/ void wg_timers_handshake_initiated(struct wg_peer *peer) { mod_peer_timer(peer, &peer->timer_retransmit_handshake, jiffies + REKEY_TIMEOUT * HZ + get_random_u32_below(REKEY_TIMEOUT_JITTER_MAX_JIFFIES)); } /* Should be called after a handshake response message is received and processed * or when getting key confirmation via the first data message. */ void wg_timers_handshake_complete(struct wg_peer *peer) { timer_delete(&peer->timer_retransmit_handshake); peer->timer_handshake_attempts = 0; peer->sent_lastminute_handshake = false; ktime_get_real_ts64(&peer->walltime_last_handshake); } /* Should be called after an ephemeral key is created, which is before sending a * handshake response or after receiving a handshake response. */ void wg_timers_session_derived(struct wg_peer *peer) { mod_peer_timer(peer, &peer->timer_zero_key_material, jiffies + REJECT_AFTER_TIME * 3 * HZ); } /* Should be called before a packet with authentication, whether * keepalive, data, or handshakem is sent, or after one is received. */ void wg_timers_any_authenticated_packet_traversal(struct wg_peer *peer) { if (peer->persistent_keepalive_interval) mod_peer_timer(peer, &peer->timer_persistent_keepalive, jiffies + peer->persistent_keepalive_interval * HZ); } void wg_timers_init(struct wg_peer *peer) { timer_setup(&peer->timer_retransmit_handshake, wg_expired_retransmit_handshake, 0); timer_setup(&peer->timer_send_keepalive, wg_expired_send_keepalive, 0); timer_setup(&peer->timer_new_handshake, wg_expired_new_handshake, 0); timer_setup(&peer->timer_zero_key_material, wg_expired_zero_key_material, 0); timer_setup(&peer->timer_persistent_keepalive, wg_expired_send_persistent_keepalive, 0); INIT_WORK(&peer->clear_peer_work, wg_queued_expired_zero_key_material); peer->timer_handshake_attempts = 0; peer->sent_lastminute_handshake = false; peer->timer_need_another_keepalive = false; } void wg_timers_stop(struct wg_peer *peer) { timer_delete_sync(&peer->timer_retransmit_handshake); timer_delete_sync(&peer->timer_send_keepalive); timer_delete_sync(&peer->timer_new_handshake); timer_delete_sync(&peer->timer_zero_key_material); timer_delete_sync(&peer->timer_persistent_keepalive); flush_work(&peer->clear_peer_work); }
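/*
 * Example (not part of timers.c): a userspace sketch of the deadline
 * arithmetic used by the peer timers above. The constants mirror the usual
 * WireGuard protocol values; their real definitions live in messages.h,
 * which is not shown here, so treat the numbers below as assumptions.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define REKEY_TIMEOUT		5.0	/* seconds */
#define KEEPALIVE_TIMEOUT	10.0	/* seconds */
#define REJECT_AFTER_TIME	180.0	/* seconds */
#define REKEY_JITTER_MAX	(1.0 / 3.0)	/* up to ~333 ms of jitter */

static double jitter(void)
{
	return REKEY_JITTER_MAX * rand() / (double)RAND_MAX;
}

int main(void)
{
	double now = 0.0;

	srand((unsigned)time(NULL));

	/* wg_timers_handshake_initiated(): retransmit if no response arrives. */
	printf("retransmit handshake at t=%.2fs\n",
	       now + REKEY_TIMEOUT + jitter());

	/* wg_timers_data_sent(): if nothing comes back, start a new handshake. */
	printf("new handshake (no reply to data) at t=%.2fs\n",
	       now + KEEPALIVE_TIMEOUT + REKEY_TIMEOUT + jitter());

	/* wg_timers_data_received(): answer with a keepalive if we stay silent. */
	printf("keepalive (after receiving data) at t=%.2fs\n",
	       now + KEEPALIVE_TIMEOUT);

	/* wg_timers_session_derived(): zero ephemeral keys if never rekeyed. */
	printf("zero key material at t=%.2fs\n",
	       now + REJECT_AFTER_TIME * 3);

	return 0;
}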
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/srcu.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"

/*
 * Clear all of the marks on an inode when it is being evicted from core
 */
void __fsnotify_inode_delete(struct inode *inode)
{
	fsnotify_clear_marks_by_inode(inode);
} EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); void __fsnotify_vfsmount_delete(struct vfsmount *mnt) { fsnotify_clear_marks_by_mount(mnt); } void __fsnotify_mntns_delete(struct mnt_namespace *mntns) { fsnotify_clear_marks_by_mntns(mntns); } /** * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. * @sb: superblock being unmounted. * * Called during unmount with no locks held, so needs to be safe against * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. */ static void fsnotify_unmount_inodes(struct super_block *sb) { struct inode *inode, *iput_inode = NULL; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We cannot __iget() an inode in state I_FREEING, * I_WILL_FREE, or I_NEW which is fine because by that point * the inode cannot have any associated watches. */ spin_lock(&inode->i_lock); if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); continue; } /* * If i_count is zero, the inode cannot have any watches and * doing an __iget/iput with SB_ACTIVE clear would actually * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. * However, we should have been called /after/ evict_inodes * removed all zero refcount inodes, in any case. Test to * be sure. */ if (!icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); iput(iput_inode); /* for each watch, send FS_UNMOUNT and then remove it */ fsnotify_inode(inode, FS_UNMOUNT); fsnotify_inode_delete(inode); iput_inode = inode; cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); iput(iput_inode); } void fsnotify_sb_delete(struct super_block *sb) { struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); /* Were any marks ever added to any object on this sb? */ if (!sbinfo) return; fsnotify_unmount_inodes(sb); fsnotify_clear_marks_by_sb(sb); /* Wait for outstanding object references from connectors */ wait_var_event(fsnotify_sb_watched_objects(sb), !atomic_long_read(fsnotify_sb_watched_objects(sb))); WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT)); WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT)); } void fsnotify_sb_free(struct super_block *sb) { kfree(sb->s_fsnotify_info); } /* * Given an inode, first check if we care what happens to our children. Inotify * and dnotify both tell their parents about events. If we care about any event * on a child we run all of our children and set a dentry flag saying that the * parent cares. Thus when an event happens on a child it can quickly tell * if there is a need to find a parent and send the event to the parent. */ void fsnotify_set_children_dentry_flags(struct inode *inode) { struct dentry *alias; if (!S_ISDIR(inode->i_mode)) return; spin_lock(&inode->i_lock); /* run all of the dentries associated with this inode. 
Since this is a * directory, there damn well better only be one item on this list */ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { struct dentry *child; /* run all of the children of the original inode and fix their * d_flags to indicate parental interest (their parent is the * original inode) */ spin_lock(&alias->d_lock); hlist_for_each_entry(child, &alias->d_children, d_sib) { if (!child->d_inode) continue; spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&child->d_lock); } spin_unlock(&alias->d_lock); } spin_unlock(&inode->i_lock); } /* * Lazily clear false positive PARENT_WATCHED flag for child whose parent had * stopped watching children. */ static void fsnotify_clear_child_dentry_flag(struct inode *pinode, struct dentry *dentry) { spin_lock(&dentry->d_lock); /* * d_lock is a sufficient barrier to prevent observing a non-watched * parent state from before the fsnotify_set_children_dentry_flags() * or fsnotify_update_flags() call that had set PARENT_WATCHED. */ if (!fsnotify_inode_watches_children(pinode)) dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&dentry->d_lock); } /* Are inode/sb/mount interested in parent and name info with this event? */ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = 0; /* We only send parent/name to inode/sb/mount for events on non-dir */ if (mask & FS_ISDIR) return false; /* * All events that are possible on child can also may be reported with * parent/name info to inode/sb/mount. Otherwise, a watching parent * could result in events reported with unexpected name info to sb/mount. */ BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT); /* Did either inode/sb/mount subscribe for events with parent/name? */ marks_mask |= fsnotify_parent_needed_mask( READ_ONCE(inode->i_fsnotify_mask)); marks_mask |= fsnotify_parent_needed_mask( READ_ONCE(inode->i_sb->s_fsnotify_mask)); marks_mask |= fsnotify_parent_needed_mask(mnt_mask); /* Did they subscribe for this event with parent/name info? */ return mask & marks_mask; } /* Are there any inode/mount/sb objects that watch for these events? */ static inline __u32 fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = READ_ONCE(inode->i_fsnotify_mask) | mnt_mask | READ_ONCE(inode->i_sb->s_fsnotify_mask); return mask & marks_mask & ALL_FSNOTIFY_EVENTS; } /* Report pre-content event with optional range info */ int fsnotify_pre_content(const struct path *path, const loff_t *ppos, size_t count) { struct file_range range; /* Report page aligned range only when pos is known */ if (!ppos) return fsnotify_path(path, FS_PRE_ACCESS); range.path = path; range.pos = PAGE_ALIGN_DOWN(*ppos); range.count = PAGE_ALIGN(*ppos + count) - range.pos; return fsnotify_parent(path->dentry, FS_PRE_ACCESS, &range, FSNOTIFY_EVENT_FILE_RANGE); } /* * Notify this dentry's parent about a child's events with child name info * if parent is watching or if inode/sb/mount are interested in events with * parent and name info. * * Notify only the child without name info if parent is not watching and * inode/sb/mount are not interested in events with parent and name info. */ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { const struct path *path = fsnotify_data_path(data, data_type); __u32 mnt_mask = path ? 
READ_ONCE(real_mount(path->mnt)->mnt_fsnotify_mask) : 0; struct inode *inode = d_inode(dentry); struct dentry *parent; bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED; bool parent_needed, parent_interested; __u32 p_mask; struct inode *p_inode = NULL; struct name_snapshot name; struct qstr *file_name = NULL; int ret = 0; /* Optimize the likely case of nobody watching this path */ if (likely(!parent_watched && !fsnotify_object_watched(inode, mnt_mask, mask))) return 0; parent = NULL; parent_needed = fsnotify_event_needs_parent(inode, mnt_mask, mask); if (!parent_watched && !parent_needed) goto notify; /* Does parent inode care about events on children? */ parent = dget_parent(dentry); p_inode = parent->d_inode; p_mask = fsnotify_inode_watches_children(p_inode); if (unlikely(parent_watched && !p_mask)) fsnotify_clear_child_dentry_flag(p_inode, dentry); /* * Include parent/name in notification either if some notification * groups require parent info or the parent is interested in this event. */ parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS; if (parent_needed || parent_interested) { /* When notifying parent, child should be passed as data */ WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type)); /* Notify both parent and child with child name info */ take_dentry_name_snapshot(&name, dentry); file_name = &name.name; if (parent_interested) mask |= FS_EVENT_ON_CHILD; } notify: ret = fsnotify(mask, data, data_type, p_inode, file_name, inode, 0); if (file_name) release_dentry_name_snapshot(&name); dput(parent); return ret; } EXPORT_SYMBOL_GPL(__fsnotify_parent); static int fsnotify_handle_inode_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); struct inode *inode = fsnotify_data_inode(data, data_type); const struct fsnotify_ops *ops = group->ops; if (WARN_ON_ONCE(!ops->handle_inode_event)) return 0; if (WARN_ON_ONCE(!inode && !dir)) return 0; if ((inode_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) && path && d_unlinked(path->dentry)) return 0; /* Check interest of this mark in case event was sent with two marks */ if (!(mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS)) return 0; return ops->handle_inode_event(inode_mark, mask, inode, dir, name, cookie); } static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct fsnotify_mark *parent_mark = fsnotify_iter_parent_mark(iter_info); int ret; if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) || WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info))) return 0; /* * For FS_RENAME, 'dir' is old dir and 'data' is new dentry. * The only ->handle_inode_event() backend that supports FS_RENAME is * dnotify, where it means file was renamed within same parent. */ if (mask & FS_RENAME) { struct dentry *moved = fsnotify_data_dentry(data, data_type); if (dir != moved->d_parent->d_inode) return 0; } if (parent_mark) { ret = fsnotify_handle_inode_event(group, parent_mark, mask, data, data_type, dir, name, 0); if (ret) return ret; } if (!inode_mark) return 0; /* * Some events can be sent on both parent dir and child marks (e.g. * FS_ATTRIB). 
If both parent dir and child are watching, report the * event once to parent dir with name (if interested) and once to child * without name (if interested). * * In any case regardless whether the parent is watching or not, the * child watcher is expecting an event without the FS_EVENT_ON_CHILD * flag. The file name is expected if and only if this is a directory * event. */ mask &= ~FS_EVENT_ON_CHILD; if (!(mask & ALL_FSNOTIFY_DIRENT_EVENTS)) { dir = NULL; name = NULL; } return fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type, dir, name, cookie); } static int send_to_group(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_group *group = NULL; __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); __u32 marks_mask = 0; __u32 marks_ignore_mask = 0; bool is_dir = mask & FS_ISDIR; struct fsnotify_mark *mark; int type; if (!iter_info->report_mask) return 0; /* clear ignored on inode modification */ if (mask & FS_MODIFY) { fsnotify_foreach_iter_mark_type(iter_info, mark, type) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) mark->ignore_mask = 0; } } /* Are any of the group marks interested in this event? */ fsnotify_foreach_iter_mark_type(iter_info, mark, type) { group = mark->group; marks_mask |= mark->mask; marks_ignore_mask |= fsnotify_effective_ignore_mask(mark, is_dir, type); } pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", __func__, group, mask, marks_mask, marks_ignore_mask, data, data_type, dir, cookie); if (!(test_mask & marks_mask & ~marks_ignore_mask)) return 0; if (group->ops->handle_event) { return group->ops->handle_event(group, mask, data, data_type, dir, file_name, cookie, iter_info); } return fsnotify_handle_event(group, mask, data, data_type, dir, file_name, cookie, iter_info); } static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector *const *connp) { struct fsnotify_mark_connector *conn; struct hlist_node *node = NULL; conn = srcu_dereference(*connp, &fsnotify_mark_srcu); if (conn) node = srcu_dereference(conn->list.first, &fsnotify_mark_srcu); return hlist_entry_safe(node, struct fsnotify_mark, obj_list); } static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark) { struct hlist_node *node = NULL; if (mark) node = srcu_dereference(mark->obj_list.next, &fsnotify_mark_srcu); return hlist_entry_safe(node, struct fsnotify_mark, obj_list); } /* * iter_info is a multi head priority queue of marks. * Pick a subset of marks from queue heads, all with the same group * and set the report_mask to a subset of the selected marks. * Returns false if there are no more groups to iterate. 
*/ static bool fsnotify_iter_select_report_types( struct fsnotify_iter_info *iter_info) { struct fsnotify_group *max_prio_group = NULL; struct fsnotify_mark *mark; int type; /* Choose max prio group among groups of all queue heads */ fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && fsnotify_compare_groups(max_prio_group, mark->group) > 0) max_prio_group = mark->group; } if (!max_prio_group) return false; /* Set the report mask for marks from same group as max prio group */ iter_info->current_group = max_prio_group; iter_info->report_mask = 0; fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && mark->group == iter_info->current_group) { /* * FSNOTIFY_ITER_TYPE_PARENT indicates that this inode * is watching children and interested in this event, * which is an event possible on child. * But is *this mark* watching children? */ if (type == FSNOTIFY_ITER_TYPE_PARENT && !(mark->mask & FS_EVENT_ON_CHILD) && !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD)) continue; fsnotify_iter_set_report_type(iter_info, type); } } return true; } /* * Pop from iter_info multi head queue, the marks that belong to the group of * current iteration step. */ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *mark; int type; /* * We cannot use fsnotify_foreach_iter_mark_type() here because we * may need to advance a mark of type X that belongs to current_group * but was not selected for reporting. */ fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && mark->group == iter_info->current_group) iter_info->marks[type] = fsnotify_next_mark(iter_info->marks[type]); } } /* * fsnotify - This is the main call to fsnotify. * * The VFS calls into hook specific functions in linux/fsnotify.h. * Those functions then in turn call here. Here will call out to all of the * registered fsnotify_group. Those groups can then use the notification event * in whatever means they feel necessary. * * @mask: event type and flags * @data: object that event happened on * @data_type: type of object for fanotify_data_XXX() accessors * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to * @file_name: optional file name associated with event * @inode: optional inode associated with event - * If @dir and @inode are both non-NULL, event may be * reported to both. * @cookie: inotify rename cookie */ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, struct inode *inode, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); struct super_block *sb = fsnotify_data_sb(data, data_type); const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type); struct fsnotify_sb_info *sbinfo = sb ? fsnotify_sb_info(sb) : NULL; struct fsnotify_iter_info iter_info = {}; struct mount *mnt = NULL; struct inode *inode2 = NULL; struct dentry *moved; int inode2_type; int ret = 0; __u32 test_mask, marks_mask = 0; if (path) mnt = real_mount(path->mnt); if (!inode) { /* Dirent event - report on TYPE_INODE to dir */ inode = dir; /* For FS_RENAME, inode is old_dir and inode2 is new_dir */ if (mask & FS_RENAME) { moved = fsnotify_data_dentry(data, data_type); inode2 = moved->d_parent->d_inode; inode2_type = FSNOTIFY_ITER_TYPE_INODE2; } } else if (mask & FS_EVENT_ON_CHILD) { /* * Event on child - report on TYPE_PARENT to dir if it is * watching children and on TYPE_INODE to child. 
*/ inode2 = dir; inode2_type = FSNOTIFY_ITER_TYPE_PARENT; } /* * Optimization: srcu_read_lock() has a memory barrier which can * be expensive. It protects walking the *_fsnotify_marks lists. * However, if we do not walk the lists, we do not have to do * SRCU because we have no references to any objects and do not * need SRCU to keep them "alive". */ if ((!sbinfo || !sbinfo->sb_marks) && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && (!inode2 || !inode2->i_fsnotify_marks) && (!mnt_data || !mnt_data->ns->n_fsnotify_marks)) return 0; if (sb) marks_mask |= READ_ONCE(sb->s_fsnotify_mask); if (mnt) marks_mask |= READ_ONCE(mnt->mnt_fsnotify_mask); if (inode) marks_mask |= READ_ONCE(inode->i_fsnotify_mask); if (inode2) marks_mask |= READ_ONCE(inode2->i_fsnotify_mask); if (mnt_data) marks_mask |= READ_ONCE(mnt_data->ns->n_fsnotify_mask); /* * If this is a modify event we may need to clear some ignore masks. * In that case, the object with ignore masks will have the FS_MODIFY * event in its mask. * Otherwise, return if none of the marks care about this type of event. */ test_mask = (mask & ALL_FSNOTIFY_EVENTS); if (!(test_mask & marks_mask)) return 0; iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); if (sbinfo) { iter_info.marks[FSNOTIFY_ITER_TYPE_SB] = fsnotify_first_mark(&sbinfo->sb_marks); } if (mnt) { iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); } if (inode) { iter_info.marks[FSNOTIFY_ITER_TYPE_INODE] = fsnotify_first_mark(&inode->i_fsnotify_marks); } if (inode2) { iter_info.marks[inode2_type] = fsnotify_first_mark(&inode2->i_fsnotify_marks); } if (mnt_data) { iter_info.marks[FSNOTIFY_ITER_TYPE_MNTNS] = fsnotify_first_mark(&mnt_data->ns->n_fsnotify_marks); } /* * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark * ignore masks are properly reflected for mount/sb mark notifications. * That's why this traversal is so complicated... */ while (fsnotify_iter_select_report_types(&iter_info)) { ret = send_to_group(mask, data, data_type, dir, file_name, cookie, &iter_info); if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) goto out; fsnotify_iter_next(&iter_info); } ret = 0; out: srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx); return ret; } EXPORT_SYMBOL_GPL(fsnotify); #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* * At open time we check fsnotify_sb_has_priority_watchers(), call the open perm * hook and set the FMODE_NONOTIFY_ mode bits accordignly. * Later, fsnotify permission hooks do not check if there are permission event * watches, but that there were permission event watches at open time. */ int fsnotify_open_perm_and_set_mode(struct file *file) { struct dentry *dentry = file->f_path.dentry, *parent; struct super_block *sb = dentry->d_sb; __u32 mnt_mask, p_mask = 0; /* Is it a file opened by fanotify? */ if (FMODE_FSNOTIFY_NONE(file->f_mode)) return 0; /* * Permission events is a super set of pre-content events, so if there * are no permission event watchers, there are also no pre-content event * watchers and this is implied from the single FMODE_NONOTIFY_PERM bit. */ if (likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT))) { file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); return 0; } /* * OK, there are some permission event watchers. Check if anybody is * watching for permission events on *this* file. 
*/ mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask); p_mask = fsnotify_object_watched(d_inode(dentry), mnt_mask, ALL_FSNOTIFY_PERM_EVENTS); if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) { parent = dget_parent(dentry); p_mask |= fsnotify_inode_watches_children(d_inode(parent)); dput(parent); } /* * Legacy FAN_ACCESS_PERM events have very high performance overhead, * so unlikely to be used in the wild. If they are used there will be * no optimizations at all. */ if (unlikely(p_mask & FS_ACCESS_PERM)) { /* Enable all permission and pre-content events */ file_set_fsnotify_mode(file, 0); goto open_perm; } /* * Pre-content events are only supported on regular files. * If there are pre-content event watchers and no permission access * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that. * That is the common case with HSM service. */ if (d_is_reg(dentry) && (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS)) { file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); goto open_perm; } /* Nobody watching permission and pre-content events on this file */ file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); open_perm: /* * Send open perm events depending on object masks and regardless of * FMODE_NONOTIFY_PERM. */ if (file->f_flags & __FMODE_EXEC && p_mask & FS_OPEN_EXEC_PERM) { int ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM); if (ret) return ret; } if (p_mask & FS_OPEN_PERM) return fsnotify_path(&file->f_path, FS_OPEN_PERM); return 0; } #endif void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt) { struct fsnotify_mnt data = { .ns = ns, .mnt_id = real_mount(mnt)->mnt_id_unique, }; if (WARN_ON_ONCE(!ns)) return; /* * This is an optimization as well as making sure fsnotify_init() has * been called. */ if (!ns->n_fsnotify_marks) return; fsnotify(mask, &data, FSNOTIFY_EVENT_MNT, NULL, NULL, NULL, 0); } static __init int fsnotify_init(void) { int ret; BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) panic("initializing fsnotify_mark_srcu"); fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector, SLAB_PANIC); return 0; } core_initcall(fsnotify_init);
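The fsnotify() entry point above is normally reached through the inline hooks in linux/fsnotify.h rather than called directly. A minimal sketch of what such a hook boils down to for a plain inode event, assuming the FSNOTIFY_EVENT_INODE data type from fsnotify_backend.h (the helper name is illustrative, not a kernel API):

/*
 * Illustrative only: report a data-modify event on an inode, following the
 * calling convention documented for fsnotify() above: @data/@data_type
 * describe the object, @dir and @file_name stay NULL for a non-dirent
 * event, and there is no rename cookie.
 */
static int example_report_modify(struct inode *inode)
{
	return fsnotify(FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
			NULL, NULL, inode, 0);
}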
// SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-2020 Intel Corporation. All rights reserved. */ #include <linux/jump_label.h> #include <linux/uaccess.h> #include <linux/export.h> #include <linux/instrumented.h> #include <linux/string.h> #include <linux/types.h> #include <asm/mce.h> #ifdef CONFIG_X86_MCE static DEFINE_STATIC_KEY_FALSE(copy_mc_fragile_key); void enable_copy_mc_fragile(void) { static_branch_inc(&copy_mc_fragile_key); } #define copy_mc_fragile_enabled (static_branch_unlikely(&copy_mc_fragile_key)) /* * Similar to copy_user_handle_tail, probe for the write fault point, or * source exception point. */ __visible notrace unsigned long copy_mc_fragile_handle_tail(char *to, char *from, unsigned len) { for (; len; --len, to++, from++) if (copy_mc_fragile(to, from, 1)) break; return len; } #else /* * No point in doing careful copying, or consulting a static key when * there is no #MC handler in the CONFIG_X86_MCE=n case. */ void enable_copy_mc_fragile(void) { } #define copy_mc_fragile_enabled (0) #endif unsigned long copy_mc_enhanced_fast_string(void *dst, const void *src, unsigned len); /** * copy_mc_to_kernel - memory copy that handles source exceptions * * @dst: destination address * @src: source address * @len: number of bytes to copy * * Call into the 'fragile' version on systems that benefit from avoiding * corner case poison consumption scenarios. For example, accessing * poison across 2 cachelines with a single instruction. Almost all * other use cases can use copy_mc_enhanced_fast_string() for a fast * recoverable copy, or fall back to plain memcpy. * * Return 0 for success, or number of bytes not copied if there was an * exception. */ unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigned len) { unsigned long ret; if (copy_mc_fragile_enabled) { instrument_memcpy_before(dst, src, len); ret = copy_mc_fragile(dst, src, len); instrument_memcpy_after(dst, src, len, ret); return ret; } if (static_cpu_has(X86_FEATURE_ERMS)) { instrument_memcpy_before(dst, src, len); ret = copy_mc_enhanced_fast_string(dst, src, len); instrument_memcpy_after(dst, src, len, ret); return ret; } memcpy(dst, src, len); return 0; } EXPORT_SYMBOL_GPL(copy_mc_to_kernel); unsigned long __must_check copy_mc_to_user(void __user *dst, const void *src, unsigned len) { unsigned long ret; if (copy_mc_fragile_enabled) { instrument_copy_to_user(dst, src, len); __uaccess_begin(); ret = copy_mc_fragile((__force void *)dst, src, len); __uaccess_end(); return ret; } if (static_cpu_has(X86_FEATURE_ERMS)) { instrument_copy_to_user(dst, src, len); __uaccess_begin(); ret = copy_mc_enhanced_fast_string((__force void *)dst, src, len); __uaccess_end(); return ret; } return copy_user_generic((__force void *)dst, src, len); }
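A short usage sketch for copy_mc_to_kernel(); the caller and buffer names are made up. The point it illustrates is the return convention documented above: zero on success, otherwise the number of bytes that were not copied, so a non-zero result means the tail of the destination is not valid.

/* Illustrative caller, e.g. reading from a possibly-poisoned source buffer. */
static int example_safe_copy(void *dst, const void *src, unsigned len)
{
	unsigned long rem;

	rem = copy_mc_to_kernel(dst, src, len);
	if (rem)
		return -EIO;	/* only len - rem leading bytes of dst are valid */
	return 0;
}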
// SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2022 * * This file is part of the SCTP kernel implementation * * These functions manipulate sctp stream queue/scheduling. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Xin Long <lucien.xin@gmail.com> */ #include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> /* Fair Capacity and Weighted Fair Queueing handling * RFC 8260 section 3.5 and 3.6 */ static void sctp_sched_fc_unsched_all(struct sctp_stream *stream); static int sctp_sched_wfq_set(struct sctp_stream *stream, __u16 sid, __u16 weight, gfp_t gfp) { struct sctp_stream_out_ext *soute = SCTP_SO(stream, sid)->ext; if (!weight) return -EINVAL; soute->fc_weight = weight; return 0; } static int sctp_sched_wfq_get(struct sctp_stream *stream, __u16 sid, __u16 *value) { struct sctp_stream_out_ext *soute = SCTP_SO(stream, sid)->ext; *value = soute->fc_weight; return 0; } static int sctp_sched_fc_set(struct sctp_stream *stream, __u16 sid, __u16 weight, gfp_t gfp) { return 0; } static int sctp_sched_fc_get(struct sctp_stream *stream, __u16 sid, __u16 *value) { return 0; } static int sctp_sched_fc_init(struct sctp_stream *stream) { INIT_LIST_HEAD(&stream->fc_list); return 0; } static int sctp_sched_fc_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { struct sctp_stream_out_ext *soute = SCTP_SO(stream, sid)->ext; INIT_LIST_HEAD(&soute->fc_list); soute->fc_length = 0; soute->fc_weight = 1; return 0; } static void sctp_sched_fc_free_sid(struct sctp_stream *stream, __u16 sid) { } static void sctp_sched_fc_sched(struct sctp_stream *stream, struct sctp_stream_out_ext *soute) { struct sctp_stream_out_ext *pos; if (!list_empty(&soute->fc_list)) return; list_for_each_entry(pos, &stream->fc_list, fc_list) if ((__u64)pos->fc_length * soute->fc_weight >= (__u64)soute->fc_length * pos->fc_weight) break; list_add_tail(&soute->fc_list, &pos->fc_list); } static void sctp_sched_fc_enqueue(struct sctp_outq *q, struct sctp_datamsg *msg) { struct sctp_stream *stream; struct sctp_chunk *ch; __u16 sid; ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); sid = sctp_chunk_stream_no(ch); stream = &q->asoc->stream; sctp_sched_fc_sched(stream, SCTP_SO(stream, sid)->ext); } static struct sctp_chunk *sctp_sched_fc_dequeue(struct sctp_outq *q) { struct sctp_stream *stream = &q->asoc->stream; struct sctp_stream_out_ext *soute; struct sctp_chunk *ch; /* Bail out quickly if queue is empty */ if (list_empty(&q->out_chunk_list)) return NULL; /* Find which chunk is next */
if (stream->out_curr) soute = stream->out_curr->ext; else soute = list_entry(stream->fc_list.next, struct sctp_stream_out_ext, fc_list); ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list); sctp_sched_dequeue_common(q, ch); return ch; } static void sctp_sched_fc_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) { struct sctp_stream *stream = &q->asoc->stream; struct sctp_stream_out_ext *soute, *pos; __u16 sid, i; sid = sctp_chunk_stream_no(ch); soute = SCTP_SO(stream, sid)->ext; /* reduce all fc_lengths by U32_MAX / 4 if the current fc_length overflows. */ if (soute->fc_length > U32_MAX - ch->skb->len) { for (i = 0; i < stream->outcnt; i++) { pos = SCTP_SO(stream, i)->ext; if (!pos) continue; if (pos->fc_length <= (U32_MAX >> 2)) { pos->fc_length = 0; continue; } pos->fc_length -= (U32_MAX >> 2); } } soute->fc_length += ch->skb->len; if (list_empty(&soute->outq)) { list_del_init(&soute->fc_list); return; } pos = soute; list_for_each_entry_continue(pos, &stream->fc_list, fc_list) if ((__u64)pos->fc_length * soute->fc_weight >= (__u64)soute->fc_length * pos->fc_weight) break; list_move_tail(&soute->fc_list, &pos->fc_list); } static void sctp_sched_fc_sched_all(struct sctp_stream *stream) { struct sctp_association *asoc; struct sctp_chunk *ch; asoc = container_of(stream, struct sctp_association, stream); list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { __u16 sid = sctp_chunk_stream_no(ch); if (SCTP_SO(stream, sid)->ext) sctp_sched_fc_sched(stream, SCTP_SO(stream, sid)->ext); } } static void sctp_sched_fc_unsched_all(struct sctp_stream *stream) { struct sctp_stream_out_ext *soute, *tmp; list_for_each_entry_safe(soute, tmp, &stream->fc_list, fc_list) list_del_init(&soute->fc_list); } static const struct sctp_sched_ops sctp_sched_fc = { .set = sctp_sched_fc_set, .get = sctp_sched_fc_get, .init = sctp_sched_fc_init, .init_sid = sctp_sched_fc_init_sid, .free_sid = sctp_sched_fc_free_sid, .enqueue = sctp_sched_fc_enqueue, .dequeue = sctp_sched_fc_dequeue, .dequeue_done = sctp_sched_fc_dequeue_done, .sched_all = sctp_sched_fc_sched_all, .unsched_all = sctp_sched_fc_unsched_all, }; void sctp_sched_ops_fc_init(void) { sctp_sched_ops_register(SCTP_SS_FC, &sctp_sched_fc); } static const struct sctp_sched_ops sctp_sched_wfq = { .set = sctp_sched_wfq_set, .get = sctp_sched_wfq_get, .init = sctp_sched_fc_init, .init_sid = sctp_sched_fc_init_sid, .free_sid = sctp_sched_fc_free_sid, .enqueue = sctp_sched_fc_enqueue, .dequeue = sctp_sched_fc_dequeue, .dequeue_done = sctp_sched_fc_dequeue_done, .sched_all = sctp_sched_fc_sched_all, .unsched_all = sctp_sched_fc_unsched_all, }; void sctp_sched_ops_wfq_init(void) { sctp_sched_ops_register(SCTP_SS_WFQ, &sctp_sched_wfq); }
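Both schedulers above keep fc_list sorted by the ratio fc_length / fc_weight (the stream's "virtual length"), comparing entries by cross-multiplication so no division is needed, and widening to 64 bits so the 32-bit length times 16-bit weight product cannot overflow. A standalone sketch of that predicate, with illustrative names:

/*
 * True when stream A should be served before stream B, i.e. when A has
 * accumulated fewer bytes per unit of weight.
 */
static bool fc_before(__u32 a_len, __u16 a_weight, __u32 b_len, __u16 b_weight)
{
	return (__u64)a_len * b_weight < (__u64)b_len * a_weight;
}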
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2013 Politecnico di Torino, Italy * TORSEC group -- https://security.polito.it * * Author: Roberto Sassu <roberto.sassu@polito.it> * * File: ima_template_lib.c * Library of supported template fields.
*/ #include "ima_template_lib.h" #include <linux/xattr.h> #include <linux/evm.h> static bool ima_template_hash_algo_allowed(u8 algo) { if (algo == HASH_ALGO_SHA1 || algo == HASH_ALGO_MD5) return true; return false; } enum data_formats { DATA_FMT_DIGEST = 0, DATA_FMT_DIGEST_WITH_ALGO, DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO, DATA_FMT_STRING, DATA_FMT_HEX, DATA_FMT_UINT }; enum digest_type { DIGEST_TYPE_IMA, DIGEST_TYPE_VERITY, DIGEST_TYPE__LAST }; #define DIGEST_TYPE_NAME_LEN_MAX 7 /* including NUL */ static const char * const digest_type_name[DIGEST_TYPE__LAST] = { [DIGEST_TYPE_IMA] = "ima", [DIGEST_TYPE_VERITY] = "verity" }; static int ima_write_template_field_data(const void *data, const u32 datalen, enum data_formats datafmt, struct ima_field_data *field_data) { u8 *buf, *buf_ptr; u32 buflen = datalen; if (datafmt == DATA_FMT_STRING) buflen = datalen + 1; buf = kzalloc(buflen, GFP_KERNEL); if (!buf) return -ENOMEM; memcpy(buf, data, datalen); /* * Replace all space characters with underscore for event names and * strings. This avoid that, during the parsing of a measurements list, * filenames with spaces or that end with the suffix ' (deleted)' are * split into multiple template fields (the space is the delimitator * character for measurements lists in ASCII format). */ if (datafmt == DATA_FMT_STRING) { for (buf_ptr = buf; buf_ptr - buf < datalen; buf_ptr++) if (*buf_ptr == ' ') *buf_ptr = '_'; } field_data->data = buf; field_data->len = buflen; return 0; } static void ima_show_template_data_ascii(struct seq_file *m, enum ima_show_type show, enum data_formats datafmt, struct ima_field_data *field_data) { u8 *buf_ptr = field_data->data; u32 buflen = field_data->len; switch (datafmt) { case DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO: case DATA_FMT_DIGEST_WITH_ALGO: buf_ptr = strrchr(field_data->data, ':'); if (buf_ptr != field_data->data) seq_printf(m, "%s", field_data->data); /* skip ':' and '\0' */ buf_ptr += 2; buflen -= buf_ptr - field_data->data; fallthrough; case DATA_FMT_DIGEST: case DATA_FMT_HEX: if (!buflen) break; ima_print_digest(m, buf_ptr, buflen); break; case DATA_FMT_STRING: seq_printf(m, "%s", buf_ptr); break; case DATA_FMT_UINT: switch (field_data->len) { case sizeof(u8): seq_printf(m, "%u", *(u8 *)buf_ptr); break; case sizeof(u16): if (ima_canonical_fmt) seq_printf(m, "%u", le16_to_cpu(*(__le16 *)buf_ptr)); else seq_printf(m, "%u", *(u16 *)buf_ptr); break; case sizeof(u32): if (ima_canonical_fmt) seq_printf(m, "%u", le32_to_cpu(*(__le32 *)buf_ptr)); else seq_printf(m, "%u", *(u32 *)buf_ptr); break; case sizeof(u64): if (ima_canonical_fmt) seq_printf(m, "%llu", le64_to_cpu(*(__le64 *)buf_ptr)); else seq_printf(m, "%llu", *(u64 *)buf_ptr); break; default: break; } break; default: break; } } static void ima_show_template_data_binary(struct seq_file *m, enum ima_show_type show, enum data_formats datafmt, struct ima_field_data *field_data) { u32 len = (show == IMA_SHOW_BINARY_OLD_STRING_FMT) ? strlen(field_data->data) : field_data->len; if (show != IMA_SHOW_BINARY_NO_FIELD_LEN) { u32 field_len = !ima_canonical_fmt ? 
len : (__force u32)cpu_to_le32(len); ima_putc(m, &field_len, sizeof(field_len)); } if (!len) return; ima_putc(m, field_data->data, len); } static void ima_show_template_field_data(struct seq_file *m, enum ima_show_type show, enum data_formats datafmt, struct ima_field_data *field_data) { switch (show) { case IMA_SHOW_ASCII: ima_show_template_data_ascii(m, show, datafmt, field_data); break; case IMA_SHOW_BINARY: case IMA_SHOW_BINARY_NO_FIELD_LEN: case IMA_SHOW_BINARY_OLD_STRING_FMT: ima_show_template_data_binary(m, show, datafmt, field_data); break; default: break; } } void ima_show_template_digest(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_DIGEST, field_data); } void ima_show_template_digest_ng(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_DIGEST_WITH_ALGO, field_data); } void ima_show_template_digest_ngv2(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO, field_data); } void ima_show_template_string(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_STRING, field_data); } void ima_show_template_sig(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_HEX, field_data); } void ima_show_template_buf(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_HEX, field_data); } void ima_show_template_uint(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_UINT, field_data); } /** * ima_parse_buf() - Parses lengths and data from an input buffer * @bufstartp: Buffer start address. * @bufendp: Buffer end address. * @bufcurp: Pointer to remaining (non-parsed) data. * @maxfields: Length of fields array. * @fields: Array containing lengths and pointers of parsed data. * @curfields: Number of array items containing parsed data. * @len_mask: Bitmap (if bit is set, data length should not be parsed). * @enforce_mask: Check if curfields == maxfields and/or bufcurp == bufendp. * @bufname: String identifier of the input buffer. * * Return: 0 on success, -EINVAL on error. 
*/ int ima_parse_buf(void *bufstartp, void *bufendp, void **bufcurp, int maxfields, struct ima_field_data *fields, int *curfields, unsigned long *len_mask, int enforce_mask, char *bufname) { void *bufp = bufstartp; int i; for (i = 0; i < maxfields; i++) { if (len_mask == NULL || !test_bit(i, len_mask)) { if (bufp > (bufendp - sizeof(u32))) break; if (ima_canonical_fmt) fields[i].len = le32_to_cpu(*(__le32 *)bufp); else fields[i].len = *(u32 *)bufp; bufp += sizeof(u32); } if (bufp > (bufendp - fields[i].len)) break; fields[i].data = bufp; bufp += fields[i].len; } if ((enforce_mask & ENFORCE_FIELDS) && i != maxfields) { pr_err("%s: nr of fields mismatch: expected: %d, current: %d\n", bufname, maxfields, i); return -EINVAL; } if ((enforce_mask & ENFORCE_BUFEND) && bufp != bufendp) { pr_err("%s: buf end mismatch: expected: %p, current: %p\n", bufname, bufendp, bufp); return -EINVAL; } if (curfields) *curfields = i; if (bufcurp) *bufcurp = bufp; return 0; } static int ima_eventdigest_init_common(const u8 *digest, u32 digestsize, u8 digest_type, u8 hash_algo, struct ima_field_data *field_data) { /* * digest formats: * - DATA_FMT_DIGEST: digest * - DATA_FMT_DIGEST_WITH_ALGO: <hash algo> + ':' + '\0' + digest, * - DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO: * <digest type> + ':' + <hash algo> + ':' + '\0' + digest, * * where 'DATA_FMT_DIGEST' is the original digest format ('d') * with a hash size limitation of 20 bytes, * where <digest type> is either "ima" or "verity", * where <hash algo> is the hash_algo_name[] string. */ u8 buffer[DIGEST_TYPE_NAME_LEN_MAX + CRYPTO_MAX_ALG_NAME + 2 + IMA_MAX_DIGEST_SIZE] = { 0 }; enum data_formats fmt = DATA_FMT_DIGEST; u32 offset = 0; if (digest_type < DIGEST_TYPE__LAST && hash_algo < HASH_ALGO__LAST) { fmt = DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO; offset += 1 + sprintf(buffer, "%s:%s:", digest_type_name[digest_type], hash_algo_name[hash_algo]); } else if (hash_algo < HASH_ALGO__LAST) { fmt = DATA_FMT_DIGEST_WITH_ALGO; offset += 1 + sprintf(buffer, "%s:", hash_algo_name[hash_algo]); } if (digest) { memcpy(buffer + offset, digest, digestsize); } else { /* * If digest is NULL, the event being recorded is a violation. * Make room for the digest by increasing the offset by the * hash algorithm digest size. If the hash algorithm is not * specified increase the offset by IMA_DIGEST_SIZE which * fits SHA1 or MD5 */ if (hash_algo < HASH_ALGO__LAST) offset += hash_digest_size[hash_algo]; else offset += IMA_DIGEST_SIZE; } return ima_write_template_field_data(buffer, offset + digestsize, fmt, field_data); } /* * This function writes the digest of an event (with size limit). */ int ima_eventdigest_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct ima_max_digest_data hash; struct ima_digest_data *hash_hdr = container_of(&hash.hdr, struct ima_digest_data, hdr); u8 *cur_digest = NULL; u32 cur_digestsize = 0; struct inode *inode; int result; memset(&hash, 0, sizeof(hash)); if (event_data->violation) /* recording a violation. 
*/ goto out; if (ima_template_hash_algo_allowed(event_data->iint->ima_hash->algo)) { cur_digest = event_data->iint->ima_hash->digest; cur_digestsize = event_data->iint->ima_hash->length; goto out; } if ((const char *)event_data->filename == boot_aggregate_name) { if (ima_tpm_chip) { hash.hdr.algo = HASH_ALGO_SHA1; result = ima_calc_boot_aggregate(hash_hdr); /* algo can change depending on available PCR banks */ if (!result && hash.hdr.algo != HASH_ALGO_SHA1) result = -EINVAL; if (result < 0) memset(&hash, 0, sizeof(hash)); } cur_digest = hash_hdr->digest; cur_digestsize = hash_digest_size[HASH_ALGO_SHA1]; goto out; } if (!event_data->file) /* missing info to re-calculate the digest */ return -EINVAL; inode = file_inode(event_data->file); hash.hdr.algo = ima_template_hash_algo_allowed(ima_hash_algo) ? ima_hash_algo : HASH_ALGO_SHA1; result = ima_calc_file_hash(event_data->file, hash_hdr); if (result) { integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, event_data->filename, "collect_data", "failed", result, 0); return result; } cur_digest = hash_hdr->digest; cur_digestsize = hash.hdr.length; out: return ima_eventdigest_init_common(cur_digest, cur_digestsize, DIGEST_TYPE__LAST, HASH_ALGO__LAST, field_data); } /* * This function writes the digest of an event (without size limit). */ int ima_eventdigest_ng_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { u8 *cur_digest = NULL, hash_algo = ima_hash_algo; u32 cur_digestsize = 0; if (event_data->violation) /* recording a violation. */ goto out; cur_digest = event_data->iint->ima_hash->digest; cur_digestsize = event_data->iint->ima_hash->length; hash_algo = event_data->iint->ima_hash->algo; out: return ima_eventdigest_init_common(cur_digest, cur_digestsize, DIGEST_TYPE__LAST, hash_algo, field_data); } /* * This function writes the digest of an event (without size limit), * prefixed with both the digest type and hash algorithm. */ int ima_eventdigest_ngv2_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { u8 *cur_digest = NULL, hash_algo = ima_hash_algo; u32 cur_digestsize = 0; u8 digest_type = DIGEST_TYPE_IMA; if (event_data->violation) /* recording a violation. */ goto out; cur_digest = event_data->iint->ima_hash->digest; cur_digestsize = event_data->iint->ima_hash->length; hash_algo = event_data->iint->ima_hash->algo; if (event_data->iint->flags & IMA_VERITY_REQUIRED) digest_type = DIGEST_TYPE_VERITY; out: return ima_eventdigest_init_common(cur_digest, cur_digestsize, digest_type, hash_algo, field_data); } /* * This function writes the digest of the file which is expected to match the * digest contained in the file's appended signature. */ int ima_eventdigest_modsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { enum hash_algo hash_algo; const u8 *cur_digest; u32 cur_digestsize; if (!event_data->modsig) return 0; if (event_data->violation) { /* Recording a violation. */ hash_algo = HASH_ALGO_SHA1; cur_digest = NULL; cur_digestsize = 0; } else { int rc; rc = ima_get_modsig_digest(event_data->modsig, &hash_algo, &cur_digest, &cur_digestsize); if (rc) return rc; else if (hash_algo == HASH_ALGO__LAST || cur_digestsize == 0) /* There was some error collecting the digest. 
*/ return -EINVAL; } return ima_eventdigest_init_common(cur_digest, cur_digestsize, DIGEST_TYPE__LAST, hash_algo, field_data); } static int ima_eventname_init_common(struct ima_event_data *event_data, struct ima_field_data *field_data, bool size_limit) { const char *cur_filename = NULL; struct name_snapshot filename; u32 cur_filename_len = 0; bool snapshot = false; int ret; BUG_ON(event_data->filename == NULL && event_data->file == NULL); if (event_data->filename) { cur_filename = event_data->filename; cur_filename_len = strlen(event_data->filename); if (!size_limit || cur_filename_len <= IMA_EVENT_NAME_LEN_MAX) goto out; } if (event_data->file) { take_dentry_name_snapshot(&filename, event_data->file->f_path.dentry); snapshot = true; cur_filename = filename.name.name; cur_filename_len = strlen(cur_filename); } else /* * Truncate filename if the latter is too long and * the file descriptor is not available. */ cur_filename_len = IMA_EVENT_NAME_LEN_MAX; out: ret = ima_write_template_field_data(cur_filename, cur_filename_len, DATA_FMT_STRING, field_data); if (snapshot) release_dentry_name_snapshot(&filename); return ret; } /* * This function writes the name of an event (with size limit). */ int ima_eventname_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventname_init_common(event_data, field_data, true); } /* * This function writes the name of an event (without size limit). */ int ima_eventname_ng_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventname_init_common(event_data, field_data, false); } /* * ima_eventsig_init - include the file signature as part of the template data */ int ima_eventsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct evm_ima_xattr_data *xattr_value = event_data->xattr_value; if (!xattr_value || (xattr_value->type != EVM_IMA_XATTR_DIGSIG && xattr_value->type != IMA_VERITY_DIGSIG)) return ima_eventevmsig_init(event_data, field_data); return ima_write_template_field_data(xattr_value, event_data->xattr_len, DATA_FMT_HEX, field_data); } /* * ima_eventbuf_init - include the buffer(kexec-cmldine) as part of the * template data. */ int ima_eventbuf_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { if ((!event_data->buf) || (event_data->buf_len == 0)) return 0; return ima_write_template_field_data(event_data->buf, event_data->buf_len, DATA_FMT_HEX, field_data); } /* * ima_eventmodsig_init - include the appended file signature as part of the * template data */ int ima_eventmodsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { const void *data; u32 data_len; int rc; if (!event_data->modsig) return 0; /* * modsig is a runtime structure containing pointers. Get its raw data * instead. 
*/ rc = ima_get_raw_modsig(event_data->modsig, &data, &data_len); if (rc) return rc; return ima_write_template_field_data(data, data_len, DATA_FMT_HEX, field_data); } /* * ima_eventevmsig_init - include the EVM portable signature as part of the * template data */ int ima_eventevmsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct evm_ima_xattr_data *xattr_data = NULL; int rc = 0; if (!event_data->file) return 0; rc = vfs_getxattr_alloc(&nop_mnt_idmap, file_dentry(event_data->file), XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0 || xattr_data->type != EVM_XATTR_PORTABLE_DIGSIG) { rc = 0; goto out; } rc = ima_write_template_field_data((char *)xattr_data, rc, DATA_FMT_HEX, field_data); out: kfree(xattr_data); return rc; } static int ima_eventinodedac_init_common(struct ima_event_data *event_data, struct ima_field_data *field_data, bool get_uid) { unsigned int id; if (!event_data->file) return 0; if (get_uid) id = i_uid_read(file_inode(event_data->file)); else id = i_gid_read(file_inode(event_data->file)); if (ima_canonical_fmt) { if (sizeof(id) == sizeof(u16)) id = (__force u16)cpu_to_le16(id); else id = (__force u32)cpu_to_le32(id); } return ima_write_template_field_data((void *)&id, sizeof(id), DATA_FMT_UINT, field_data); } /* * ima_eventinodeuid_init - include the inode UID as part of the template * data */ int ima_eventinodeuid_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodedac_init_common(event_data, field_data, true); } /* * ima_eventinodegid_init - include the inode GID as part of the template * data */ int ima_eventinodegid_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodedac_init_common(event_data, field_data, false); } /* * ima_eventinodemode_init - include the inode mode as part of the template * data */ int ima_eventinodemode_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct inode *inode; u16 mode; if (!event_data->file) return 0; inode = file_inode(event_data->file); mode = inode->i_mode; if (ima_canonical_fmt) mode = (__force u16)cpu_to_le16(mode); return ima_write_template_field_data((char *)&mode, sizeof(mode), DATA_FMT_UINT, field_data); } static int ima_eventinodexattrs_init_common(struct ima_event_data *event_data, struct ima_field_data *field_data, char type) { u8 *buffer = NULL; int rc; if (!event_data->file) return 0; rc = evm_read_protected_xattrs(file_dentry(event_data->file), NULL, 0, type, ima_canonical_fmt); if (rc < 0) return 0; buffer = kmalloc(rc, GFP_KERNEL); if (!buffer) return 0; rc = evm_read_protected_xattrs(file_dentry(event_data->file), buffer, rc, type, ima_canonical_fmt); if (rc < 0) { rc = 0; goto out; } rc = ima_write_template_field_data((char *)buffer, rc, DATA_FMT_HEX, field_data); out: kfree(buffer); return rc; } /* * ima_eventinodexattrnames_init - include a list of xattr names as part of the * template data */ int ima_eventinodexattrnames_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodexattrs_init_common(event_data, field_data, 'n'); } /* * ima_eventinodexattrlengths_init - include a list of xattr lengths as part of * the template data */ int ima_eventinodexattrlengths_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodexattrs_init_common(event_data, field_data, 'l'); } /* * ima_eventinodexattrvalues_init - include a list of xattr values as part of * the template data 
*/ int ima_eventinodexattrvalues_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodexattrs_init_common(event_data, field_data, 'v'); }
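As a worked example of the digest field layout described in ima_eventdigest_init_common(): for the type-and-algorithm format the field data is the ASCII prefix "<type>:<algo>:", its terminating NUL, and then the raw digest bytes. A minimal standalone sketch under that assumption (function and buffer names are illustrative, not part of IMA):

/*
 * Build "<type>:<algo>:" '\0' <digest> into out[]; returns the total length,
 * or 0 if the buffer is too small.
 */
static size_t example_build_digest_field(char *out, size_t outlen,
					 const char *type, const char *algo,
					 const u8 *digest, size_t digestsize)
{
	size_t prefix = snprintf(out, outlen, "%s:%s:", type, algo) + 1;

	if (prefix + digestsize > outlen)
		return 0;
	memcpy(out + prefix, digest, digestsize);
	return prefix + digestsize;	/* e.g. "ima:sha256:" + '\0' + 32 bytes */
}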
/* * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/random.h> #include <linux/export.h> #include "rds.h" /* * All of connection management is simplified by serializing it through * work queues that execute in a connection managing thread. * * TCP wants to send acks through sendpage() in response to data_ready(), * but it needs a process context to do so. * * The receive paths need to allocate but can't drop packets (!) so we have * a thread around to block allocating if the receive fast path sees an * allocation failure. */ /* Grand Unified Theory of connection life cycle: * At any point in time, the connection can be in one of these states: * DOWN, CONNECTING, UP, DISCONNECTING, ERROR * * The following transitions are possible: * ANY -> ERROR * UP -> DISCONNECTING * ERROR -> DISCONNECTING * DISCONNECTING -> DOWN * DOWN -> CONNECTING * CONNECTING -> UP * * Transition to state DISCONNECTING/DOWN: * - Inside the shutdown worker; synchronizes with xmit path * through RDS_IN_XMIT, and with connection management callbacks * via c_cm_lock.
* * For receive callbacks, we rely on the underlying transport * (TCP, IB/RDMA) to provide the necessary synchronisation. */ struct workqueue_struct *rds_wq; EXPORT_SYMBOL_GPL(rds_wq); void rds_connect_path_complete(struct rds_conn_path *cp, int curr) { if (!rds_conn_path_transition(cp, curr, RDS_CONN_UP)) { printk(KERN_WARNING "%s: Cannot transition to state UP, " "current state is %d\n", __func__, atomic_read(&cp->cp_state)); rds_conn_path_drop(cp, false); return; } rdsdebug("conn %p for %pI6c to %pI6c complete\n", cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr); cp->cp_reconnect_jiffies = 0; set_bit(0, &cp->cp_conn->c_map_queued); rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) { queue_delayed_work(rds_wq, &cp->cp_send_w, 0); queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); } rcu_read_unlock(); cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION; } EXPORT_SYMBOL_GPL(rds_connect_path_complete); void rds_connect_complete(struct rds_connection *conn) { rds_connect_path_complete(&conn->c_path[0], RDS_CONN_CONNECTING); } EXPORT_SYMBOL_GPL(rds_connect_complete); /* * This random exponential backoff is relied on to eventually resolve racing * connects. * * If connect attempts race then both parties drop both connections and come * here to wait for a random amount of time before trying again. Eventually * the backoff range will be so much greater than the time it takes to * establish a connection that one of the pair will establish the connection * before the other's random delay fires. * * Connection attempts that arrive while a connection is already established * are also considered to be racing connects. This lets a connection from * a rebooted machine replace an existing stale connection before the transport * notices that the connection has failed. * * We should *always* start with a random backoff; otherwise a broken connection * will always take several iterations to be re-established. 
*/ void rds_queue_reconnect(struct rds_conn_path *cp) { unsigned long rand; struct rds_connection *conn = cp->cp_conn; rdsdebug("conn %p for %pI6c to %pI6c reconnect jiffies %lu\n", conn, &conn->c_laddr, &conn->c_faddr, cp->cp_reconnect_jiffies); /* let peer with smaller addr initiate reconnect, to avoid duels */ if (conn->c_trans->t_type == RDS_TRANS_TCP && rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0) return; set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); if (cp->cp_reconnect_jiffies == 0) { cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); rcu_read_unlock(); return; } get_random_bytes(&rand, sizeof(rand)); rdsdebug("%lu delay %lu ceil conn %p for %pI6c -> %pI6c\n", rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies, conn, &conn->c_laddr, &conn->c_faddr); rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) queue_delayed_work(rds_wq, &cp->cp_conn_w, rand % cp->cp_reconnect_jiffies); rcu_read_unlock(); cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2, rds_sysctl_reconnect_max_jiffies); } void rds_connect_worker(struct work_struct *work) { struct rds_conn_path *cp = container_of(work, struct rds_conn_path, cp_conn_w.work); struct rds_connection *conn = cp->cp_conn; int ret; if (cp->cp_index > 0 && rds_addr_cmp(&cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr) >= 0) return; clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING); if (ret) { ret = conn->c_trans->conn_path_connect(cp); rdsdebug("conn %p for %pI6c to %pI6c dispatched, ret %d\n", conn, &conn->c_laddr, &conn->c_faddr, ret); if (ret) { if (rds_conn_path_transition(cp, RDS_CONN_CONNECTING, RDS_CONN_DOWN)) rds_queue_reconnect(cp); else rds_conn_path_error(cp, "connect failed\n"); } } } void rds_send_worker(struct work_struct *work) { struct rds_conn_path *cp = container_of(work, struct rds_conn_path, cp_send_w.work); int ret; if (rds_conn_path_state(cp) == RDS_CONN_UP) { clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags); ret = rds_send_xmit(cp); cond_resched(); rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); switch (ret) { case -EAGAIN: rds_stats_inc(s_send_immediate_retry); queue_delayed_work(rds_wq, &cp->cp_send_w, 0); break; case -ENOMEM: rds_stats_inc(s_send_delayed_retry); queue_delayed_work(rds_wq, &cp->cp_send_w, 2); break; default: break; } } } void rds_recv_worker(struct work_struct *work) { struct rds_conn_path *cp = container_of(work, struct rds_conn_path, cp_recv_w.work); int ret; if (rds_conn_path_state(cp) == RDS_CONN_UP) { ret = cp->cp_conn->c_trans->recv_path(cp); rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); switch (ret) { case -EAGAIN: rds_stats_inc(s_recv_immediate_retry); queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); break; case -ENOMEM: rds_stats_inc(s_recv_delayed_retry); queue_delayed_work(rds_wq, &cp->cp_recv_w, 2); break; default: break; } } } void rds_shutdown_worker(struct work_struct *work) { struct rds_conn_path *cp = container_of(work, struct rds_conn_path, cp_down_w); rds_conn_shutdown(cp); } void rds_threads_exit(void) { destroy_workqueue(rds_wq); } int rds_threads_init(void) { rds_wq = create_singlethread_workqueue("krdsd"); if (!rds_wq) return -ENOMEM; return 0; } /* Compare two IPv6 addresses. Return 0 if the two addresses are equal. * Return 1 if the first is greater. Return -1 if the second is greater. 
*/ int rds_addr_cmp(const struct in6_addr *addr1, const struct in6_addr *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const __be64 *a1, *a2; u64 x, y; a1 = (__be64 *)addr1; a2 = (__be64 *)addr2; if (*a1 != *a2) { if (be64_to_cpu(*a1) < be64_to_cpu(*a2)) return -1; else return 1; } else { x = be64_to_cpu(*++a1); y = be64_to_cpu(*++a2); if (x < y) return -1; else if (x > y) return 1; else return 0; } #else u32 a, b; int i; for (i = 0; i < 4; i++) { if (addr1->s6_addr32[i] != addr2->s6_addr32[i]) { a = ntohl(addr1->s6_addr32[i]); b = ntohl(addr2->s6_addr32[i]); if (a < b) return -1; else if (a > b) return 1; } } return 0; #endif } EXPORT_SYMBOL_GPL(rds_addr_cmp);
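The reconnect policy implemented by rds_queue_reconnect() above is: connect immediately on the first attempt, then before each retry wait a uniformly random delay in [0, backoff) jiffies and double the backoff window up to the sysctl maximum. A standalone sketch of just that arithmetic (the window pointer and parameter names are stand-ins, not RDS fields):

/* Return the delay in jiffies for this attempt and grow the window. */
static unsigned long example_next_backoff(unsigned long *window,
					  unsigned long min_j,
					  unsigned long max_j)
{
	unsigned long rand;

	if (*window == 0) {
		*window = min_j;
		return 0;		/* first attempt: reconnect right away */
	}
	get_random_bytes(&rand, sizeof(rand));
	rand %= *window;		/* uniform in [0, window) */
	*window = min(*window * 2, max_j);
	return rand;
}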
// SPDX-License-Identifier: GPL-2.0 /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Generic netlink support functions to configure an SMC-R PNET table * * Copyright IBM Corp. 2016 * * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com> */ #include <linux/module.h> #include <linux/list.h> #include <linux/ctype.h> #include <linux/mutex.h> #include <net/netlink.h> #include <net/genetlink.h> #include <uapi/linux/if.h> #include <uapi/linux/smc.h> #include <rdma/ib_verbs.h> #include <net/netns/generic.h> #include "smc_netns.h" #include "smc_pnet.h" #include "smc_ib.h" #include "smc_ism.h" #include "smc_core.h" static struct net_device *__pnet_find_base_ndev(struct net_device *ndev); static struct net_device *pnet_find_base_ndev(struct net_device *ndev); static const struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { [SMC_PNETID_NAME] = { .type = NLA_NUL_STRING, .len = SMC_MAX_PNETID_LEN }, [SMC_PNETID_ETHNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [SMC_PNETID_IBNAME] = { .type = NLA_NUL_STRING, .len = IB_DEVICE_NAME_MAX - 1 }, [SMC_PNETID_IBPORT] = { .type = NLA_U8 } }; static struct genl_family smc_pnet_nl_family; enum smc_pnet_nametype { SMC_PNET_ETH = 1, SMC_PNET_IB = 2, }; /* pnet entry stored in pnet table */ struct smc_pnetentry { struct list_head list; char pnet_name[SMC_MAX_PNETID_LEN + 1]; enum smc_pnet_nametype type; union { struct { char eth_name[IFNAMSIZ + 1]; struct net_device *ndev; netdevice_tracker dev_tracker; }; struct { char ib_name[IB_DEVICE_NAME_MAX + 1]; u8 ib_port; }; }; }; /* Check if the pnetid is set */ bool smc_pnet_is_pnetid_set(u8 *pnetid) { if (pnetid[0] == 0 || pnetid[0] == _S) return false; return true; } /* Check if two given pnetids match */ static bool smc_pnet_match(u8 *pnetid1, u8 *pnetid2) { int i; for (i = 0; i < SMC_MAX_PNETID_LEN; i++) { if ((pnetid1[i] == 0 || pnetid1[i] == _S) && (pnetid2[i] == 0 || pnetid2[i] == _S)) break; if (pnetid1[i] != pnetid2[i]) return false; } return
true; } /* Remove a pnetid from the pnet table. */ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct smc_ib_device *ibdev; struct smcd_dev *smcd; struct smc_net *sn; int rc = -ENOENT; int ibport; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; /* remove table entry */ mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (!pnet_name || smc_pnet_match(pnetelem->pnet_name, pnet_name)) { list_del(&pnetelem->list); if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) { netdev_put(pnetelem->ndev, &pnetelem->dev_tracker); pr_warn_ratelimited("smc: net device %s " "erased user defined " "pnetid %.16s\n", pnetelem->eth_name, pnetelem->pnet_name); } kfree(pnetelem); rc = 0; } } mutex_unlock(&pnettable->lock); /* if this is not the initial namespace, stop here */ if (net != &init_net) return rc; /* remove ib devices */ mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) { if (ibdev->pnetid_by_user[ibport] && (!pnet_name || smc_pnet_match(pnet_name, ibdev->pnetid[ibport]))) { pr_warn_ratelimited("smc: ib device %s ibport " "%d erased user defined " "pnetid %.16s\n", ibdev->ibdev->name, ibport + 1, ibdev->pnetid[ibport]); memset(ibdev->pnetid[ibport], 0, SMC_MAX_PNETID_LEN); ibdev->pnetid_by_user[ibport] = false; rc = 0; } } } mutex_unlock(&smc_ib_devices.mutex); /* remove smcd devices */ mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd, &smcd_dev_list.list, list) { if (smcd->pnetid_by_user && (!pnet_name || smc_pnet_match(pnet_name, smcd->pnetid))) { pr_warn_ratelimited("smc: smcd device %s " "erased user defined pnetid " "%.16s\n", dev_name(&smcd->dibs->dev), smcd->pnetid); memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN); smcd->pnetid_by_user = false; rc = 0; } } mutex_unlock(&smcd_dev_list.mutex); return rc; } /* Add the reference to a given network device to the pnet table. */ static int smc_pnet_add_by_ndev(struct net_device *ndev) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct net *net = dev_net(ndev); struct smc_net *sn; int rc = -ENOENT; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev && !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) { netdev_hold(ndev, &pnetelem->dev_tracker, GFP_ATOMIC); pnetelem->ndev = ndev; rc = 0; pr_warn_ratelimited("smc: adding net device %s with " "user defined pnetid %.16s\n", pnetelem->eth_name, pnetelem->pnet_name); break; } } mutex_unlock(&pnettable->lock); return rc; } /* Remove the reference to a given network device from the pnet table. 
*/ static int smc_pnet_remove_by_ndev(struct net_device *ndev) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct net *net = dev_net(ndev); struct smc_net *sn; int rc = -ENOENT; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) { netdev_put(pnetelem->ndev, &pnetelem->dev_tracker); pnetelem->ndev = NULL; rc = 0; pr_warn_ratelimited("smc: removing net device %s with " "user defined pnetid %.16s\n", pnetelem->eth_name, pnetelem->pnet_name); break; } } mutex_unlock(&pnettable->lock); return rc; } /* Apply pnetid to ib device when no pnetid is set. */ static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port, char *pnet_name) { bool applied = false; mutex_lock(&smc_ib_devices.mutex); if (!smc_pnet_is_pnetid_set(ib_dev->pnetid[ib_port - 1])) { memcpy(ib_dev->pnetid[ib_port - 1], pnet_name, SMC_MAX_PNETID_LEN); ib_dev->pnetid_by_user[ib_port - 1] = true; applied = true; } mutex_unlock(&smc_ib_devices.mutex); return applied; } /* Apply pnetid to smcd device when no pnetid is set. */ static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name) { bool applied = false; mutex_lock(&smcd_dev_list.mutex); if (!smc_pnet_is_pnetid_set(smcd_dev->pnetid)) { memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN); smcd_dev->pnetid_by_user = true; applied = true; } mutex_unlock(&smcd_dev_list.mutex); return applied; } /* The limit for pnetid is 16 characters. * Valid characters should be (single-byte character set) a-z, A-Z, 0-9. * Lower case letters are converted to upper case. * Interior blanks should not be used. */ static bool smc_pnetid_valid(const char *pnet_name, char *pnetid) { char *bf = skip_spaces(pnet_name); size_t len = strlen(bf); char *end = bf + len; if (!len) return false; while (--end >= bf && isspace(*end)) ; if (end - bf >= SMC_MAX_PNETID_LEN) return false; while (bf <= end) { if (!isalnum(*bf)) return false; *pnetid++ = islower(*bf) ? toupper(*bf) : *bf; bf++; } *pnetid = '\0'; return true; } /* Find an infiniband device by a given name. The device might not exist. */ static struct smc_ib_device *smc_pnet_find_ib(char *ib_name) { struct smc_ib_device *ibdev; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { if (!strncmp(ibdev->ibdev->name, ib_name, sizeof(ibdev->ibdev->name)) || (ibdev->ibdev->dev.parent && !strncmp(dev_name(ibdev->ibdev->dev.parent), ib_name, IB_DEVICE_NAME_MAX - 1))) { goto out; } } ibdev = NULL; out: mutex_unlock(&smc_ib_devices.mutex); return ibdev; } /* Find an smcd device by a given name. The device might not exist. 
*/ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) { struct smcd_dev *smcd_dev; mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { if (!strncmp(dev_name(&smcd_dev->dibs->dev), smcd_name, IB_DEVICE_NAME_MAX - 1) || (smcd_dev->dibs->dev.parent && !strncmp(dev_name(smcd_dev->dibs->dev.parent), smcd_name, IB_DEVICE_NAME_MAX - 1))) goto out; } smcd_dev = NULL; out: mutex_unlock(&smcd_dev_list.mutex); return smcd_dev; } static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, char *eth_name, char *pnet_name) { struct smc_pnetentry *tmp_pe, *new_pe; struct net_device *ndev, *base_ndev; u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; bool new_netdev; int rc; /* check if (base) netdev already has a pnetid. If there is one, we do * not want to add a pnet table entry */ rc = -EEXIST; ndev = dev_get_by_name(net, eth_name); /* dev_hold() */ if (ndev) { base_ndev = pnet_find_base_ndev(ndev); if (!smc_pnetid_by_dev_port(base_ndev->dev.parent, base_ndev->dev_port, ndev_pnetid)) goto out_put; } /* add a new netdev entry to the pnet table if there isn't one */ rc = -ENOMEM; new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); if (!new_pe) goto out_put; new_pe->type = SMC_PNET_ETH; memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); strscpy(new_pe->eth_name, eth_name); rc = -EEXIST; new_netdev = true; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_ETH && !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) { new_netdev = false; break; } } if (new_netdev) { if (ndev) { new_pe->ndev = ndev; netdev_tracker_alloc(ndev, &new_pe->dev_tracker, GFP_ATOMIC); } list_add_tail(&new_pe->list, &pnettable->pnetlist); mutex_unlock(&pnettable->lock); } else { mutex_unlock(&pnettable->lock); kfree(new_pe); goto out_put; } if (ndev) pr_warn_ratelimited("smc: net device %s " "applied user defined pnetid %.16s\n", new_pe->eth_name, new_pe->pnet_name); return 0; out_put: dev_put(ndev); return rc; } static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, u8 ib_port, char *pnet_name) { struct smc_pnetentry *tmp_pe, *new_pe; struct smc_ib_device *ib_dev; bool smcddev_applied = true; bool ibdev_applied = true; struct smcd_dev *smcd; bool new_ibdev; /* try to apply the pnetid to active devices */ ib_dev = smc_pnet_find_ib(ib_name); if (ib_dev) { ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name); if (ibdev_applied) pr_warn_ratelimited("smc: ib device %s ibport %d " "applied user defined pnetid " "%.16s\n", ib_dev->ibdev->name, ib_port, ib_dev->pnetid[ib_port - 1]); } smcd = smc_pnet_find_smcd(ib_name); if (smcd) { smcddev_applied = smc_pnet_apply_smcd(smcd, pnet_name); if (smcddev_applied) { pr_warn_ratelimited("smc: smcd device %s applied user defined pnetid %.16s\n", dev_name(&smcd->dibs->dev), smcd->pnetid); } } /* Apply fails when a device has a hardware-defined pnetid set, do not * add a pnet table entry in that case. 
*/ if (!ibdev_applied || !smcddev_applied) return -EEXIST; /* add a new ib entry to the pnet table if there isn't one */ new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); if (!new_pe) return -ENOMEM; new_pe->type = SMC_PNET_IB; memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); strscpy(new_pe->ib_name, ib_name); new_pe->ib_port = ib_port; new_ibdev = true; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { new_ibdev = false; break; } } if (new_ibdev) { list_add_tail(&new_pe->list, &pnettable->pnetlist); mutex_unlock(&pnettable->lock); } else { mutex_unlock(&pnettable->lock); kfree(new_pe); } return (new_ibdev) ? 0 : -EEXIST; } /* Append a pnetid to the end of the pnet table if not already on this list. */ static int smc_pnet_enter(struct net *net, struct nlattr *tb[]) { char pnet_name[SMC_MAX_PNETID_LEN + 1]; struct smc_pnettable *pnettable; bool new_netdev = false; bool new_ibdev = false; struct smc_net *sn; u8 ibport = 1; char *string; int rc; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; rc = -EINVAL; if (!tb[SMC_PNETID_NAME]) goto error; string = (char *)nla_data(tb[SMC_PNETID_NAME]); if (!smc_pnetid_valid(string, pnet_name)) goto error; if (tb[SMC_PNETID_ETHNAME]) { string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); rc = smc_pnet_add_eth(pnettable, net, string, pnet_name); if (!rc) new_netdev = true; else if (rc != -EEXIST) goto error; } /* if this is not the initial namespace, stop here */ if (net != &init_net) return new_netdev ? 0 : -EEXIST; rc = -EINVAL; if (tb[SMC_PNETID_IBNAME]) { string = (char *)nla_data(tb[SMC_PNETID_IBNAME]); string = strim(string); if (tb[SMC_PNETID_IBPORT]) { ibport = nla_get_u8(tb[SMC_PNETID_IBPORT]); if (ibport < 1 || ibport > SMC_MAX_PORTS) goto error; } rc = smc_pnet_add_ib(pnettable, string, ibport, pnet_name); if (!rc) new_ibdev = true; else if (rc != -EEXIST) goto error; } return (new_netdev || new_ibdev) ? 
0 : -EEXIST; error: return rc; } /* Convert an smc_pnetentry to a netlink attribute sequence */ static int smc_pnet_set_nla(struct sk_buff *msg, struct smc_pnetentry *pnetelem) { if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name)) return -1; if (pnetelem->type == SMC_PNET_ETH) { if (nla_put_string(msg, SMC_PNETID_ETHNAME, pnetelem->eth_name)) return -1; } else { if (nla_put_string(msg, SMC_PNETID_ETHNAME, "n/a")) return -1; } if (pnetelem->type == SMC_PNET_IB) { if (nla_put_string(msg, SMC_PNETID_IBNAME, pnetelem->ib_name) || nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port)) return -1; } else { if (nla_put_string(msg, SMC_PNETID_IBNAME, "n/a") || nla_put_u8(msg, SMC_PNETID_IBPORT, 0xff)) return -1; } return 0; } static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); return smc_pnet_enter(net, info->attrs); } static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); if (!info->attrs[SMC_PNETID_NAME]) return -EINVAL; return smc_pnet_remove_by_pnetid(net, (char *)nla_data(info->attrs[SMC_PNETID_NAME])); } static int smc_pnet_dump_start(struct netlink_callback *cb) { cb->args[0] = 0; return 0; } static int smc_pnet_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq, u32 flags, struct smc_pnetentry *pnetelem) { void *hdr; hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family, flags, SMC_PNETID_GET); if (!hdr) return -ENOMEM; if (smc_pnet_set_nla(skb, pnetelem) < 0) { genlmsg_cancel(skb, hdr); return -EMSGSIZE; } genlmsg_end(skb, hdr); return 0; } static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u8 *pnetid, int start_idx) { struct smc_pnettable *pnettable; struct smc_pnetentry *pnetelem; struct smc_net *sn; int idx = 0; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; /* dump pnettable entries */ mutex_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid)) continue; if (idx++ < start_idx) continue; /* if this is not the initial namespace, dump only netdev */ if (net != &init_net && pnetelem->type != SMC_PNET_ETH) continue; if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, pnetelem)) { --idx; break; } } mutex_unlock(&pnettable->lock); return idx; } static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); int idx; idx = _smc_pnet_dump(net, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NULL, cb->args[0]); cb->args[0] = idx; return skb->len; } /* Retrieve one PNETID entry */ static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct sk_buff *msg; void *hdr; if (!info->attrs[SMC_PNETID_NAME]) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; _smc_pnet_dump(net, msg, info->snd_portid, info->snd_seq, nla_data(info->attrs[SMC_PNETID_NAME]), 0); /* finish multi part message and send it */ hdr = nlmsg_put(msg, info->snd_portid, info->snd_seq, NLMSG_DONE, 0, NLM_F_MULTI); if (!hdr) { nlmsg_free(msg); return -EMSGSIZE; } return genlmsg_reply(msg, info); } /* Remove and delete all pnetids from pnet table. 
*/ static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); smc_pnet_remove_by_pnetid(net, NULL); return 0; } /* SMC_PNETID generic netlink operation definition */ static const struct genl_ops smc_pnet_ops[] = { { .cmd = SMC_PNETID_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* can be retrieved by unprivileged users */ .doit = smc_pnet_get, .dumpit = smc_pnet_dump, .start = smc_pnet_dump_start }, { .cmd = SMC_PNETID_ADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = smc_pnet_add }, { .cmd = SMC_PNETID_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = smc_pnet_del }, { .cmd = SMC_PNETID_FLUSH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = smc_pnet_flush } }; /* SMC_PNETID family definition */ static struct genl_family smc_pnet_nl_family __ro_after_init = { .hdrsize = 0, .name = SMCR_GENL_FAMILY_NAME, .version = SMCR_GENL_FAMILY_VERSION, .maxattr = SMC_PNETID_MAX, .policy = smc_pnet_policy, .netnsok = true, .module = THIS_MODULE, .ops = smc_pnet_ops, .n_ops = ARRAY_SIZE(smc_pnet_ops), .resv_start_op = SMC_PNETID_FLUSH + 1, }; bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe; bool rc = false; read_lock(&sn->pnetids_ndev.lock); list_for_each_entry(pe, &sn->pnetids_ndev.list, list) { if (smc_pnet_match(pnetid, pe->pnetid)) { rc = true; goto unlock; } } unlock: read_unlock(&sn->pnetids_ndev.lock); return rc; } static int smc_pnet_add_pnetid(struct net *net, u8 *pnetid) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe, *pi; pe = kzalloc(sizeof(*pe), GFP_KERNEL); if (!pe) return -ENOMEM; write_lock(&sn->pnetids_ndev.lock); list_for_each_entry(pi, &sn->pnetids_ndev.list, list) { if (smc_pnet_match(pnetid, pi->pnetid)) { refcount_inc(&pi->refcnt); kfree(pe); goto unlock; } } refcount_set(&pe->refcnt, 1); memcpy(pe->pnetid, pnetid, SMC_MAX_PNETID_LEN); list_add_tail(&pe->list, &sn->pnetids_ndev.list); unlock: write_unlock(&sn->pnetids_ndev.lock); return 0; } static void smc_pnet_remove_pnetid(struct net *net, u8 *pnetid) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe, *pe2; write_lock(&sn->pnetids_ndev.lock); list_for_each_entry_safe(pe, pe2, &sn->pnetids_ndev.list, list) { if (smc_pnet_match(pnetid, pe->pnetid)) { if (refcount_dec_and_test(&pe->refcnt)) { list_del(&pe->list); kfree(pe); } break; } } write_unlock(&sn->pnetids_ndev.lock); } static void smc_pnet_add_base_pnetid(struct net *net, struct net_device *dev, u8 *ndev_pnetid) { struct net_device *base_dev; base_dev = __pnet_find_base_ndev(dev); if (base_dev->flags & IFF_UP && !smc_pnetid_by_dev_port(base_dev->dev.parent, base_dev->dev_port, ndev_pnetid)) { /* add to PNETIDs list */ smc_pnet_add_pnetid(net, ndev_pnetid); } } /* create initial list of netdevice pnetids */ static void smc_pnet_create_pnetids_list(struct net *net) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct net_device *dev; /* Newly created netns do not have devices. * Do not even acquire rtnl. */ if (list_empty(&net->dev_base_head)) return; /* Note: This might not be needed, because smc_pnet_netdev_event() * is also calling smc_pnet_add_base_pnetid() when handling * NETDEV_UP event. 
*/ rtnl_lock(); for_each_netdev(net, dev) smc_pnet_add_base_pnetid(net, dev, ndev_pnetid); rtnl_unlock(); } /* clean up list of netdevice pnetids */ static void smc_pnet_destroy_pnetids_list(struct net *net) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe, *temp_pe; write_lock(&sn->pnetids_ndev.lock); list_for_each_entry_safe(pe, temp_pe, &sn->pnetids_ndev.list, list) { list_del(&pe->list); kfree(pe); } write_unlock(&sn->pnetids_ndev.lock); } static int smc_pnet_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(event_dev); u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; switch (event) { case NETDEV_REBOOT: case NETDEV_UNREGISTER: smc_pnet_remove_by_ndev(event_dev); smc_ib_ndev_change(event_dev, event); return NOTIFY_OK; case NETDEV_REGISTER: smc_pnet_add_by_ndev(event_dev); smc_ib_ndev_change(event_dev, event); return NOTIFY_OK; case NETDEV_UP: smc_pnet_add_base_pnetid(net, event_dev, ndev_pnetid); return NOTIFY_OK; case NETDEV_DOWN: event_dev = __pnet_find_base_ndev(event_dev); if (!smc_pnetid_by_dev_port(event_dev->dev.parent, event_dev->dev_port, ndev_pnetid)) { /* remove from PNETIDs list */ smc_pnet_remove_pnetid(net, ndev_pnetid); } return NOTIFY_OK; default: return NOTIFY_DONE; } } static struct notifier_block smc_netdev_notifier = { .notifier_call = smc_pnet_netdev_event }; /* init network namespace */ int smc_pnet_net_init(struct net *net) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnettable *pnettable = &sn->pnettable; struct smc_pnetids_ndev *pnetids_ndev = &sn->pnetids_ndev; INIT_LIST_HEAD(&pnettable->pnetlist); mutex_init(&pnettable->lock); INIT_LIST_HEAD(&pnetids_ndev->list); rwlock_init(&pnetids_ndev->lock); smc_pnet_create_pnetids_list(net); return 0; } int __init smc_pnet_init(void) { int rc; rc = genl_register_family(&smc_pnet_nl_family); if (rc) return rc; rc = register_netdevice_notifier(&smc_netdev_notifier); if (rc) genl_unregister_family(&smc_pnet_nl_family); return rc; } /* exit network namespace */ void smc_pnet_net_exit(struct net *net) { /* flush pnet table */ smc_pnet_remove_by_pnetid(net, NULL); smc_pnet_destroy_pnetids_list(net); } void smc_pnet_exit(void) { unregister_netdevice_notifier(&smc_netdev_notifier); genl_unregister_family(&smc_pnet_nl_family); } static struct net_device *__pnet_find_base_ndev(struct net_device *ndev) { int i, nest_lvl; ASSERT_RTNL(); nest_lvl = ndev->lower_level; for (i = 0; i < nest_lvl; i++) { struct list_head *lower = &ndev->adj_list.lower; if (list_empty(lower)) break; lower = lower->next; ndev = netdev_lower_get_next(ndev, &lower); } return ndev; } /* Determine one base device for stacked net devices. * If the lower device level contains more than one devices * (for instance with bonding slaves), just the first device * is used to reach a base device. 
*/ static struct net_device *pnet_find_base_ndev(struct net_device *ndev) { rtnl_lock(); ndev = __pnet_find_base_ndev(ndev); rtnl_unlock(); return ndev; } static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, u8 *pnetid) { struct smc_pnettable *pnettable; struct net *net = dev_net(ndev); struct smc_pnetentry *pnetelem; struct smc_net *sn; int rc = -ENOENT; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) { /* get pnetid of netdev device */ memcpy(pnetid, pnetelem->pnet_name, SMC_MAX_PNETID_LEN); rc = 0; break; } } mutex_unlock(&pnettable->lock); return rc; } static int smc_pnet_determine_gid(struct smc_ib_device *ibdev, int i, struct smc_init_info *ini) { if (!ini->check_smcrv2 && !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL, NULL)) { ini->ib_dev = ibdev; ini->ib_port = i; return 0; } if (ini->check_smcrv2 && !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->smcrv2.ib_gid_v2, NULL, &ini->smcrv2)) { ini->smcrv2.ib_dev_v2 = ibdev; ini->smcrv2.ib_port_v2 = i; return 0; } return -ENODEV; } /* find a roce device for the given pnetid */ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, struct smc_init_info *ini, struct smc_ib_device *known_dev, struct net *net) { struct smc_ib_device *ibdev; int i; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { if (ibdev == known_dev || !rdma_dev_access_netns(ibdev->ibdev, net)) continue; for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) && smc_ib_port_active(ibdev, i) && !test_bit(i - 1, ibdev->ports_going_away)) { if (!smc_pnet_determine_gid(ibdev, i, ini)) goto out; } } } out: mutex_unlock(&smc_ib_devices.mutex); } /* find alternate roce device with same pnet_id, vlan_id and net namespace */ void smc_pnet_find_alt_roce(struct smc_link_group *lgr, struct smc_init_info *ini, struct smc_ib_device *known_dev) { struct net *net = lgr->net; _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev, net); } /* if handshake network device belongs to a roce device, return its * IB device and port */ static void smc_pnet_find_rdma_dev(struct net_device *netdev, struct smc_init_info *ini) { struct net *net = dev_net(netdev); struct smc_ib_device *ibdev; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { struct net_device *ndev; int i; /* check rdma net namespace */ if (!rdma_dev_access_netns(ibdev->ibdev, net)) continue; for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; ndev = ib_device_get_netdev(ibdev->ibdev, i); if (!ndev) continue; dev_put(ndev); if (netdev == ndev && smc_ib_port_active(ibdev, i) && !test_bit(i - 1, ibdev->ports_going_away)) { if (!smc_pnet_determine_gid(ibdev, i, ini)) break; } } } mutex_unlock(&smc_ib_devices.mutex); } /* Determine the corresponding IB device port based on the hardware PNETID. * Searching stops at the first matching active IB device port with vlan_id * configured. * If nothing found, check pnetid table. 
* If nothing found, try to use handshake device */ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct net_device *base_ndev; struct net *net; base_ndev = pnet_find_base_ndev(ndev); net = dev_net(ndev); if (smc_pnetid_by_dev_port(base_ndev->dev.parent, base_ndev->dev_port, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(base_ndev, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { smc_pnet_find_rdma_dev(base_ndev, ini); return; /* pnetid could not be determined */ } _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net); } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct smcd_dev *ismdev; ndev = pnet_find_base_ndev(ndev); if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) return; /* pnetid could not be determined */ mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(ismdev, &smcd_dev_list.list, list) { if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) && !ismdev->going_away && (!ini->ism_peer_gid[0].gid || !smc_ism_cantalk(&ini->ism_peer_gid[0], ini->vlan_id, ismdev))) { ini->ism_dev[0] = ismdev; break; } } mutex_unlock(&smcd_dev_list.mutex); } /* PNET table analysis for a given sock: * determine ib_device and port belonging to used internal TCP socket * ethernet interface. */ void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini) { struct net_device *dev; struct dst_entry *dst; rcu_read_lock(); dst = __sk_dst_get(sk); dev = dst ? dst_dev_rcu(dst) : NULL; dev_hold(dev); rcu_read_unlock(); if (dev) { smc_pnet_find_roce_by_pnetid(dev, ini); dev_put(dev); } } void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini) { struct net_device *dev; struct dst_entry *dst; ini->ism_dev[0] = NULL; rcu_read_lock(); dst = __sk_dst_get(sk); dev = dst ? dst_dev_rcu(dst) : NULL; dev_hold(dev); rcu_read_unlock(); if (dev) { smc_pnet_find_ism_by_pnetid(dev, ini); dev_put(dev); } } /* Lookup and apply a pnet table entry to the given ib device. */ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) { char *ib_name = smcibdev->ibdev->name; struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; int rc = -ENOENT; /* get pnettable for init namespace */ sn = net_generic(&init_net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) && tmp_pe->ib_port == ib_port) { smc_pnet_apply_ib(smcibdev, ib_port, tmp_pe->pnet_name); rc = 0; break; } } mutex_unlock(&pnettable->lock); return rc; } /* Lookup and apply a pnet table entry to the given smcd device. 
*/ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) { struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; int rc = -ENOENT; /* get pnettable for init namespace */ sn = net_generic(&init_net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && (!strncmp(tmp_pe->ib_name, dev_name(&smcddev->dibs->dev), sizeof(tmp_pe->ib_name)) || (smcddev->dibs->dev.parent && !strncmp(tmp_pe->ib_name, dev_name(smcddev->dibs->dev.parent), sizeof(tmp_pe->ib_name))))) { smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name); rc = 0; break; } } mutex_unlock(&pnettable->lock); return rc; }
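/*
 * Illustrative sketch only -- not part of smc_pnet.c.  It mirrors, in
 * plain user-space C, the pnetid rules enforced by smc_pnetid_valid()
 * above: at most 16 characters, alphanumeric only, surrounding blanks
 * ignored, lower case folded to upper case.  The function name and the
 * EXAMPLE_MAX_PNETID_LEN constant are made up for this example.
 */
#include <ctype.h>
#include <stdbool.h>
#include <string.h>

#define EXAMPLE_MAX_PNETID_LEN 16	/* mirrors SMC_MAX_PNETID_LEN */

static bool example_normalize_pnetid(const char *in,
				     char out[EXAMPLE_MAX_PNETID_LEN + 1])
{
	size_t len, i;

	while (isspace((unsigned char)*in))	/* skip leading blanks */
		in++;
	len = strlen(in);
	while (len && isspace((unsigned char)in[len - 1]))
		len--;				/* drop trailing blanks */
	if (!len || len > EXAMPLE_MAX_PNETID_LEN)
		return false;
	for (i = 0; i < len; i++) {
		if (!isalnum((unsigned char)in[i]))	/* interior blanks etc. rejected */
			return false;
		out[i] = toupper((unsigned char)in[i]);
	}
	out[len] = '\0';
	return true;
}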
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* include/asm-generic/tlb.h
 *
 * Generic TLB shootdown code
 *
 * Copyright 2001 Red Hat, Inc.
 * Based on code from mm/memory.c Copyright Linus Torvalds and others.
* * Copyright 2011 Red Hat, Inc., Peter Zijlstra */ #ifndef _ASM_GENERIC__TLB_H #define _ASM_GENERIC__TLB_H #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/hugetlb_inline.h> #include <asm/tlbflush.h> #include <asm/cacheflush.h> /* * Blindly accessing user memory from NMI context can be dangerous * if we're in the middle of switching the current user task or switching * the loaded mm. */ #ifndef nmi_uaccess_okay # define nmi_uaccess_okay() true #endif #ifdef CONFIG_MMU /* * Generic MMU-gather implementation. * * The mmu_gather data structure is used by the mm code to implement the * correct and efficient ordering of freeing pages and TLB invalidations. * * This correct ordering is: * * 1) unhook page * 2) TLB invalidate page * 3) free page * * That is, we must never free a page before we have ensured there are no live * translations left to it. Otherwise it might be possible to observe (or * worse, change) the page content after it has been reused. * * The mmu_gather API consists of: * * - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_finish_mmu() * * start and finish a mmu_gather * * Finish in particular will issue a (final) TLB invalidate and free * all (remaining) queued pages. * * - tlb_start_vma() / tlb_end_vma(); marks the start / end of a VMA * * Defaults to flushing at tlb_end_vma() to reset the range; helps when * there's large holes between the VMAs. * * - tlb_free_vmas() * * tlb_free_vmas() marks the start of unlinking of one or more vmas * and freeing page-tables. * * - tlb_remove_table() * * tlb_remove_table() is the basic primitive to free page-table directories * (__p*_free_tlb()). In it's most primitive form it is an alias for * tlb_remove_page() below, for when page directories are pages and have no * additional constraints. * * See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE. * * - tlb_remove_page() / tlb_remove_page_size() * - __tlb_remove_folio_pages() / __tlb_remove_page_size() * - __tlb_remove_folio_pages_size() * * __tlb_remove_folio_pages_size() is the basic primitive that queues pages * for freeing. It will return a boolean indicating if the queue is (now) * full and a call to tlb_flush_mmu() is required. * * tlb_remove_page() and tlb_remove_page_size() imply the call to * tlb_flush_mmu() when required and has no return value. * * __tlb_remove_folio_pages() is similar to __tlb_remove_page_size(), * however, instead of removing a single page, assume PAGE_SIZE and remove * the given number of consecutive pages that are all part of the * same (large) folio. * * - tlb_change_page_size() * * call before __tlb_remove_page*() to set the current page-size; implies a * possible tlb_flush_mmu() call. * * - tlb_flush_mmu() / tlb_flush_mmu_tlbonly() * * tlb_flush_mmu_tlbonly() - does the TLB invalidate (and resets * related state, like the range) * * tlb_flush_mmu() - in addition to the above TLB invalidate, also frees * whatever pages are still batched. * * - mmu_gather::fullmm * * A flag set by tlb_gather_mmu_fullmm() to indicate we're going to free * the entire mm; this allows a number of optimizations. * * - We can ignore tlb_{start,end}_vma(); because we don't * care about ranges. Everything will be shot down. * * - (RISC) architectures that use ASIDs can cycle to a new ASID * and delay the invalidation until ASID space runs out. * * - mmu_gather::need_flush_all * * A flag that can be set by the arch code if it wants to force * flush the entire TLB irrespective of the range. 
For instance * x86-PAE needs this when changing top-level entries. * * And allows the architecture to provide and implement tlb_flush(): * * tlb_flush() may, in addition to the above mentioned mmu_gather fields, make * use of: * * - mmu_gather::start / mmu_gather::end * * which provides the range that needs to be flushed to cover the pages to * be freed. * * - mmu_gather::freed_tables * * set when we freed page table pages * * - tlb_get_unmap_shift() / tlb_get_unmap_size() * * returns the smallest TLB entry size unmapped in this range. * * If an architecture does not provide tlb_flush() a default implementation * based on flush_tlb_range() will be used, unless MMU_GATHER_NO_RANGE is * specified, in which case we'll default to flush_tlb_mm(). * * Additionally there are a few opt-in features: * * MMU_GATHER_PAGE_SIZE * * This ensures we call tlb_flush() every time tlb_change_page_size() actually * changes the size and provides mmu_gather::page_size to tlb_flush(). * * This might be useful if your architecture has size specific TLB * invalidation instructions. * * MMU_GATHER_TABLE_FREE * * This provides tlb_remove_table(), to be used instead of tlb_remove_page() * for page directores (__p*_free_tlb()). * * Useful if your architecture has non-page page directories. * * When used, an architecture is expected to provide __tlb_remove_table() or * use the generic __tlb_remove_table(), which does the actual freeing of these * pages. * * MMU_GATHER_RCU_TABLE_FREE * * Like MMU_GATHER_TABLE_FREE, and adds semi-RCU semantics to the free (see * comment below). * * Useful if your architecture doesn't use IPIs for remote TLB invalidates * and therefore doesn't naturally serialize with software page-table walkers. * * MMU_GATHER_NO_FLUSH_CACHE * * Indicates the architecture has flush_cache_range() but it needs *NOT* be called * before unmapping a VMA. * * NOTE: strictly speaking we shouldn't have this knob and instead rely on * flush_cache_range() being a NOP, except Sparc64 seems to be * different here. * * MMU_GATHER_MERGE_VMAS * * Indicates the architecture wants to merge ranges over VMAs; typical when * multiple range invalidates are more expensive than a full invalidate. * * MMU_GATHER_NO_RANGE * * Use this if your architecture lacks an efficient flush_tlb_range(). This * option implies MMU_GATHER_MERGE_VMAS above. * * MMU_GATHER_NO_GATHER * * If the option is set the mmu_gather will not track individual pages for * delayed page free anymore. A platform that enables the option needs to * provide its own implementation of the __tlb_remove_page_size() function to * free pages. * * This is useful if your architecture already flushes TLB entries in the * various ptep_get_and_clear() functions. */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch { #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE struct rcu_head rcu; #endif unsigned int nr; void *tables[]; }; #define MAX_TABLE_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) #ifndef __HAVE_ARCH_TLB_REMOVE_TABLE static inline void __tlb_remove_table(void *table) { struct ptdesc *ptdesc = (struct ptdesc *)table; pagetable_dtor_free(ptdesc); } #endif extern void tlb_remove_table(struct mmu_gather *tlb, void *table); #else /* !CONFIG_MMU_GATHER_TABLE_FREE */ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page); /* * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based * page directories and we can use the normal page batching to free them. 
*/ static inline void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct ptdesc *ptdesc = (struct ptdesc *)table; pagetable_dtor(ptdesc); tlb_remove_page(tlb, ptdesc_page(ptdesc)); } #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* * This allows an architecture that does not use the linux page-tables for * hardware to skip the TLBI when freeing page tables. */ #ifndef tlb_needs_table_invalidate #define tlb_needs_table_invalidate() (true) #endif void tlb_remove_table_sync_one(void); #else #ifdef tlb_needs_table_invalidate #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE #endif static inline void tlb_remove_table_sync_one(void) { } #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ #ifndef CONFIG_MMU_GATHER_NO_GATHER /* * If we can't allocate a page to make a big batch of page pointers * to work on, then just handle a few from the on-stack structure. */ #define MMU_GATHER_BUNDLE 8 struct mmu_gather_batch { struct mmu_gather_batch *next; unsigned int nr; unsigned int max; struct encoded_page *encoded_pages[]; }; #define MAX_GATHER_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *)) /* * Limit the maximum number of mmu_gather batches to reduce a risk of soft * lockups for non-preemptible kernels on huge machines when a lot of memory * is zapped during unmapping. * 10K pages freed at once should be safe even without a preemption point. */ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size); bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap); #ifdef CONFIG_SMP /* * This both sets 'delayed_rmap', and returns true. It would be an inline * function, except we define it before the 'struct mmu_gather'. */ #define tlb_delay_rmap(tlb) (((tlb)->delayed_rmap = 1), true) extern void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma); #endif #endif /* * We have a no-op version of the rmap removal that doesn't * delay anything. That is used on S390, which flushes remote * TLBs synchronously, and on UP, which doesn't have any * remote TLBs to flush and is not preemptible due to this * all happening under the page table lock. */ #ifndef tlb_delay_rmap #define tlb_delay_rmap(tlb) (false) static inline void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { } #endif /* * struct mmu_gather is an opaque type used by the mm code for passing around * any data needed by arch specific code for tlb_remove_page. */ struct mmu_gather { struct mm_struct *mm; #ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch *batch; #endif unsigned long start; unsigned long end; /* * we are in the middle of an operation to clear * a full mm and can make some optimizations */ unsigned int fullmm : 1; /* * we have performed an operation which * requires a complete flush of the tlb */ unsigned int need_flush_all : 1; /* * we have removed page directories */ unsigned int freed_tables : 1; /* * Do we have pending delayed rmap removals? */ unsigned int delayed_rmap : 1; /* * at which levels have we cleared entries? 
*/ unsigned int cleared_ptes : 1; unsigned int cleared_pmds : 1; unsigned int cleared_puds : 1; unsigned int cleared_p4ds : 1; /* * tracks VM_EXEC | VM_HUGETLB in tlb_start_vma */ unsigned int vma_exec : 1; unsigned int vma_huge : 1; unsigned int vma_pfn : 1; unsigned int batch_count; #ifndef CONFIG_MMU_GATHER_NO_GATHER struct mmu_gather_batch *active; struct mmu_gather_batch local; struct page *__pages[MMU_GATHER_BUNDLE]; #ifdef CONFIG_MMU_GATHER_PAGE_SIZE unsigned int page_size; #endif #endif }; void tlb_flush_mmu(struct mmu_gather *tlb); static inline void __tlb_adjust_range(struct mmu_gather *tlb, unsigned long address, unsigned int range_size) { tlb->start = min(tlb->start, address); tlb->end = max(tlb->end, address + range_size); } static inline void __tlb_reset_range(struct mmu_gather *tlb) { if (tlb->fullmm) { tlb->start = tlb->end = ~0; } else { tlb->start = TASK_SIZE; tlb->end = 0; } tlb->freed_tables = 0; tlb->cleared_ptes = 0; tlb->cleared_pmds = 0; tlb->cleared_puds = 0; tlb->cleared_p4ds = 0; /* * Do not reset mmu_gather::vma_* fields here, we do not * call into tlb_start_vma() again to set them if there is an * intermediate flush. */ } #ifdef CONFIG_MMU_GATHER_NO_RANGE #if defined(tlb_flush) #error MMU_GATHER_NO_RANGE relies on default tlb_flush() #endif /* * When an architecture does not have efficient means of range flushing TLBs * there is no point in doing intermediate flushes on tlb_end_vma() to keep the * range small. We equally don't have to worry about page granularity or other * things. * * All we need to do is issue a full flush for any !0 range. */ static inline void tlb_flush(struct mmu_gather *tlb) { if (tlb->end) flush_tlb_mm(tlb->mm); } #else /* CONFIG_MMU_GATHER_NO_RANGE */ #ifndef tlb_flush /* * When an architecture does not provide its own tlb_flush() implementation * but does have a reasonably efficient flush_vma_range() implementation * use that. */ static inline void tlb_flush(struct mmu_gather *tlb) { if (tlb->fullmm || tlb->need_flush_all) { flush_tlb_mm(tlb->mm); } else if (tlb->end) { struct vm_area_struct vma = { .vm_mm = tlb->mm, .vm_flags = (tlb->vma_exec ? VM_EXEC : 0) | (tlb->vma_huge ? VM_HUGETLB : 0), }; flush_tlb_range(&vma, tlb->start, tlb->end); } } #endif #endif /* CONFIG_MMU_GATHER_NO_RANGE */ static inline void tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { /* * flush_tlb_range() implementations that look at VM_HUGETLB (tile, * mips-4k) flush only large pages. * * flush_tlb_range() implementations that flush I-TLB also flush D-TLB * (tile, xtensa, arm), so it's ok to just add VM_EXEC to an existing * range. * * We rely on tlb_end_vma() to issue a flush, such that when we reset * these values the batch is empty. */ tlb->vma_huge = is_vm_hugetlb_page(vma); tlb->vma_exec = !!(vma->vm_flags & VM_EXEC); /* * Track if there's at least one VM_PFNMAP/VM_MIXEDMAP vma * in the tracked range, see tlb_free_vmas(). */ tlb->vma_pfn |= !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)); } static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { /* * Anything calling __tlb_adjust_range() also sets at least one of * these bits. 
*/ if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds || tlb->cleared_puds || tlb->cleared_p4ds)) return; tlb_flush(tlb); __tlb_reset_range(tlb); } static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { if (__tlb_remove_page_size(tlb, page, false, page_size)) tlb_flush_mmu(tlb); } static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { return tlb_remove_page_size(tlb, page, PAGE_SIZE); } static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt) { tlb_remove_table(tlb, pt); } static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { #ifdef CONFIG_MMU_GATHER_PAGE_SIZE if (tlb->page_size && tlb->page_size != page_size) { if (!tlb->fullmm && !tlb->need_flush_all) tlb_flush_mmu(tlb); } tlb->page_size = page_size; #endif } static inline unsigned long tlb_get_unmap_shift(struct mmu_gather *tlb) { if (tlb->cleared_ptes) return PAGE_SHIFT; if (tlb->cleared_pmds) return PMD_SHIFT; if (tlb->cleared_puds) return PUD_SHIFT; if (tlb->cleared_p4ds) return P4D_SHIFT; return PAGE_SHIFT; } static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb) { return 1UL << tlb_get_unmap_shift(tlb); } /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. When we're doing a munmap, * the vmas are adjusted to only cover the region to be torn down. */ static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm) return; tlb_update_vma_flags(tlb, vma); #ifndef CONFIG_MMU_GATHER_NO_FLUSH_CACHE flush_cache_range(vma, vma->vm_start, vma->vm_end); #endif } static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) return; /* * Do a TLB flush and reset the range at VMA boundaries; this avoids * the ranges growing with the unused space between consecutive VMAs, * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on * this. */ tlb_flush_mmu_tlbonly(tlb); } static inline void tlb_free_vmas(struct mmu_gather *tlb) { if (tlb->fullmm) return; /* * VM_PFNMAP is more fragile because the core mm will not track the * page mapcount -- there might not be page-frames for these PFNs * after all. * * Specifically() there is a race between munmap() and * unmap_mapping_range(), where munmap() will unlink the VMA, such * that unmap_mapping_range() will no longer observe the VMA and * no-op, without observing the TLBI, returning prematurely. * * So if we're about to unlink such a VMA, and we have pending * TLBI for such a vma, flush things now. */ if (tlb->vma_pfn) tlb_flush_mmu_tlbonly(tlb); } /* * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end, * and set corresponding cleared_*. 
*/ static inline void tlb_flush_pte_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_ptes = 1; } static inline void tlb_flush_pmd_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_pmds = 1; } static inline void tlb_flush_pud_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_puds = 1; } static inline void tlb_flush_p4d_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_p4ds = 1; } #ifndef __tlb_remove_tlb_entry static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address) { } #endif /** * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation. * * Record the fact that pte's were really unmapped by updating the range, * so we can later optimise away the tlb invalidate. This helps when * userspace is unmapping already-unmapped pages, which happens quite a lot. */ #define tlb_remove_tlb_entry(tlb, ptep, address) \ do { \ tlb_flush_pte_range(tlb, address, PAGE_SIZE); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) /** * tlb_remove_tlb_entries - remember unmapping of multiple consecutive ptes for * later tlb invalidation. * * Similar to tlb_remove_tlb_entry(), but remember unmapping of multiple * consecutive ptes instead of only a single one. */ static inline void tlb_remove_tlb_entries(struct mmu_gather *tlb, pte_t *ptep, unsigned int nr, unsigned long address) { tlb_flush_pte_range(tlb, address, PAGE_SIZE * nr); for (;;) { __tlb_remove_tlb_entry(tlb, ptep, address); if (--nr == 0) break; ptep++; address += PAGE_SIZE; } } #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ do { \ unsigned long _sz = huge_page_size(h); \ if (_sz >= P4D_SIZE) \ tlb_flush_p4d_range(tlb, address, _sz); \ else if (_sz >= PUD_SIZE) \ tlb_flush_pud_range(tlb, address, _sz); \ else if (_sz >= PMD_SIZE) \ tlb_flush_pmd_range(tlb, address, _sz); \ else \ tlb_flush_pte_range(tlb, address, _sz); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) /** * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation * This is a nop so far, because only x86 needs it. */ #ifndef __tlb_remove_pmd_tlb_entry #define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0) #endif #define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ do { \ tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE); \ __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ } while (0) /** * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb * invalidation. This is a nop so far, because only x86 needs it. */ #ifndef __tlb_remove_pud_tlb_entry #define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0) #endif #define tlb_remove_pud_tlb_entry(tlb, pudp, address) \ do { \ tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE); \ __tlb_remove_pud_tlb_entry(tlb, pudp, address); \ } while (0) /* * For things like page tables caches (ie caching addresses "inside" the * page tables, like x86 does), for legacy reasons, flushing an * individual page had better flush the page table caches behind it. This * is definitely how x86 works, for example. 
And if you have an * architected non-legacy page table cache (which I'm not aware of * anybody actually doing), you're going to have some architecturally * explicit flushing for that, likely *separate* from a regular TLB entry * flush, and thus you'd need more than just some range expansion.. * * So if we ever find an architecture * that would want something that odd, I think it is up to that * architecture to do its own odd thing, not cause pain for others * http://lkml.kernel.org/r/CA+55aFzBggoXtNXQeng5d_mRoDnaMBE5Y+URs+PHR67nUpMtaw@mail.gmail.com * * For now w.r.t page table cache, mark the range_size as PAGE_SIZE */ #ifndef pte_free_tlb #define pte_free_tlb(tlb, ptep, address) \ do { \ tlb_flush_pmd_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pte_free_tlb(tlb, ptep, address); \ } while (0) #endif #ifndef pmd_free_tlb #define pmd_free_tlb(tlb, pmdp, address) \ do { \ tlb_flush_pud_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pmd_free_tlb(tlb, pmdp, address); \ } while (0) #endif #ifndef pud_free_tlb #define pud_free_tlb(tlb, pudp, address) \ do { \ tlb_flush_p4d_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif #ifndef p4d_free_tlb #define p4d_free_tlb(tlb, pudp, address) \ do { \ __tlb_adjust_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __p4d_free_tlb(tlb, pudp, address); \ } while (0) #endif #ifndef pte_needs_flush static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte) { return true; } #endif #ifndef huge_pmd_needs_flush static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) { return true; } #endif #endif /* CONFIG_MMU */ #endif /* _ASM_GENERIC__TLB_H */
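/*
 * Illustrative sketch only -- not part of this header.  It shows the
 * canonical unmap-side ordering documented above (1: unhook page,
 * 2: TLB invalidate, 3: free page) for a single VMA range.  The
 * example_zap_range() wrapper and the example_unhook_pte() helper are
 * hypothetical placeholders; the real users of this API are the zap
 * paths in mm/memory.c.
 */

/* hypothetical helper: clears the pte at @addr (calling
 * tlb_remove_tlb_entry() on it) and returns the old page, if any */
struct page *example_unhook_pte(struct mmu_gather *tlb,
				struct vm_area_struct *vma,
				unsigned long addr);

static void example_zap_range(struct mm_struct *mm, struct vm_area_struct *vma,
			      unsigned long addr, unsigned long end)
{
	struct mmu_gather tlb;
	struct page *page;

	tlb_gather_mmu(&tlb, mm);		/* start the gather */
	tlb_start_vma(&tlb, vma);		/* flush caches, record VMA flags */

	for (; addr < end; addr += PAGE_SIZE) {
		/* 1) unhook: clear the pte and record it for invalidation */
		page = example_unhook_pte(&tlb, vma, addr);
		if (!page)
			continue;
		/* 3) queue the page; a full queue triggers 2) + 3) via tlb_flush_mmu() */
		tlb_remove_page(&tlb, page);
	}

	tlb_end_vma(&tlb, vma);			/* flush and reset range at the VMA boundary */
	tlb_finish_mmu(&tlb);			/* final 2) TLB invalidate, then 3) free the rest */
}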
// SPDX-License-Identifier: GPL-2.0
/*
 * Tag allocation using scalable bitmaps. Uses active queue tracking to support
 * fairer distribution of tags between multiple submitters when a shared tag map
 * is used.
 *
 * Copyright (C) 2013-2014 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/kmemleak.h>
#include <linux/delay.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"

/*
 * Recalculate wakeup batch when tag is shared by hctx.
 */
static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
				     unsigned int users)
{
	if (!users)
		return;

	sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags, users);
	sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags, users);
}

/*
 * If a previously inactive queue goes active, bump the active user count.
 * We need to do this before trying to allocate a driver tag, so that even
 * if we fail to get a tag the first time, the other shared-tag users can
 * still reserve budget for it.
*/ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { unsigned int users; unsigned long flags; struct blk_mq_tags *tags = hctx->tags; /* * calling test_bit() prior to test_and_set_bit() is intentional, * it avoids dirtying the cacheline if the queue is already active. */ if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) || test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return; } else { if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) || test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return; } spin_lock_irqsave(&tags->lock, flags); users = tags->active_queues + 1; WRITE_ONCE(tags->active_queues, users); blk_mq_update_wake_batch(tags, users); spin_unlock_irqrestore(&tags->lock, flags); } /* * Wakeup all potentially sleeping on tags */ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) { sbitmap_queue_wake_all(&tags->bitmap_tags); if (include_reserve) sbitmap_queue_wake_all(&tags->breserved_tags); } /* * If a previously busy queue goes inactive, potential waiters could now * be allowed to queue. Wake them up and check. */ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->tags; unsigned int users; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return; } else { if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return; } spin_lock_irq(&tags->lock); users = tags->active_queues - 1; WRITE_ONCE(tags->active_queues, users);